#!/bin/bash
# Start-up timestamp in YYYY-MM-DD_HHMM form. Within the visible script it is
# only mentioned by the commented-out line below.
TODAY=$(date +'%Y-%m-%d_%H%M')
#echo "wrapper.$TODAY.log"
################################################################################
# Script name:   system_health_check.sh
# Description:   Comprehensive system health check
# Version:       v2.0
# Author:        DevOps Team
# Last modified: 2025-01-15
################################################################################
# ------------------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------------------

# Host name and run timestamps. The report file name embeds its own
# second-resolution timestamp.
HOSTNAME="$(hostname)"
DATE="$(date '+%Y-%m-%dT%H:%M:%S')"
REPORT_FILE="/var/log/system_check_$(date '+%Y%m%d-%H%M%S').log"

# Alert thresholds.
CPU_WARNING=80      # CPU usage alert threshold (%)
MEM_WARNING=85      # Memory usage alert threshold (%)
DISK_WARNING=85     # Disk usage alert threshold (%)
LOAD_WARNING=4      # System load alert threshold (adjust to the CPU core count)
INODE_WARNING=80    # Inode usage alert threshold (%)

# ANSI colour sequences, stored with a literal backslash; the code that prints
# them is responsible for interpreting the escape.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'        # No colour (reset)
# Logging helpers
#
# log MESSAGE
#   Print MESSAGE prefixed with a "[YYYY-MM-DD HH:MM:SS]" timestamp to stdout
#   and append the same line to $REPORT_FILE.
log() {
  local stamp
  stamp=$(date +"%Y-%m-%d %H:%M:%S")
  printf '%s\n' "[${stamp}] $1" | tee -a "$REPORT_FILE"
}
# log_section TITLE
#   Emit a blank line followed by TITLE framed between two "=" rules, both to
#   stdout and appended to $REPORT_FILE.
log_section() {
  local rule="============================================================"
  {
    echo ""
    echo "$rule"
    echo " $1"
    echo "$rule"
  } | tee -a "$REPORT_FILE"
}
# log_warning MESSAGE
#   Print "[WARNING] MESSAGE" wrapped in the $YELLOW / $NC colour sequences to
#   stdout and append the same line to $REPORT_FILE.
#   Only the colour variables go through %b (escape interpretation); MESSAGE
#   is printed with %s so backslashes inside it are kept literally.
log_warning() {
  printf '%b[WARNING] %s%b\n' "$YELLOW" "$1" "$NC" | tee -a "$REPORT_FILE"
}
# log_error MESSAGE
#   Print "[ERROR] MESSAGE" wrapped in the $RED / $NC colour sequences to
#   stdout and append the same line to $REPORT_FILE.
#   Only the colour variables go through %b (escape interpretation); MESSAGE
#   is printed with %s so backslashes inside it are kept literally.
log_error() {
  printf '%b[ERROR] %s%b\n' "$RED" "$1" "$NC" | tee -a "$REPORT_FILE"
}
# log_ok MESSAGE
#   Print "[OK] MESSAGE" wrapped in the $GREEN / $NC colour sequences to
#   stdout and append the same line to $REPORT_FILE.
#   Only the colour variables go through %b (escape interpretation); MESSAGE
#   is printed with %s so backslashes inside it are kept literally.
log_ok() {
  printf '%b[OK] %s%b\n' "$GREEN" "$1" "$NC" | tee -a "$REPORT_FILE"
}
# 1. Basic system information
#
# Logs host name, check time, OS release, kernel version, architecture,
# uptime, current user and the number of logged-in users.
# Globals read: HOSTNAME, DATE. Uses: log_section, log.
check_basic_info() {
  local os_release uptime_span login_count

  log_section "1. 系统基本信息"
  log "主机名: $HOSTNAME"
  log "检查时间: $DATE"

  # Prefer /etc/redhat-release; otherwise take the first line of /etc/issue.
  # Each source is tested on its own so that a missing file really falls
  # through to the "unknown" label instead of yielding an empty string.
  if ! os_release=$(cat /etc/redhat-release 2>/dev/null); then
    os_release=$(cat /etc/issue 2>/dev/null)
    os_release=${os_release%%$'\n'*}
  fi
  [[ -n "$os_release" ]] || os_release="未知"
  log "系统版本: $os_release"

  log "内核版本: $(uname -r)"
  log "系统架构: $(uname -m)"

  # Text between "up " and the first comma of the uptime line.
  uptime_span=$(uptime | awk -F'up ' '{ split($2, part, ","); print part[1] }')
  log "运行时长: $uptime_span"

  log "当前用户: $(whoami)"

  # wc prints 0 on empty input, so a failing `who` reports zero users; its
  # error output is suppressed at the source.
  login_count=$(who 2>/dev/null | wc -l)
  log "登录用户数: $login_count"
}
# 2. CPU usage check
#
# Private helper: sample the aggregate "cpu" line of /proc/stat twice, one
# second apart, and print the busy share (user+nice+system measured against
# user+nice+system+idle) as a percentage with two decimals.
# Prints nothing and returns non-zero when /proc/stat cannot be read or the
# counters did not advance between the two samples.
_cpu_usage_from_proc_stat() {
  local sampler='/^cpu / { busy = $2 + $3 + $4; print busy, busy + $5; exit }'
  local first second

  first=$(awk "$sampler" /proc/stat 2>/dev/null)
  [[ -n "$first" ]] || return 1
  sleep 1
  second=$(awk "$sampler" /proc/stat 2>/dev/null)
  [[ -n "$second" ]] || return 1

  awk -v a="$first" -v b="$second" 'BEGIN {
    split(a, s1, " "); split(b, s2, " ")
    span = s2[2] - s1[2]
    if (span <= 0) exit 1
    printf "%.2f", 100 * (s2[1] - s1[1]) / span
  }'
}

# Logs the core count, CPU usage and load averages, and raises warnings when
# usage exceeds $CPU_WARNING or the 1-minute load exceeds twice the core count.
# Globals read:    CPU_WARNING, REPORT_FILE
# Globals written: CPU_CORES, CPU_IDLE, CPU_USAGE, LOAD_1, LOAD_5, LOAD_15,
#                  LOAD_THRESHOLD
# Uses: log_section, log, log_ok, log_warning.
# All numeric comparisons are done in awk, so `bc` is not required.
check_cpu() {
  log_section "2. CPU使用率检查"

  # grep -c prints "0" AND exits non-zero when nothing matches, so validate
  # the captured value instead of chaining "|| echo 1".
  CPU_CORES=$(grep -c '^processor' /proc/cpuinfo 2>/dev/null)
  [[ "$CPU_CORES" =~ ^[1-9][0-9]*$ ]] || CPU_CORES=1
  log "CPU核心数: $CPU_CORES"

  # Preferred source: the second "Cpu(s)" summary line of a two-iteration top
  # run. The idle figure is located by its "id" label rather than by field
  # position, because top glues "ni,100.0 id" together at 100% idle and older
  # versions print "99.5%id".
  CPU_USAGE=""
  if command -v top &> /dev/null; then
    CPU_IDLE=$(LC_ALL=C top -bn2 -d 1 2>/dev/null | awk '
      /Cpu\(s\)/ { line = $0 }
      END {
        if (match(line, /[0-9.]+[ %]*id/)) {
          idle = substr(line, RSTART, RLENGTH)
          sub(/[ %]*id$/, "", idle)
          print idle
        }
      }')
    if [[ "$CPU_IDLE" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
      CPU_USAGE=$(awk -v idle="$CPU_IDLE" 'BEGIN { printf "%.2f", 100 - idle }')
    fi
  fi

  # Fallback when top is missing or its output could not be parsed.
  if [[ -z "$CPU_USAGE" ]]; then
    CPU_USAGE=$(_cpu_usage_from_proc_stat)
  fi
  [[ -n "$CPU_USAGE" ]] || CPU_USAGE=0
  log "CPU使用率: ${CPU_USAGE}%"

  if awk -v usage="$CPU_USAGE" -v limit="$CPU_WARNING" \
      'BEGIN { exit !(usage + 0 > limit + 0) }'; then
    log_warning "CPU使用率超过${CPU_WARNING}%,当前${CPU_USAGE}%"
    # Five processes with the highest %CPU (header row dropped before sorting).
    log "TOP 5 CPU消耗进程:"
    ps aux 2>/dev/null \
      | awk 'NR > 1' \
      | sort -rn -k3,3 2>/dev/null \
      | head -n 5 \
      | awk '{printf " PID: %-8s User: %-10s CPU: %-6s CMD: %s\n", $2,$1,$3,$11}' \
      | tee -a "$REPORT_FILE"
  else
    log_ok "CPU使用率正常: ${CPU_USAGE}%"
  fi

  # Load averages: one uptime call, parsed in the C locale so the decimal
  # separator is a dot and the comma only separates the three values.
  if command -v uptime &> /dev/null; then
    read -r LOAD_1 LOAD_5 LOAD_15 _ < <(
      LC_ALL=C uptime 2>/dev/null \
        | awk -F'load averages?:[ ]*' '{ gsub(/,/, " ", $2); print $2 }'
    )
    LOAD_1=${LOAD_1:-0}
    LOAD_5=${LOAD_5:-0}
    LOAD_15=${LOAD_15:-0}
    log "系统负载: 1分钟=${LOAD_1}, 5分钟=${LOAD_5}, 15分钟=${LOAD_15}"

    # Warn when the 1-minute load exceeds twice the number of cores.
    LOAD_THRESHOLD=$(( CPU_CORES * 2 ))
    if awk -v load="$LOAD_1" -v limit="$LOAD_THRESHOLD" \
        'BEGIN { exit !(load + 0 > limit + 0) }'; then
      log_warning "系统负载过高! 1分钟负载${LOAD_1}超过阈值${LOAD_THRESHOLD}"
    fi
  else
    log_warning "无法获取系统负载信息"
  fi
}
# 3. Memory usage check
#
# Logs total/used/available memory, the usage percentage and swap figures;
# warns when usage exceeds $MEM_WARNING or more than 100 MB of swap is in use.
# Globals read:    MEM_WARNING, REPORT_FILE
# Globals written: MEM_TOTAL, MEM_USED, MEM_FREE, MEM_AVAILABLE, MEM_USAGE,
#                  SWAP_TOTAL, SWAP_USED
# Uses: log_section, log, log_ok, log_warning, log_error.
check_memory() {
  log_section "3. 内存使用检查"

  if ! command -v free &> /dev/null; then
    log_error "无法获取内存信息,free命令不存在"
    return
  fi

  # Take ONE snapshot so every figure describes the same instant, and force
  # the C locale so the "Mem:" / "Swap:" row labels are not translated.
  local snapshot
  snapshot=$(LC_ALL=C free -m 2>/dev/null)

  # Rows are selected by label, not by line number: older free versions insert
  # a "-/+ buffers/cache" row between Mem and Swap. The "available" column is
  # located through the header (header words are shifted by one because the
  # header has no label column); when absent, the "free" column is used.
  read -r MEM_TOTAL MEM_USED MEM_FREE MEM_AVAILABLE < <(
    awk '
      NR == 1 { for (i = 1; i <= NF; i++) if ($i == "available") col = i + 1 }
      /^Mem:/ { print $2, $3, $4, (col ? $col : $4); exit }
    ' <<<"$snapshot"
  )
  read -r SWAP_TOTAL SWAP_USED < <(
    awk '/^Swap:/ { print $2, $3; exit }' <<<"$snapshot"
  )

  # Multiply before dividing so no precision is lost (85.9% must not be
  # reported as 85.00%).
  if [[ "$MEM_TOTAL" =~ ^[0-9]+$ ]] && [ "$MEM_TOTAL" -gt 0 ]; then
    MEM_USAGE=$(awk -v used="$MEM_USED" -v total="$MEM_TOTAL" \
      'BEGIN { printf "%.2f", used * 100 / total }')
  else
    MEM_USAGE=0
  fi

  log "内存总量: ${MEM_TOTAL}MB"
  log "已用内存: ${MEM_USED}MB"
  log "可用内存: ${MEM_AVAILABLE}MB"
  log "内存使用率: ${MEM_USAGE}%"

  if awk -v usage="$MEM_USAGE" -v limit="$MEM_WARNING" \
      'BEGIN { exit !(usage + 0 > limit + 0) }'; then
    log_warning "内存使用率超过${MEM_WARNING}%,当前${MEM_USAGE}%"
    # Five processes with the highest %MEM (header row dropped before sorting).
    log "TOP 5 内存消耗进程:"
    ps aux 2>/dev/null \
      | awk 'NR > 1' \
      | sort -rn -k4,4 2>/dev/null \
      | head -n 5 \
      | awk '{printf " PID: %-8s User: %-10s MEM: %-6s CMD: %s\n", $2,$1,$4,$11}' \
      | tee -a "$REPORT_FILE"
  else
    log_ok "内存使用率正常: ${MEM_USAGE}%"
  fi

  # Swap check.
  log "Swap总量: ${SWAP_TOTAL}MB"
  log "Swap使用: ${SWAP_USED}MB"
  if [ "$SWAP_TOTAL" -gt 0 ] 2>/dev/null && [ "$SWAP_USED" -gt 100 ] 2>/dev/null; then
    log_warning "Swap使用量较高: ${SWAP_USED}MB, 可能存在内存压力"
  fi
}
# 4. Disk usage check
#
# Lists space and inode usage per filesystem (rows mentioning tmpfs, cdrom or
# loop are skipped) and warns when usage exceeds $DISK_WARNING or
# $INODE_WARNING. For a partition over the space threshold, the five largest
# top-level entries of its mount point are listed as well.
# Globals read:    DISK_WARNING, INODE_WARNING, REPORT_FILE
# Globals written: HAS_DISK_WARNING
# Uses: log_section, log, log_ok, log_warning, log_error.
check_disk() {
  log_section "4. 磁盘使用检查"

  if ! command -v df &> /dev/null; then
    log_error "无法获取磁盘信息,df命令不存在"
    return
  fi

  # Drop the header by position (its wording depends on the locale) and keep
  # the original tmpfs/cdrom/loop exclusion.
  local keep_rows='NR > 1 && !/tmpfs|cdrom|loop/'
  local space_rows inode_rows row usage mount

  # -P keeps every filesystem on a single line even for long device names.
  # The output is captured once so the listing and the threshold check work
  # on the same data.
  space_rows=$(LC_ALL=C df -hP 2>/dev/null | awk "$keep_rows")
  log "磁盘分区使用情况:"
  if [[ -n "$space_rows" ]]; then
    awk '{print " " $0}' <<<"$space_rows" | tee -a "$REPORT_FILE"
  fi

  # Threshold check.
  HAS_DISK_WARNING=0
  while IFS= read -r row; do
    # Fields: device size used avail use% mount. The last variable receives
    # the remainder of the line, so mount points containing spaces survive.
    read -r _ _ _ _ usage mount <<<"$row"
    usage=${usage%\%}
    [[ "$usage" =~ ^[0-9]+$ ]] || continue
    (( 10#$usage > DISK_WARNING )) || continue

    log_warning "磁盘分区 $mount 使用率${usage}%超过阈值${DISK_WARNING}%"
    HAS_DISK_WARNING=1
    if [[ -d "$mount" ]]; then
      log " $mount 分区占用空间最大的5个目录:"
      # ${mount%/} avoids a doubled slash when the mount point is "/".
      du -sh -- "${mount%/}"/* 2>/dev/null \
        | sort -rh 2>/dev/null \
        | head -n 5 \
        | awk '{print " " $0}' \
        | tee -a "$REPORT_FILE"
    fi
  done <<<"$space_rows"

  if [ "$HAS_DISK_WARNING" -eq 0 ]; then
    log_ok "所有磁盘分区使用率正常"
  fi

  # Inode usage check.
  log ""
  log "Inode使用情况:"
  inode_rows=$(LC_ALL=C df -iP 2>/dev/null | awk "$keep_rows")
  if [[ -n "$inode_rows" ]]; then
    awk '{print " " $0}' <<<"$inode_rows" | tee -a "$REPORT_FILE"
  fi

  while IFS= read -r row; do
    read -r _ _ _ _ usage mount <<<"$row"
    usage=${usage%\%}
    # Filesystems without inode accounting report "-" here and are skipped.
    [[ "$usage" =~ ^[0-9]+$ ]] || continue
    if (( 10#$usage > INODE_WARNING )); then
      log_warning "分区 $mount 的Inode使用率${usage}%超过阈值${INODE_WARNING}%"
    fi
  done <<<"$inode_rows"
}
# 5. Network status check
#
# Logs the interface list (ip), TCP connection counts per state, the number
# of TIME_WAIT connections (warning above 5000) and the listening sockets.
# ss is preferred; netstat is used when ss is not available.
# Globals read:    REPORT_FILE
# Globals written: TIME_WAIT_COUNT
# Uses: log_section, log, log_warning.
check_network() {
  local indent_prog='{print " " $0}'
  local sock_tool=""

  log_section "5. 网络状态检查"

  # Interface status.
  if ! command -v ip &> /dev/null; then
    log_warning "ip命令不存在,无法获取网络接口信息"
  else
    log "网络接口状态:"
    { ip -br addr | awk "$indent_prog"; } 2>/dev/null | tee -a "$REPORT_FILE"
  fi

  # Decide once which socket tool is available.
  if command -v ss &> /dev/null; then
    sock_tool=ss
  elif command -v netstat &> /dev/null; then
    sock_tool=netstat
  fi

  # Connection statistics per TCP state, plus the TIME_WAIT count.
  log ""
  case "$sock_tool" in
    ss)
      log "TCP连接状态统计(使用ss):"
      {
        ss -ant | awk 'NR>1 {print $1}' | sort | uniq -c | sort -rn \
          | awk "$indent_prog"
      } 2>/dev/null | tee -a "$REPORT_FILE"
      TIME_WAIT_COUNT=$({ ss -ant | grep -c TIME-WAIT; } 2>/dev/null)
      ;;
    netstat)
      log "TCP连接状态统计(使用netstat):"
      {
        netstat -an | awk '/^tcp/ {print $6}' | sort | uniq -c | sort -rn \
          | awk "$indent_prog"
      } 2>/dev/null | tee -a "$REPORT_FILE"
      TIME_WAIT_COUNT=$({ netstat -an | grep -c TIME_WAIT; } 2>/dev/null)
      ;;
    *)
      log_warning "ss和netstat命令均不存在,无法获取网络连接信息"
      TIME_WAIT_COUNT=0
      ;;
  esac

  log "TIME_WAIT连接数: $TIME_WAIT_COUNT"
  if [ "$TIME_WAIT_COUNT" -gt 5000 ] 2>/dev/null; then
    log_warning "TIME_WAIT连接数过多: $TIME_WAIT_COUNT"
  fi

  # Listening sockets (only rows containing LISTEN are shown).
  log ""
  log "当前监听端口:"
  case "$sock_tool" in
    ss)
      { ss -tuln | grep LISTEN | awk "$indent_prog"; } 2>/dev/null \
        | tee -a "$REPORT_FILE"
      ;;
    netstat)
      { netstat -tuln | grep LISTEN | awk "$indent_prog"; } 2>/dev/null \
        | tee -a "$REPORT_FILE"
      ;;
  esac
}
# 6. Process and service check
#
# Logs the total process count, reports zombie processes (with PID, parent
# PID and command name) and checks that each critical service is running.
# Globals read:    REPORT_FILE
# Globals written: PROCESS_COUNT, ZOMBIE_COUNT, CRITICAL_SERVICES
# Uses: log_section, log, log_ok, log_warning, log_error.
check_processes() {
  local ps_rows="" service

  log_section "6. 进程和服务检查"

  if command -v ps &> /dev/null; then
    # One header-less snapshot with exactly the columns needed:
    # pid, parent pid, state, command name.
    ps_rows=$(ps -eo pid=,ppid=,stat=,comm= 2>/dev/null)

    PROCESS_COUNT=$(awk 'NF { n++ } END { print n + 0 }' <<<"$ps_rows")
    log "当前进程总数: $PROCESS_COUNT"

    # A zombie is a process whose STATE starts with "Z"; matching the state
    # column avoids hits on user or command names that merely contain a "Z".
    ZOMBIE_COUNT=$(awk '$3 ~ /^Z/ { n++ } END { print n + 0 }' <<<"$ps_rows")
    log "僵尸进程数: $ZOMBIE_COUNT"
    if (( ZOMBIE_COUNT > 0 )); then
      log_warning "发现僵尸进程!"
      awk '$3 ~ /^Z/ { print " PID: " $1 " PPID: " $2 " CMD: " $4 }' <<<"$ps_rows" \
        | tee -a "$REPORT_FILE"
    fi
  else
    log_error "无法获取进程信息,ps命令不存在"
  fi

  # Critical services (adjust the list to the actual workload).
  log ""
  log "关键服务状态检查:"
  CRITICAL_SERVICES=("sshd" "crond" "rsyslog")
  for service in "${CRITICAL_SERVICES[@]}"; do
    # systemd first; otherwise (non-systemd hosts) look for the service name
    # inside a process COMMAND NAME. Arguments are deliberately not searched,
    # so e.g. an editor opened on sshd_config does not count as sshd.
    if systemctl is-active --quiet "$service" 2>/dev/null \
        || awk -v name="$service" \
             'index($4, name) { found = 1; exit } END { exit !found }' \
             <<<"$ps_rows"; then
      log_ok " $service: 运行中"
    else
      log_error " $service: 未运行"
    fi
  done
}
# 7. System log check
#
# Scans log files for lines matching error|fail|critical (case-insensitive),
# reports the real number of matches per file together with the last ten
# matching lines, then looks for "out of memory" messages in the kernel log.
# Arguments: optional list of log files to scan; defaults to
#            /var/log/messages and /var/log/syslog when none are given.
# Globals read:    REPORT_FILE
# Globals written: ERROR_COUNT, CURRENT_ERRORS, OOM_COUNT
# Uses: log_section, log, log_ok, log_warning.
check_logs() {
  local -a log_files=("$@")
  local pattern='error|fail|critical'
  local log_file kernel_log
  local skipped=0

  (( ${#log_files[@]} > 0 )) || log_files=(/var/log/messages /var/log/syslog)

  log_section "7. 系统日志检查"

  log "最近系统错误日志检查:"
  ERROR_COUNT=0
  for log_file in "${log_files[@]}"; do
    [[ -f "$log_file" ]] || continue

    # An unreadable file (typically when not running as root) must not be
    # reported as "no errors".
    if [[ ! -r "$log_file" ]]; then
      log_warning "$log_file 不可读,已跳过"
      skipped=$(( skipped + 1 ))
      continue
    fi

    # grep -c gives the true number of matching lines; only the displayed
    # excerpt is limited.
    CURRENT_ERRORS=$(grep -ciE -- "$pattern" "$log_file" 2>/dev/null)
    [[ "$CURRENT_ERRORS" =~ ^[0-9]+$ ]] || CURRENT_ERRORS=0
    ERROR_COUNT=$(( ERROR_COUNT + CURRENT_ERRORS ))

    if (( CURRENT_ERRORS > 0 )); then
      log_warning "$log_file 发现 $CURRENT_ERRORS 条错误日志"
      grep -iE -- "$pattern" "$log_file" 2>/dev/null \
        | tail -n 10 \
        | awk '{print " " $0}' \
        | tee -a "$REPORT_FILE"
    fi
  done
  if (( ERROR_COUNT == 0 && skipped == 0 )); then
    log_ok "无严重错误日志"
  fi

  # OOM (out of memory) check.
  log ""
  log "OOM(内存溢出)检查:"
  if command -v dmesg &> /dev/null; then
    # dmesg can fail (for example when reading the kernel log is restricted);
    # report that instead of claiming there were no OOM events.
    if kernel_log=$(dmesg 2>/dev/null); then
      OOM_COUNT=$(grep -ci "out of memory" <<<"$kernel_log")
      [[ "$OOM_COUNT" =~ ^[0-9]+$ ]] || OOM_COUNT=0
      if (( OOM_COUNT > 0 )); then
        log_warning "发现 $OOM_COUNT 次OOM事件"
        grep -i "out of memory" <<<"$kernel_log" \
          | tail -n 5 \
          | awk '{print " " $0}' \
          | tee -a "$REPORT_FILE"
      else
        log_ok "无OOM事件"
      fi
    else
      log_warning "无法读取内核日志(dmesg执行失败,可能需要root权限)"
    fi
  else
    log_warning "无法检查OOM事件,dmesg命令不存在"
  fi
}
# 8. Inspection report summary
#
# Counts the "[WARNING]" and "[ERROR]" lines already written to $REPORT_FILE
# and logs an overall verdict plus the report location.
# Globals read:    REPORT_FILE
# Globals written: WARNING_COUNT, ERROR_COUNT
# Uses: log_section, log, log_ok, log_warning, log_error.
generate_summary() {
  log_section "8. 巡检报告摘要"

  # grep -c prints "0" AND exits non-zero when nothing matches, so appending
  # "|| echo 0" would produce two lines ("0" twice). Capture the output and
  # validate it instead; a missing or unreadable report counts as zero.
  WARNING_COUNT=$(grep -cF -- "[WARNING]" "$REPORT_FILE" 2>/dev/null)
  ERROR_COUNT=$(grep -cF -- "[ERROR]" "$REPORT_FILE" 2>/dev/null)
  [[ "$WARNING_COUNT" =~ ^[0-9]+$ ]] || WARNING_COUNT=0
  [[ "$ERROR_COUNT" =~ ^[0-9]+$ ]] || ERROR_COUNT=0

  log "巡检完成时间: $(date +"%Y-%m-%d %H:%M:%S")"
  log "告警数量: $WARNING_COUNT"
  log "错误数量: $ERROR_COUNT"

  if (( ERROR_COUNT > 0 )); then
    log_error "发现 $ERROR_COUNT 个严重问题,请立即处理!"
  elif (( WARNING_COUNT > 0 )); then
    log_warning "发现 $WARNING_COUNT 个告警,建议关注"
  else
    log_ok "系统状态良好,无异常"
  fi

  log ""
  log "完整报告已保存至: $REPORT_FILE"
}
# Main routine
#
# Prints the banner, warns when not running as root, makes sure the report
# file can be written, then runs every check followed by the summary.
# Arguments: accepted but not used.
# Globals read:    REPORT_FILE
# Globals written: REPORT_FILE (only when the configured path is not writable)
main() {
  local fallback

  echo "=========================================="
  echo " 服务器健康状态巡检脚本 v2.0"
  echo "=========================================="
  echo ""

  # Root check.
  if [ "$(id -u)" -ne 0 ] 2>/dev/null; then
    echo "警告: 非root用户运行,部分检查可能无法执行"
  fi

  # Every log helper appends to $REPORT_FILE. If that path cannot be written
  # (e.g. /var/log as a non-root user) each log line would fail, so switch to
  # a private temporary file, or to /dev/null when even that is impossible.
  # The probe runs in a subshell so a failed redirection cannot affect the
  # current shell.
  if ! ( : >> "$REPORT_FILE" ) 2>/dev/null; then
    fallback=$(mktemp "${TMPDIR:-/tmp}/system_check.XXXXXX" 2>/dev/null) \
      || fallback=/dev/null
    echo "警告: 无法写入报告文件 $REPORT_FILE, 报告将保存至 $fallback"
    REPORT_FILE=$fallback
  fi

  # Run all checks.
  check_basic_info
  check_cpu
  check_memory
  check_disk
  check_network
  check_processes
  check_logs
  generate_summary

  echo ""
  echo "=========================================="
  echo " 巡检完成!"
  echo "=========================================="
}
# Script entry point: run main, forwarding all command-line arguments.
main "$@"
