In Linux operations, CPU usage reaching 100% is a common but serious issue that severely degrades system performance. It can cause slow service responses or even complete unresponsiveness, directly impacting business operations. How can we automatically diagnose and address high CPU usage on Linux servers using a script?
When a server's CPU usage consistently reaches 100%, the first step is to identify whether a userspace process or the kernel is consuming significant CPU resources. Common causes include abnormal processes, infinite loops, resource contention, improper configuration, or application bugs. Manual diagnosis typically requires executing a series of commands: using top or htop to view overall CPU usage and identify high-CPU processes; using the ps command to further analyze process details; using strace to trace process system calls; or profiling with perf. However, in critical situations, manually executing these steps is time-consuming and error-prone.
To this end, we have developed an automated diagnostic script that systematically collects critical information and takes appropriate action. The following is the complete script code:
#!/bin/bash
# CPU 100% diagnostic script: global settings used by the functions below.

# ANSI color escape sequences. They are stored as literal backslash text and
# only become real escape characters when printed with `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color (reset)

# Log location: one timestamped log file per run inside LOG_DIR.
LOG_DIR="/tmp/cpu_analysis"
# Stop right away if the directory cannot be created; otherwise every later
# write to the log file would fail with its own error message.
if ! mkdir -p -- "$LOG_DIR"; then
  echo "错误: 无法创建日志目录 $LOG_DIR" >&2
  exit 1
fi
LOG_FILE="$LOG_DIR/cpu_analysis_$(date +%Y%m%d_%H%M%S).log"
#######################################
# Print a timestamped message and append the same line to the log file.
# Globals:   LOG_FILE (read) - file the line is appended to
# Arguments: $1 - message text; backslash escapes such as color codes are
#                 interpreted because the line is printed with `echo -e`
# Outputs:   "<YYYY-mm-dd HH:MM:SS> - <message>" on stdout and in LOG_FILE
#######################################
log() {
  # LOG_FILE is quoted so that a path containing spaces or glob characters
  # reaches tee as a single file name.
  echo -e "$(date '+%Y-%m-%d %H:%M:%S') - $1" | tee -a "$LOG_FILE"
}
#######################################
# Abort the script when a required command is not available.
# Arguments: $1 - name of the command to look up with `command -v`
# Outputs:   an error message through log() when the command is missing
# Returns:   0 when the command exists; otherwise the script exits with 1
#######################################
check_command() {
  local cmd_name=$1
  # Quoted on purpose: with an empty, unquoted name `command -v` receives no
  # operand and reports success, which would let the check pass silently.
  if ! command -v "$cmd_name" &> /dev/null; then
    log "${RED}错误: $cmd_name 命令未找到,请安装后再运行脚本${NC}"
    exit 1
  fi
}
#######################################
# Append a system overview to the log: the top CPU consumers, memory usage,
# and uptime with load averages.
# Globals:   LOG_FILE (read) - file the sections are appended to
# Outputs:   a progress message through log(); section output goes to LOG_FILE
#######################################
system_overview() {
  log "${GREEN}收集系统概览信息...${NC}"
  # One quoted redirect for the whole group instead of one per command.
  {
    echo "CPU使用率TOP 10进程:"
    # 11 lines = the ps header line plus the first ten processes.
    ps aux --sort=-%cpu | head -11
    echo ""
    echo "内存使用情况:"
    free -h
    echo ""
    echo "系统运行时间和平均负载:"
    uptime
    echo ""
  } >> "$LOG_FILE"
}
#######################################
# Find processes whose CPU usage is above a threshold.
# Globals:   high_cpu_processes (written) - one "PID CPU% COMMAND" line per
#              matching process; main iterates over it afterwards
#            LOG_FILE (read) - file the list is appended to
# Arguments: $1 - optional CPU% threshold, default 50
# Outputs:   progress/result messages through log(); the list goes to LOG_FILE
# Returns:   0 when at least one process exceeds the threshold, 1 otherwise
#######################################
identify_high_cpu_processes() {
  local threshold=${1:-50}
  log "${GREEN}识别高CPU占用进程...${NC}"
  # Deliberately NOT declared `local`: the caller reads this variable after
  # the function returns, and a local would be gone by then.
  # awk skips the ps header (NR>1) and prints the PID, %CPU and command columns.
  high_cpu_processes=$(ps aux --sort=-%cpu \
    | awk -v threshold="$threshold" 'NR>1 && $3 > threshold {print $2, $3, $11}')
  if [[ -z "$high_cpu_processes" ]]; then
    log "${YELLOW}未发现CPU占用超过${threshold}%的进程${NC}"
    return 1
  fi
  {
    echo "高CPU占用进程(PID, CPU%, 命令):"
    printf '%s\n' "$high_cpu_processes"
    echo ""
  } >> "$LOG_FILE"
  return 0
}
#######################################
# Append details about one process to the log: a ps summary, its command
# line from /proc, and the first lines of its lsof output.
# Globals:   LOG_FILE (read) - file the sections are appended to
# Arguments: $1 - PID of the process to inspect (decimal digits only)
# Outputs:   a progress or error message through log(); details go to LOG_FILE
# Returns:   1 when $1 is not a numeric PID (nothing is written), else 0
#######################################
analyze_process() {
  local pid=$1
  # Reject empty or non-numeric input up front; otherwise ps would be given a
  # malformed argument list and the /proc path would be wrong.
  if [[ ! "$pid" =~ ^[0-9]+$ ]]; then
    log "${RED}无效的进程ID: '$pid'${NC}"
    return 1
  fi
  log "${GREEN}分析进程 $pid ...${NC}"
  {
    echo "进程详细信息:"
    ps -p "$pid" -o pid,ppid,user,%cpu,%mem,vsz,rss,tty,stat,start,time,command
    echo ""
    echo "进程命令行参数:"
    # cmdline is NUL-separated, so xargs -0 turns it back into one line.
    # cat is kept in front so that a process which has already exited only
    # produces a suppressed cat error instead of a failed redirection.
    cat "/proc/$pid/cmdline" 2>/dev/null | xargs -0 echo
    echo ""
    echo "进程打开的文件:"
    lsof -p "$pid" 2>/dev/null | head -20
    echo ""
  } >> "$LOG_FILE"
}
#######################################
# Append system-wide statistics to the log: CPU core count, per-core usage,
# network interface counters, disk I/O and vmstat output.
# Globals:   LOG_FILE (read) - file the sections are appended to
# Outputs:   a progress message through log(); statistics go to LOG_FILE
#######################################
collect_system_stats() {
  log "${GREEN}收集系统状态信息...${NC}"
  # A single quoted redirect for the whole group. One of the original
  # per-line redirects pointed at an undefined variable, which made the
  # shell report "ambiguous redirect" and dropped that line.
  {
    echo "CPU核心数:"
    nproc
    echo ""
    echo "每个CPU核心的使用情况:"
    mpstat -P ALL 1 1
    echo ""
    # `sar -n DEV` reports network interface statistics, so the section is
    # labelled accordingly.
    echo "网络接口统计:"
    # sar is not among the commands the script verifies at start-up, so its
    # absence is recorded in the log instead of failing on the terminal.
    if command -v sar &> /dev/null; then
      sar -n DEV 1 1
    else
      echo "sar不可用,无法收集网络接口统计"
    fi
    echo ""
    echo "磁盘I/O统计:"
    iostat -x 1 1
    echo ""
    echo "内存统计:"
    # Two samples one second apart.
    vmstat 1 2
    echo ""
  } >> "$LOG_FILE"
}
#######################################
# When the given process is a Java process, append its thread stack dump
# (from jstack) to the log. Other processes are left alone.
# Globals:   LOG_FILE (read) - file the dump is appended to
# Arguments: $1 - PID of the process to inspect
# Outputs:   a progress message through log(); jstack output (stdout and
#            stderr) or a "jstack unavailable" note goes to LOG_FILE
# Returns:   0
#######################################
analyze_java_process() {
  local pid=$1
  # Guard clause: only continue when the command name of the process
  # contains "java".
  if ! ps -p "$pid" -o comm= | grep -q java; then
    return 0
  fi
  log "${GREEN}检测到Java进程,收集JVM信息...${NC}"
  # jstack is optional; record its absence rather than failing.
  if command -v jstack &> /dev/null; then
    {
      echo "Java线程栈跟踪:"
      jstack "$pid" 2>&1
      echo ""
    } >> "$LOG_FILE"
  else
    echo "jstack不可用,无法收集Java线程信息" >> "$LOG_FILE"
  fi
}
#######################################
# Append the report header (title, generation time, separator line) to the log.
# Globals:   LOG_FILE (read) - file the header is appended to
# Outputs:   a progress message through log(); the header goes to LOG_FILE
#######################################
generate_report() {
  log "${GREEN}生成分析报告...${NC}"
  # One quoted redirect for all three header lines.
  {
    echo "CPU 100%问题分析报告"
    echo "生成时间: $(date)"
    echo "=========================================="
  } >> "$LOG_FILE"
}
#######################################
# Terminate a process, escalating only when necessary. Use with care.
# Sends the default kill signal first, waits two seconds, and sends
# signal 9 only if the process is still listed by ps.
# Arguments: $1 - PID of the process to terminate
# Outputs:   progress and result messages through log()
#######################################
safe_kill_process() {
  local pid=$1
  local process_name
  # Declared and assigned separately so `local` does not hide the status of ps.
  process_name=$(ps -p "$pid" -o comm= 2>/dev/null)
  if [[ -z "$process_name" ]]; then
    log "${RED}进程 $pid 不存在${NC}"
    return
  fi
  log "${YELLOW}尝试安全结束进程 $pid ($process_name)...${NC}"
  # First attempt: a normal termination request.
  kill "$pid" 2>/dev/null
  # Give the process time to shut down.
  sleep 2
  # Process no longer listed: the normal termination worked.
  if ! ps -p "$pid" > /dev/null; then
    log "${GREEN}已正常终止进程 $pid${NC}"
    return
  fi
  log "${YELLOW}正常终止失败,尝试强制终止...${NC}"
  if kill -9 "$pid" 2>/dev/null; then
    log "${GREEN}已强制终止进程 $pid${NC}"
  else
    log "${RED}无法终止进程 $pid${NC}"
  fi
}
#######################################
# Entry point: verify required tools, write the report header, collect
# system information, then analyze each high-CPU process and ask whether
# it should be terminated.
# Globals:   high_cpu_processes (read) - "PID CPU% COMMAND" lines, one per
#              process; lines with an empty PID are skipped, which also
#              covers the variable being empty or unset
#            LOG_FILE (read) - only used in the closing message
# Inputs:    one y/N answer per listed process, read from stdin
# Outputs:   progress messages through log()
#######################################
main() {
  local required_cmd pid cpu_usage proc_cmd answer

  log "${GREEN}开始CPU 100%问题诊断...${NC}"

  # Verify the required commands; check_command exits the script on a miss.
  for required_cmd in ps top mpstat iostat vmstat lsof; do
    check_command "$required_cmd"
  done

  # Report header first, then system-wide information.
  generate_report
  system_overview
  collect_system_stats

  if identify_high_cpu_processes; then
    # The process list is fed to the loop on file descriptor 3 so that stdin
    # stays free for the interactive prompt. Feeding the list through a pipe
    # on stdin would make the prompt swallow the next process line as its
    # answer.
    while read -r pid cpu_usage proc_cmd <&3; do
      [[ -n "$pid" ]] || continue
      log "${RED}发现高CPU占用进程: PID=${pid}, CPU使用率=${cpu_usage}%, 命令=${proc_cmd}${NC}"
      analyze_process "$pid"
      # Extra JVM details when the process is a Java process.
      analyze_java_process "$pid"
      # Ask before terminating; anything other than y/Y (including EOF on
      # stdin) skips the process.
      answer=""
      read -r -p "是否终止此进程? (y/N): " answer
      case "$answer" in
        [Yy]*)
          safe_kill_process "$pid"
          ;;
        *)
          log "${YELLOW}跳过终止进程 $pid${NC}"
          ;;
      esac
    done 3<<< "$high_cpu_processes"
  fi

  log "${GREEN}诊断完成。详细日志请查看: $LOG_FILE${NC}"
  log "${GREEN}建议根据日志分析结果进一步调整系统配置或应用程序${NC}"
}
# Run the main function, passing along all command-line arguments.
main "$@"
This script is designed to provide a step-by-step, in-depth diagnosis of CPU issues. First, the script collects overall system status information, including CPU, memory, and load, to provide context for the problem. It then identifies the processes consuming the most CPU resources and obtains detailed information about them, including startup parameters, open files, and resource usage.
For Java applications, the script specifically adds JVM information collection capabilities, using the jstack command to obtain thread stack traces. This is particularly useful for diagnosing infinite loops or thread blocking issues in Java applications. The script also provides an interactive process termination function, but this requires administrator confirmation to avoid accidentally killing important processes.
After the script completes, a detailed log file is generated in the /tmp/cpu_analysis directory, containing all collected system information and process data. Administrators can analyze this information to identify the root cause of high CPU usage, such as application bugs, misconfiguration, resource contention, or insufficient hardware.
It should be noted that the script is only a diagnostic tool; it can help quickly identify problems, but fundamental resolution often requires further application optimization, system parameter adjustments, or hardware resource additions based on the diagnostic results. Regularly running system health checks, implementing monitoring and alerting mechanisms, and establishing performance baselines are all effective measures to prevent 100% CPU usage.