#!/bin/bash
#
# Script that continuously runs "vmstat" to look for high "wa" (CPU wait
# state) and mails the output of "top", "iostat" and "iotop" if it rises
# above a configurable limit.
#
# The script runs until "vmstat" terminates or a HUP/INT/QUIT/TERM signal is
# received. Diagnostics go to stderr; reports are sent with "mail".

# Where to send reports
RECIPIENT="some-email-address@example.com"

# What is considered a "high" wait load? (vmstat style, 0-100)
HIGH_WA=75

# Run "vmstat" with this interval (seconds)
VMSTAT_INTERVAL=2

export PATH="/bin:/sbin:/usr/bin:/usr/sbin"

# Private scratch directory holding the vmstat FIFO and the report being built
TMPDIR=$( mktemp -t -d "alert-on-high-cpu-wait.XXXXXXXXXX" ) || {
    echo "unable to create temporary directory" >&2
    exit 1
}
mkfifo "$TMPDIR"/vmstat-fifo || {
    echo "unable to create FIFO in $TMPDIR" >&2
    rm -rf -- "$TMPDIR"
    exit 1
}

# Start the vmstat process and remember its PID so it can be stopped reliably
vmstat "$VMSTAT_INTERVAL" > "$TMPDIR"/vmstat-fifo &
VMSTAT_PID=$!

# Try to make sure we clean up after us.
# Arguments: $1 - name of the trapped signal (HUP, INT, QUIT, TERM or EXIT)
got_signal() {
    local SIGNAL=$1
    echo "caught signal $SIGNAL..." >&2
    # Make sure the EXIT trap does not run the cleanup a second time
    trap - EXIT
    # vmstat may already be gone (e.g. it died on its own); that is not an error
    kill "$VMSTAT_PID" 2>/dev/null
    rm -rf -- "$TMPDIR"
    if [ "$SIGNAL" != "EXIT" ] ; then
        # Stop right here: the scratch directory is gone, so a report that is
        # being generated can not be completed. Use the conventional
        # "128 + signal number" exit status.
        exit $(( 128 + $(kill -l "$SIGNAL") ))
    fi
}
for SIGNAL in HUP INT QUIT TERM EXIT ; do
    # $SIGNAL is expanded now on purpose, so each trap passes its own name
    # shellcheck disable=SC2064
    trap "got_signal $SIGNAL" "$SIGNAL"
done

# Number of upcoming data lines to skip after an alert has been sent
IGNORE_LINES=0
# Zero-based index of the "wa" column. Defaults to the 16th field and is
# refreshed from vmstat's own column header whenever one is seen.
WA_COLUMN=15

# IFS= keeps the leading blanks, so data lines stay aligned with the headers
# when they are echoed into a report
while IFS= read -r LINE ; do
    read -r -a FIELDS <<< "$LINE"

    if [[ "${FIELDS[0]}" == "procs" ]] ; then
        # First header line
        # procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu------
        HEADER1=$LINE
        continue
    elif [[ "${FIELDS[0]}" == "r" ]] ; then
        # Second header line
        #  r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
        HEADER2=$LINE
        for IDX in "${!FIELDS[@]}" ; do
            if [[ "${FIELDS[IDX]}" == "wa" ]] ; then
                WA_COLUMN=$IDX
                break
            fi
        done
        continue
    elif [ "$IGNORE_LINES" -gt 0 ] ; then
        # We're instructed to ignore some lines because of a recent alert
        IGNORE_LINES=$(( IGNORE_LINES - 1 ))
        continue
    fi

    # At last, some data!
    CPU_WA=${FIELDS[WA_COLUMN]}

    if [[ ! "$CPU_WA" =~ ^[0-9]+$ ]] ; then
        # Missing or non-numeric output in "wa" column?
        echo "confusing output from vmstat:" >&2
        echo "$HEADER1" >&2
        echo "$HEADER2" >&2
        echo "$LINE" >&2
        continue
    fi

    if [ "$CPU_WA" -gt "$HIGH_WA" ] ; then
        SUBJECT="Alarm triggered, CPU wait is $CPU_WA% > $HIGH_WA%"

        {
            echo "$SUBJECT"
            echo
            echo "$HEADER1"
            echo "$HEADER2"
            echo "$LINE"
            echo

            # Run top once in batch mode, with proc command line and include
            # dead children. Manually exclude processes with "0.0" in both
            # %CPU and %MEM
            top -b -n 1 -c -S | awk '$9 != "0.0" || $10 != "0.0" { print; }'
            echo

            # Run iostat twice with a two second interval. First block is
            # summary since system boot, second is the last two seconds.
            # Discard first block. The header is matched without a trailing
            # colon, since newer sysstat versions print "Device" only.
            iostat -dx 2 2 | awk '
                BEGIN { block = 0; }
                /^Device/ { block = block + 1; if (block >= 2) { print; }; next; }
                { if (block != 1) { print; } }'
            echo

            # Run iotop (if present) twice with a three second interval.
            # Discard the first block which is spit out immediately and seems
            # less than precise. Manually exclude processes with "0.00" in all
            # four of "DISK READ", "DISK WRITE", "SWAPIN" and "IO", since "-o"
            # seems to be ignored (or is too sensitive) on the second run.
            # Beware that SWAPIN and IO can be "-0.00 %", for some strange
            # reason.
            if command -v iotop > /dev/null ; then
                iotop -n 2 -d 3 -b -P | awk '
                    BEGIN { block = 0; }
                    /^Total / { block = block + 1; if (block >= 2) { print; } next; }
                    { if (block != 1 && ! ($4 == "0.00" && $6 == "0.00" && $8 ~ /^-?0\.00$/ && $10 ~ /^-?0\.00$/) ) { print; } }'
            else
                echo "(iotop is not installed, no per-process I/O statistics)"
            fi
            echo

            echo "__end__"
        } > "$TMPDIR"/mail-report

        # Email the report. RECIPIENT is left unquoted on purpose, so that it
        # may hold several whitespace-separated addresses.
        # shellcheck disable=SC2086
        mail -s "$SUBJECT" $RECIPIENT < "$TMPDIR"/mail-report

        # Set a "quarantine" flag to ignore the next N lines, so we don't send
        # alarms continuously
        IGNORE_LINES=$(( IGNORE_LINES + 12 ))
    fi
done < "$TMPDIR"/vmstat-fifo