[Nagiosplug-devel] New plugin contrib : check_procl - %CPU/%MEM/CPUTIME checks
Jerome Tytgat
jtytgat at websurg.com
Thu Jul 4 23:48:02 CEST 2002
For those who are not on the mailing list:
+-------------------------------------------- BEGIN CUT
THERE -----------------------------------------+
#!/bin/bash
#
# Check_procl.sh
#
# Program: Process load check plugin for Nagios
# License : GPL
# Copyright (c) 2002 Jerome Tytgat (j.tytgat at sioban.net)
#
# check_procl.sh,v 1.1 2002/07/04 09:35
#
# Description :
#
# This plugin checks the %cpu, %mem or cputime of one or more processes
#
# Usage :
#
# check_procl.sh -p process1,process2,... -w a.b -c c.d --cpu
# check_procl.sh -p process1,process2,... -w a.b -c c.d --mem
# check_procl.sh -p process1,process2,... -w a:b:c -c d:e:f --cputime
#
# check_procl.sh -p %all% -e process1,process2,... -w <a.b | a:b:c> -c <c.d | d:e:f> <--cpu | --mem | --cputime>
# check_procl.sh -p %max% -e process1,process2,... -w <a.b | a:b:c> -c <c.d | d:e:f> <--cpu | --mem | --cputime>
#
# Example :
#
# To know the memory eaten by HTTPD processes, be warned when it reaches 50%
# and be critical when it reaches 75%
# check_procl.sh -p httpd -w 50.0 -c 75.0 --mem
# > OK - total %MEM for process httpd : 46.1
#
# To know the process which eats the most cpu time, but as we are under
# linux and are using kapm we do :
# check_procl.sh -p %max% -e kapmd-idle,kapmd -w 0:1:0 -c 0:2:0 --cputime
# > CRITICAL - total CPUTIME for process named : 02:32:10
#
# Tested on solaris 7/8, Linux Redhat 7.3 and Linux Suse 7.1
#
# BUGS : problems with handling time on solaris...
#######################################
# Print the command-line synopsis of the plugin.
# Globals:   none ($0 is printed as the program name)
# Arguments: none
# Outputs:   five lines on stdout
#######################################
help_usage() {
  echo "Usage:"
  echo " $0 -p <process_name1,process_name2,... | %all% | %max%>"
  # This sentence used to be split in two by a stray newline inside the string.
  echo " [-e <process_name1,process_name2,...>] -w warning -c critical < --cpu | --mem | --cputime>"
  echo " $0 (-v | --version)"
  echo " $0 (-h | --help)"
}
#######################################
# Print the version banner, licence notice and credits.
# Arguments: none
# Outputs:   seven lines on stdout
#######################################
help_version() {
  # Each sentence is emitted on one line; the mail-wrapped original printed
  # every long string as two lines.
  echo "check_procl.sh (nagios-plugins) 1.1"
  echo "The nagios plugins come with ABSOLUTELY NO WARRANTY. You may redistribute"
  echo "copies of the plugins under the terms of the GNU General Public License."
  echo "For more information about these matters, see the file named COPYING."
  echo "Copyright (c) 2002 Jerome Tytgat - j.tytgat at sioban.net"
  echo "Greetings goes to Websurg which kindly let me took time to develop this"
  echo " Manu Feig and Jacques Kern who were my beta testers, thanks to them !"
}
#######################################
# Make sure every command (external tool or shell builtin) the plugin relies
# on can be resolved by the shell.
# Arguments: optional list of command names to check; when omitted the
#            plugin's own requirement list is used.
# Outputs:   on the first missing command, a two-line message on stdout
# Returns:   returns normally when everything is found; otherwise terminates
#            the script with status 3
#######################################
verify_dep() {
  local -a tools=(bash cut egrep expr grep let ps sed sort tail test tr wc)
  local tool

  if [ $# -gt 0 ]; then
    tools=("$@")
  fi

  for tool in "${tools[@]}"; do
    # The original passed a stray "/dev/null" operand to `type`, which made
    # the lookup fail for every tool; only the tool itself is looked up now.
    if ! type "$tool" > /dev/null 2>&1; then
      echo "I am missing an important component : $tool"
      echo "Cannot continue, sorry, try to find the missing one..."
      exit 3
    fi
  done
}
# Name the script was started with; used later to filter ps output.
myself=$0
verify_dep

#######################################
# Print the full --help text: version banner, short description, synopsis
# and the description of every argument.
# Arguments: none
# Outputs:   help text on stdout (calls help_version and help_usage)
#######################################
print_help() {
  help_version
  echo ""
  echo "This plugin will check either the cumulutative %cpu, %mem or cputime"
  echo "of a process."
  echo ""
  help_usage
  echo ""
  echo "Required Arguments:"
  echo " -p, --process STRING1,STRING2,..."
  echo " names of the processes we want to monitor,"
  echo " you can add as much as process as you want, separated by comma,"
  echo " hey will be cumulated"
  echo " -p, --process %all%"
  echo " The special keyword %all% will check the cumulative cpu/mem/time of all process"
  echo " WARNING : Can be very slow on heavy loaded servers, watch your timeout !"
  echo " -p, --process %max%"
  echo " The special keyword %max% will check the process which eat the most"
  echo " WARNING : only select the process which eat the more, not the cumulative,"
  echo " but return the cumulative"
  echo " -w, --warning INTEGER.INTEGER or INTERGER:INTEGER:INTEGER"
  echo " generate warning state if process count is outside this range"
  echo " -c, --critical INTEGER.INTEGER or INTERGER:INTEGER:INTEGER"
  echo " generate critical state if process count is outside this range"
  echo " --cpu"
  echo " return the current cpu usage for the given process"
  echo " --mem"
  echo " return the current memory usage for the given process"
  echo " --cputime"
  echo " return the total cputime usage for the given process"
  echo ""
  echo "Optional Argument:"
  echo " -e, --exclude-process STRING1,STRING2,..."
  echo " names of the processes we want don't want to monitor"
  echo " only useful when associated with %all% or %max% keywords, else ignored"
  echo " ex : kapm-idled on linux is a process which eat memory / cputime but not really... ;-)"
  echo ""
}

# -h / --help: show the help and stop. Status 3 is kept from the original.
if [[ "$1" == "-h" || "$1" == "--help" ]]; then
  print_help
  exit 3
fi
# -v / --version: print the version banner and stop.
# Status 3 is kept from the original script.
case "$1" in
  -v | --version)
    help_version
    exit 3
    ;;
esac

# The shortest valid call is "-p X -w Y -c Z --<metric>", i.e. 7 words.
# "=" is counted as a word separator so that "--opt=value" counts as two.
# The arguments are fed through printf with "$@" quoted: unlike `echo $@`
# this cannot glob-expand an argument or swallow one that looks like an
# echo option (-n, -e).
arg_words=$(( $(printf '%s ' "$@" | tr "=" " " | wc -w) ))
if [ "$arg_words" -lt 7 ]; then
  echo "Bad arguments number (need at least 7)!"
  help_usage
  exit 3
fi

# Defaults filled in by the command-line parser.
tt=0                    # selected metric: 0 none, 1 --cpu, 2 --mem, 3 --cputime
process_name=""         # process names joined with "|"
exclude_process_name="" # excluded names joined with "|"
wt=""                   # raw warning threshold
ct=""                   # raw critical threshold
# Test of the command lines arguments

#######################################
# Abort with a usage error unless option $1 is followed by a non-empty value.
# Arguments: the remaining command line, option first
#######################################
need_value() {
  if [ $# -lt 2 ] || [ -z "$2" ]; then
    echo "Missing value for argument $1"
    help_usage
    exit 3
  fi
}

#######################################
# Record the metric to check; only one metric may be chosen per run.
# Globals:   tt, type_arg_aff, type_arg, delim (written)
# Arguments: $1 numeric code, $2 display label, $3 ps format keyword,
#            $4 field delimiter used by that metric
#######################################
select_metric() {
  if [ "${tt:-0}" -ne 0 ]; then
    echo "Only one of the arguments --cpu/--mem/--cputime can be used at a time !"
    help_usage
    exit 3
  fi
  tt=$1
  type_arg_aff=$2
  type_arg=$3
  delim=$4
}

#######################################
# Parse the plugin command line.
# Globals:   process_name, exclude_process_name (comma lists become "|"
#            lists), wt, ct, tt, type_arg_aff, type_arg, delim (written)
# Arguments: the script's command-line arguments
# Outputs:   an error line plus the usage text on stdout for a bad argument
# Returns:   returns normally on success; terminates the script with status 3
#            on a duplicate, unknown or incomplete argument
#######################################
parse_args() {
  while [ $# -gt 0 ]; do
    case "$1" in
      --*=*)
        # "--option=value" is split into two words and read again.
        set -- "${1%%=*}" "${1#*=}" "${@:2}"
        continue
        ;;
      -p | --process)
        if [ -n "$process_name" ]; then
          echo "Only one --process argument is useful..."
          help_usage
          exit 3
        fi
        need_value "$@"
        shift
        process_name=${1//,/|}
        ;;
      -e | --exclude-process)
        if [ -n "$exclude_process_name" ]; then
          echo "Only one --exclude-process argument is useful..."
          help_usage
          exit 3
        fi
        need_value "$@"
        shift
        exclude_process_name=${1//,/|}
        ;;
      -w | --warning)
        if [ -n "$wt" ]; then
          echo "Only one --warning argument needed... Trying to test bad things ? :-)"
          help_usage
          exit 3
        fi
        need_value "$@"
        shift
        wt=$1
        ;;
      -c | --critical)
        if [ -n "$ct" ]; then
          echo "Only one --critical argument needed... Trying to test bad things ? :-)"
          help_usage
          exit 3
        fi
        need_value "$@"
        shift
        ct=$1
        ;;
      --cpu)
        select_metric 1 "%CPU" "pcpu" "."
        ;;
      --mem)
        select_metric 2 "%MEM" "pmem" "."
        ;;
      --cputime)
        select_metric 3 "TIME" "time" ":"
        ;;
      *)
        echo "Unknown argument $1"
        help_usage
        exit 3
        ;;
    esac
    shift
  done
}

parse_args "$@"
# Is the process running ?
# Skipped for the %all% / %max% keywords, which do not name a process.
# The pattern (including its trailing "?", which makes the last character
# optional) is unchanged from the original.
# NOTE(review): the message says WARNING but the exit status is 3 (UNKNOWN);
# kept as in the original - confirm which one is intended.
if [[ "$process_name" != "%all%" && "$process_name" != "%max%" ]] \
  && [[ -z "$(ps -e | egrep "$process_name?")" ]]; then
  echo "WARNING: process $process_name not running !"
  exit 3
fi
# Cut of warning and critical values

# Without a metric there is no delimiter to split on; say so explicitly
# instead of reporting a misleading "Bad expression" for the thresholds.
if [ "${tt:-0}" -eq 0 ] || [ -z "$delim" ]; then
  echo "One of the arguments --cpu/--mem/--cputime is required !"
  help_usage
  exit 3
fi

wt_value1=$(printf '%s\n' "$wt" | cut -d"$delim" -f1)
wt_value2=$(printf '%s\n' "$wt" | cut -d"$delim" -f2)
ct_value1=$(printf '%s\n' "$ct" | cut -d"$delim" -f1)
ct_value2=$(printf '%s\n' "$ct" | cut -d"$delim" -f2)
if [ "$tt" -eq 3 ]; then
  # Only --cputime thresholds carry a third field (a:b:c).
  wt_value3=$(printf '%s\n' "$wt" | cut -d"$delim" -f3)
  ct_value3=$(printf '%s\n' "$ct" | cut -d"$delim" -f3)
else
  wt_value3=0
  ct_value3=0
fi

#######################################
# Succeed when every argument is a non-empty string made of digits only.
# Arguments: the values to check
#######################################
all_digits() {
  local field
  for field in "$@"; do
    case "$field" in
      '' | *[!0-9]*) return 1 ;;
    esac
  done
  return 0
}

# Integrity check of warning and critical values
if ! all_digits "$wt_value1" "$wt_value2" "$wt_value3"; then
  echo "Bad expression in the WARNING field : $wt"
  help_usage
  exit 3
fi
if ! all_digits "$ct_value1" "$ct_value2" "$ct_value3"; then
  echo "Bad expression in the CRITICAL field : $ct"
  help_usage
  exit 3
fi
# ps line construction set...

#######################################
# Filter "value command" lines read from stdin: remove the blanks ps puts in
# front of a leading digit, then keep one blank-separated column.
# Arguments: $1 column number (1 = value, 2 = command name)
#######################################
ps_column() {
  sed "s/^ *\([0-9]\)/\1/" | cut -d" " -f"$1"
}

# Lines dropped for %all% / %max%: the plugin itself, the ps header line and
# the names given with -e. The regular expressions are unchanged from the
# original, including the trailing "?".
if [ -z "$exclude_process_name" ]; then
  skip_re="$myself|$type_arg_aff?"
else
  skip_re="$myself|$type_arg_aff|$exclude_process_name?"
fi

case "$process_name" in
  %all%)
    psline=$(ps -eo "$type_arg,comm" | egrep -v "$skip_re" | ps_column 1)
    ;;
  %max%)
    # Pick the command of the line with the largest value. LC_ALL=C forces a
    # byte-wise sort so the blank padding of the value column is compared too,
    # which locale-aware collation may ignore.
    pstmp=$(ps -eo "$type_arg,comm" | egrep -v "$skip_re" \
      | LC_ALL=C sort | tail -1 | ps_column 2)
    # Then collect the value of every process carrying that command name.
    psline=$(ps -eo "$type_arg,comm" | grep -- "$pstmp" | ps_column 1)
    process_name=$pstmp
    ;;
  *)
    psline=$(ps -eo "$type_arg,comm" | egrep "$process_name?" | ps_column 1)
    ;;
esac
total1=0
total2=0
total3=0

#######################################
# Add up every value listed in $psline.
# Globals:   psline (read)  whitespace-separated values taken from ps
#            tt     (read)  3 = cputime values, anything else = percentages
#            delim  (read)  decimal separator of percentage values
#            total1, total2, total3 (written)
#              cputime    : hours, minutes, seconds
#              percentage : integer part, first decimal digit, 0
# Arguments: none
#######################################
accumulate_usage() {
  local item days hours mins secs whole frac
  local sum=0 # seconds for cputime, tenths of a percent otherwise

  # fetching the values
  for item in $psline; do
    # Anything that does not start with a digit is not a value.
    case "$item" in
      [0-9]*) ;;
      *) continue ;;
    esac

    if [ "${tt:-0}" -eq 3 ]; then
      # ps may prefix long times with a day count: "D-hh:mm:ss".
      days=0
      case "$item" in
        *-*)
          days=${item%%-*}
          item=${item#*-}
          ;;
      esac
      # Special case for solaris - several format exist for the time
      # function: a value with two fields only is taken as "mm:ss".
      case "$item" in
        *:*:*) ;;
        *) item="00:$item" ;;
      esac
      hours=${item%%:*}
      item=${item#*:}
      mins=${item%%:*}
      secs=${item#*:}
      # 10# forces base 10 so that "08" and "09" are not read as octal.
      sum=$((sum + ((10#${days:-0} * 24 + 10#${hours:-0}) * 60 + 10#${mins:-0}) * 60 + 10#${secs:-0}))
    else
      whole=${item%%"$delim"*}
      case "$item" in
        *"$delim"*) frac=${item#*"$delim"} ;;
        *) frac=0 ;;
      esac
      # Results are reported with one decimal digit; keep the first one.
      frac=${frac:0:1}
      sum=$((sum + 10#${whole:-0} * 10 + 10#${frac:-0}))
    fi
  done

  if [ "${tt:-0}" -eq 3 ]; then
    total1=$((sum / 3600))
    total2=$((sum % 3600 / 60))
    total3=$((sum % 60))
  else
    total1=$((sum / 10))
    total2=$((sum % 10))
    total3=0
  fi
}

accumulate_usage
warn=0
crit=0

#######################################
# Succeed when the triple ($1,$2,$3) has reached the triple ($4,$5,$6).
# Triples are compared field by field from the left; equal triples count as
# reached. All fields are read as base-10 integers (leading zeros allowed).
# Arguments: $1-$3 measured value, $4-$6 threshold
#######################################
triple_reached() {
  local a1=$((10#${1:-0})) a2=$((10#${2:-0})) a3=$((10#${3:-0}))
  local b1=$((10#${4:-0})) b2=$((10#${5:-0})) b3=$((10#${6:-0}))

  if [ "$a1" -ne "$b1" ]; then
    [ "$a1" -gt "$b1" ]
    return
  fi
  if [ "$a2" -ne "$b2" ]; then
    [ "$a2" -gt "$b2" ]
    return
  fi
  [ "$a3" -ge "$b3" ]
}

#######################################
# evaluation of the cumulative values vs warning and critical values
# Globals:   tt, total1-3, wt_value1-3, ct_value1-3 (read)
#            warn, crit (written: 1 when the threshold is reached, else 0)
#            return_total (written: "a.b" for --cpu/--mem, "hh:mm:ss" with
#            each field padded to two digits for --cputime)
# Arguments: none
#######################################
evaluate_thresholds() {
  warn=0
  crit=0
  case "$tt" in
    1 | 2)
      return_total="$total1.$total2"
      # Percentages have two fields only; the third is pinned to 0.
      triple_reached "$total1" "$total2" 0 "$ct_value1" "$ct_value2" 0 && crit=1
      triple_reached "$total1" "$total2" 0 "$wt_value1" "$wt_value2" 0 && warn=1
      ;;
    3)
      return_total=$(printf '%02d:%02d:%02d' \
        "$((10#${total1:-0}))" "$((10#${total2:-0}))" "$((10#${total3:-0}))")
      triple_reached "$total1" "$total2" "$total3" \
        "$ct_value1" "$ct_value2" "$ct_value3" && crit=1
      triple_reached "$total1" "$total2" "$total3" \
        "$wt_value1" "$wt_value2" "$wt_value3" && warn=1
      ;;
  esac
  return 0
}

evaluate_thresholds
# last check ...
# The critical threshold being reached while the warning one is not means
# the thresholds were given in the wrong order.
if [ "$crit" -eq 1 ] && [ "$warn" -eq 0 ]; then
  echo "Critical value must be greater than warning value !"
  help_usage
  exit 3
fi

# Process names are stored separated by "|"; show them comma separated, the
# way they were given on the command line.
shown_name=${process_name//|/,}

# Finally Inform Nagios of what we found...
# Exit status: 2 = CRITICAL, 1 = WARNING, 0 = OK. Every branch exits, so
# nothing after this statement can run.
if [ "$crit" -eq 1 ]; then
  echo "CRITICAL - total $type_arg_aff for process $shown_name : $return_total"
  exit 2
elif [ "$warn" -eq 1 ]; then
  echo "WARNING - total $type_arg_aff for process $shown_name : $return_total"
  exit 1
else
  echo "OK - total $type_arg_aff for process $shown_name : $return_total"
  exit 0
fi
+-------------------------------------------- END CUT
THERE -----------------------------------------+
Jérôme Tytgat
Network security Engineer
@ Websurg.com
mailto:j.tytgat at sioban.net / mailto:jtytgat at websurg.com
More information about the Devel
mailing list