Add check: check_cpu (from Aurélien Grimal)
This commit is contained in:
parent
c0242386a1
commit
f81bae86bb
275
conf/usr/local/bin/check_cpu
Executable file
275
conf/usr/local/bin/check_cpu
Executable file
@ -0,0 +1,275 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
# Copyright © 2019 Aurélien Grimal - aurelien.grimal@tech-tips.fr
|
||||||
|
|
||||||
|
# This program is free software: you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
# the Free Software Foundation, either version 3 of the License, or
|
||||||
|
# any later version.
|
||||||
|
|
||||||
|
# This program is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
# GNU General Public License for more details.
|
||||||
|
|
||||||
|
# You should have received a copy of the GNU General Public License
|
||||||
|
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
#####
|
||||||
|
|
||||||
|
# Usage examples :
|
||||||
|
# 1) check_cpu
|
||||||
|
# 2) check_cpu --warn=50 --crit=75
|
||||||
|
# 3) check_cpu --all-cpus
|
||||||
|
|
||||||
|
#####
|
||||||
|
|
||||||
|
import argparse
import collections
import operator
import os
import pwd
import re
import sys
import time
import traceback

# Plugin return codes understood by Nagios-compatible schedulers
RC_OK = 0
RC_WARNING = 1
RC_CRITICAL = 2

# Shared with the top-level exception handler so that any text gathered
# before a crash is still displayed
result = {'rc': RC_OK, 'text': [], 'perfdata': [],
          'values': collections.OrderedDict()}
default_config_file = '/etc/stig/env_supervision'

# /proc/stat cpu columns, in kernel order
CPU_FIELDS = ('user', 'nice', 'system', 'idle', 'iowait', 'irq', 'softirq',
              'steal', 'guest', 'guest_nice')
# Only the first 8 columns are summed: guest and guest_nice are already
# accounted in user and nice
ACCOUNTED_FIELDS = 8
IDLE_COLUMN = 3
IOWAIT_COLUMN = 4
# Seconds to wait between the two samples when no previous check is stored
FIRST_RUN_DELAY = 10

re_cpu = re.compile(r'^cpu\s+((?:\d+\s)+\d+)\s*$')
re_cpu_core = re.compile(r'^(cpu\d+)\s+((?:\d+\s)+\d+)\s*$')

true_strings = ['1', 'true', 'True', 'yes', 'y', 'Yes']
false_strings = ['0', 'false', 'False', 'no', 'n', 'No']


def check_boolean(string):
    """Convert a configuration string to a boolean.

    Returns True / False for the values listed in true_strings /
    false_strings and raises ValueError (single, readable message) for
    anything else.
    """
    if string in true_strings:
        return True
    if string in false_strings:
        return False
    raise ValueError("I can't make a boolean out of that : " + repr(string))


def parse_threshold(value):
    """Convert a threshold string to a number.

    Whole values are returned as int so that perfdata shows '60' and not
    '60.0'; fractional values stay float. Raises ValueError if the string
    is not a number.
    """
    number = float(value)
    return int(number) if number.is_integer() else number


def config_value(line, name):
    """Return the value of a bash-style 'NAME=value' line.

    Returns None when the line does not start with 'NAME='. Surrounding
    whitespace and one pair of matching quotes are removed from the value.
    """
    prefix = name + '='
    if not line.startswith(prefix):
        return None
    value = line[len(prefix):].strip()
    if len(value) >= 2 and value[0] == value[-1] and value[0] in '"\'':
        value = value[1:-1]
    return value


def build_parser():
    """Create the command line parser of the plugin."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--warn",
        help="Threshold percent for warning (default 60)",
        type=int,
        default=60
    )
    parser.add_argument(
        "--crit",
        help="Threshold percent for critical (default 80)",
        type=int,
        default=80
    )
    parser.add_argument(
        "--all-cpus",
        help="Enable perfdata for each core",
        action='store_true'
    )
    parser.add_argument(
        "--config-file",
        help="Configuration file with bash-style variables declared (default file is " +
        default_config_file + ")\nCHECK_CPU_WARN=integer ([0-100])\n\nCHECK_CPU_CRIT=integer ([0-100])\n" +
        "CHECK_CPU_ALL_CPUS=boolean ([0|1])\nCHECK_CPU_ONLY_TOTAL=boolean ([0|1])",
        nargs=1
    )
    parser.add_argument(
        "--only-total",
        help="Display only the total usage for perfdata",
        action='store_true'
    )
    return parser


def apply_config_file(args, config_file):
    """Override the attributes of args with the values of config_file.

    Raises OSError if the file can't be read and ValueError if one of the
    CHECK_CPU_* values is malformed.
    """
    settings = (
        ('CHECK_CPU_WARN', 'warn', parse_threshold),
        ('CHECK_CPU_CRIT', 'crit', parse_threshold),
        ('CHECK_CPU_ALL_CPUS', 'all_cpus', check_boolean),
        ('CHECK_CPU_ONLY_TOTAL', 'only_total', check_boolean),
    )
    with open(config_file, 'r') as config:
        for line in config:
            for name, attribute, convert in settings:
                value = config_value(line, name)
                if value is not None:
                    setattr(args, attribute, convert(value))


def parse_settings(argv=None):
    """Parse the command line, then apply the configuration file.

    NOTE: values found in the configuration file take precedence over the
    command line; this is the historical behaviour of this plugin.
    A missing default configuration file is silently ignored; an unreadable
    file given with --config-file, or a malformed value, exits with code 2.
    """
    args = build_parser().parse_args(argv)
    explicit = args.config_file is not None
    config_file = args.config_file[0] if explicit else default_config_file
    try:
        apply_config_file(args, config_file)
    except OSError as e:
        if explicit:
            print("ERROR: can't read the file '" + config_file + "':", e)
            sys.exit(2)
    except ValueError as e:
        print("ERROR: reading the file '" + config_file + "',", e)
        sys.exit(2)
    return args


def threshold_errors(warn, crit):
    """Return the list of error messages for inconsistent thresholds."""
    errors = []
    if warn < 0:
        errors.append("ERROR: --warn can't be negative")
    elif warn > 100:
        errors.append("ERROR: --warn value exceeds 100")
    if crit < 0:
        errors.append("ERROR: --crit can't be negative")
    elif crit > 100:
        errors.append("ERROR: --crit value exceeds 100")
    if crit < warn:
        errors.append("ERROR: --crit value is less than --warn value")
    return errors


def read_proc_stat(stat, all_cpus=True):
    """Extract the cpu counters from an iterable of /proc/stat lines.

    Returns (cpu_result, cpu_cores_result): the counters of the aggregated
    'cpu' line as a list of int (empty if the line was not found) and an
    ordered mapping 'cpuN' -> list of int. Cores are only collected when
    all_cpus is true, and parsing stops at the first line following them.
    """
    cpu_result = []
    cpu_cores_result = collections.OrderedDict()
    for line in stat:
        # Look for the aggregated '^cpu ' line first
        if not cpu_result:
            found = re_cpu.match(line)
            if found is not None:
                cpu_result = [int(value) for value in found.group(1).split()]
                if not all_cpus:
                    break
            continue
        # Then for the '^cpu[0-9] ' lines, which directly follow it
        found = re_cpu_core.match(line)
        if found is None:
            break
        cpu_cores_result[found.group(1)] = [
            int(value) for value in found.group(2).split()
        ]
    return cpu_result, cpu_cores_result


def percent(part, total):
    """Return part/total as a truncated integer percentage (0 if total <= 0).

    Integer arithmetic is used because int((29 / 100) * 100) gives 28.
    """
    if total <= 0:
        return 0
    return part * 100 // total


def cpu_usage(current, previous):
    """Compute the usage percentages between two samples of cpu counters.

    current and previous are lists of int as returned by read_proc_stat.
    Columns missing on old kernels (guest, guest_nice, ...) count as 0.
    Returns an ordered mapping with 'total_usage' (everything except idle
    and iowait) followed by one entry per name of CPU_FIELDS.
    """
    width = len(CPU_FIELDS)
    current = (list(current) + [0] * width)[:width]
    previous = (list(previous) + [0] * width)[:width]
    deltas = list(map(operator.sub, current, previous))
    if any(delta < 0 for delta in deltas):
        # Counters went backwards: the previous sample is stale (reboot),
        # fall back to the averages since boot
        deltas = current
    total_time = sum(deltas[:ACCOUNTED_FIELDS])
    busy_time = total_time - deltas[IDLE_COLUMN] - deltas[IOWAIT_COLUMN]
    usage = collections.OrderedDict()
    usage['total_usage'] = percent(busy_time, total_time)
    for name, delta in zip(CPU_FIELDS, deltas):
        usage[name] = percent(delta, total_time)
    return usage


def state_file_path():
    """Return the per-user file where the last check data is stored."""
    current_user = pwd.getpwuid(os.getuid())[0]
    return '/tmp/.monitoring-' + current_user + '/proc_stat'


def load_previous_sample(path, all_cpus):
    """Return the counters stored by the previous check.

    If the file is missing or unusable, /proc/stat is sampled instead and
    the function waits FIRST_RUN_DELAY seconds so that the caller's next
    sample can be compared against it.
    """
    try:
        with open(path, 'r') as stat:
            sample = read_proc_stat(stat, all_cpus)
    except (OSError, ValueError):
        sample = ([], {})
    if sample[0]:
        return sample
    with open('/proc/stat', 'r') as stat:
        sample = read_proc_stat(stat, all_cpus)
    time.sleep(FIRST_RUN_DELAY)
    return sample


def save_sample(path, cpu_result, cpu_cores_result):
    """Store the counters for the next check.

    The directory lives under /tmp, so it is created private and must be
    a real directory owned by the current user (RuntimeError otherwise).
    The file is written aside and renamed so that a concurrent check never
    reads a partial file and a planted symlink is replaced, not followed.
    """
    directory = os.path.dirname(path)
    try:
        os.makedirs(directory, mode=0o700)
    except FileExistsError:
        pass
    if os.path.islink(directory) or os.stat(directory).st_uid != os.geteuid():
        raise RuntimeError("refusing to use '" + directory +
                           "': not a directory owned by the current user")
    lines = ['cpu ' + ' '.join(map(str, cpu_result))]
    for cpu_core, cpu_core_result in cpu_cores_result.items():
        lines.append(cpu_core + ' ' + ' '.join(map(str, cpu_core_result)))
    temporary = path + '.' + str(os.getpid())
    with open(temporary, 'w') as stat:
        stat.write('\n'.join(lines) + '\n')
    os.replace(temporary, path)


def build_perfdata(values, warn, crit, only_total=False):
    """Return the perfdata strings for a mapping cpu name -> usage mapping.

    Thresholds are only attached to the total usage of the whole CPU.
    """
    perfdata = []
    for cpu, usage in values.items():
        keys = ['total_usage'] if only_total else list(usage)
        for key in keys:
            if cpu == 'cpu' and key == 'total_usage':
                thresholds = str(warn) + ';' + str(crit)
            else:
                thresholds = ';'
            perfdata.append(cpu + '_' + key + '=' + str(usage[key]) + '%;' +
                            thresholds + ';0;100')
    return perfdata


def format_output(rc, text, perfdata):
    """Return the status line (without newline) displayed by the plugin."""
    if rc == RC_OK:
        status = "OK - " + text[0]
    elif rc == RC_WARNING:
        status = "WARNING: " + " - ".join(text)
    else:
        status = "CRITICAL: " + " - ".join(text)
    return status + " | " + " ".join(perfdata)


def main(argv=None):
    """Run the check, print its status line and return the plugin code."""
    args = parse_settings(argv)

    # Check arguments values
    errors = threshold_errors(args.warn, args.crit)
    if errors:
        print("\n".join(errors))
        sys.exit(2)

    # Previous sample first (this may sleep), then the current one
    last_check_file = state_file_path()
    prev_cpu_result, prev_cpu_cores_result = load_previous_sample(
        last_check_file, args.all_cpus)
    with open('/proc/stat', 'r') as stat:
        cpu_result, cpu_cores_result = read_proc_stat(stat, args.all_cpus)
    if not cpu_result:
        raise RuntimeError("no 'cpu' line found in /proc/stat")
    save_sample(last_check_file, cpu_result, cpu_cores_result)

    # For the whole CPU
    values = result['values']
    values['cpu'] = cpu_usage(cpu_result, prev_cpu_result)
    total_usage = values['cpu']['total_usage']
    if total_usage > args.crit:
        result['rc'] = RC_CRITICAL
    elif total_usage > args.warn:
        result['rc'] = RC_WARNING
    result['text'].append('CPU_USAGE=' + str(total_usage) + '%')

    # For each core known by both samples
    if args.all_cpus:
        for cpu_core, current in cpu_cores_result.items():
            if cpu_core in prev_cpu_cores_result:
                values[cpu_core] = cpu_usage(
                    current, prev_cpu_cores_result[cpu_core])

    result['perfdata'] = build_perfdata(
        values, args.warn, args.crit, args.only_total)
    print(format_output(result['rc'], result['text'], result['perfdata']))
    return result['rc']


if __name__ == '__main__':
    try:
        # The exit code carries the state to the monitoring system
        sys.exit(main())
    except Exception:
        print("CRITICAL:", traceback.format_exc())
        print("\n".join(result['text']))
        sys.exit(2)
|
Loading…
Reference in New Issue
Block a user