Add check: check_cpu (from Aurélien Grimal)
This commit is contained in:
parent
c0242386a1
commit
f81bae86bb
275
conf/usr/local/bin/check_cpu
Executable file
275
conf/usr/local/bin/check_cpu
Executable file
@ -0,0 +1,275 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Copyright © 2019 Aurélien Grimal - aurelien.grimal@tech-tips.fr
|
||||
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# any later version.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
#####
|
||||
|
||||
# Usage examples :
|
||||
# 1) check_cpu
|
||||
# 2) check_cpu --warn=50 --crit=75
|
||||
# 3) check_cpu --all-cpus
|
||||
|
||||
#####
|
||||
|
||||
import sys
|
||||
|
||||
# Accumulator for the outcome of the check: exit status ('rc'), status-line
# fragments ('text'), perfdata entries ('perfdata') and the computed
# percentages keyed by CPU name ('values').
result = dict(rc=0, text=[], perfdata=[], values={})

# Configuration file used when --config-file is not given.
default_config_file = '/etc/stig/env_supervision'
|
||||
|
||||
try:
|
||||
|
||||
import re, argparse, traceback, time, operator, os, pwd
|
||||
|
||||
# Command-line interface. Each entry pairs an option flag with the keyword
# arguments handed to add_argument(); options are registered in listed order.
parser = argparse.ArgumentParser()
cli_options = (
    ("--warn", {
        'help': "Threshold percent for warning (default 60)",
        'type': int,
        'default': 60,
    }),
    ("--crit", {
        'help': "Threshold percent for critical (default 80)",
        'type': int,
        'default': 80,
    }),
    ("--all-cpus", {
        'help': "Enable perfdata for each core",
        'action': 'store_true',
    }),
    ("--config-file", {
        'help': "Configuration file with bash-style variables declared (default file is "
                + default_config_file
                + ")\nCHECK_CPU_WARN=integer ([0-100])\n\nCHECK_CPU_CRIT=integer ([0-100])\n"
                + "CHECK_CPU_ALL_CPUS=boolean ([0|1])\nCHECK_CPU_ONLY_TOTAL=boolean ([0|1])",
        # nargs=1 makes argparse store the value as a one-element list
        'nargs': 1,
    }),
    ("--only-total", {
        'help': "Display only the total usage for perfdata",
        'action': 'store_true',
    }),
)
for flag, settings in cli_options:
    parser.add_argument(flag, **settings)

args = parser.parse_args()
|
||||
|
||||
# Pick the configuration file: the one given with --config-file when present
# (stored by argparse as a one-element list), the default one otherwise.
config_file = default_config_file if args.config_file is None else args.config_file[0]
|
||||
|
||||
# Spellings accepted as boolean values (compared case-insensitively)
true_strings = ['1', 'true', 'True', 'yes', 'y', 'Yes']
false_strings = ['0', 'false', 'False', 'no', 'n', 'No']


def check_boolean(string):
    """Convert a textual boolean to a bool.

    The value is stripped of surrounding whitespace and compared
    case-insensitively against ``true_strings`` and ``false_strings``, so
    every spelling listed there keeps working and variants such as ``TRUE``
    or ``Y`` are accepted as well.

    Args:
        string: the text to interpret.

    Returns:
        True or False depending on which list the value belongs to.

    Raises:
        ValueError: the value matches neither list. The exception carries a
            single formatted message so that printing it yields readable
            text instead of the repr of an argument tuple.
    """
    normalized = string.strip().lower()
    if normalized in {candidate.lower() for candidate in true_strings}:
        return True
    if normalized in {candidate.lower() for candidate in false_strings}:
        return False
    raise ValueError("I can't make a boolean out of that : " + repr(string))
|
||||
|
||||
# Override the settings with the values declared in the configuration file.
# Recognised lines look like KEY=value; every other line is ignored.
# Maps each recognised key to the attribute of `args` it sets and to the
# function converting its textual value.
config_parsers = {
    'CHECK_CPU_WARN': ('warn', float),
    'CHECK_CPU_CRIT': ('crit', float),
    'CHECK_CPU_ALL_CPUS': ('all_cpus', check_boolean),
    'CHECK_CPU_ONLY_TOTAL': ('only_total', check_boolean),
}
try:
    with open(config_file, 'r') as config:
        for line in config:
            key, separator, value = line.rstrip().partition('=')
            if not separator or key not in config_parsers:
                continue
            # The file is bash-style, so tolerate values wrapped in a pair
            # of matching single or double quotes.
            value = value.strip()
            if len(value) >= 2 and value[0] == value[-1] and value[0] in '"\'':
                value = value[1:-1]
            attribute, convert = config_parsers[key]
            setattr(args, attribute, convert(value))
except IOError as e:
    # Failing to read the default file is silently ignored; it is only an
    # error when the file was explicitly requested with --config-file.
    if args.config_file is not None:
        # Report the actual reason: the file may exist but be unreadable.
        print("ERROR: can't read the file '" + config_file + "':", e.strerror)
        sys.exit(2)
except ValueError as e:
    # Raised by float() or check_boolean() on a malformed value
    print("ERROR: reading the file '" + config_file + "',", e)
    sys.exit(2)
|
||||
|
||||
# Validate the thresholds: both must lie within [0, 100] and the critical
# one may not be lower than the warning one. Every problem is reported
# before exiting so that all of them are visible in a single run.
error = False
for option, threshold in (('warn', args.warn), ('crit', args.crit)):
    if threshold < 0:
        print("ERROR: --" + option + " can't be negative")
        error = True
    elif threshold > 100:
        print("ERROR: --" + option + " value exceeds 100")
        error = True
if args.crit < args.warn:
    print("ERROR: --crit value is less than --warn value")
    error = True

if error:
    sys.exit(2)
|
||||
|
||||
# /proc/stat cpu columns :
# user, nice, system, idle, iowait, irq, softirq, steal, guest, guest_nice
#
# The patterns are raw strings so that \s and \d reach the regex engine
# untouched instead of being parsed as (invalid) string escape sequences.

# Aggregate line: "cpu" followed by its counters, captured as group 1.
re_cpu = re.compile(r'^cpu\s+((?:\d+\s)+\d+)\s*$')
# Per-core line: the core name "cpuN" (group 1) followed by its counters (group 2).
re_cpu_core = re.compile(r'^(cpu\d+)\s+((?:\d+\s)+\d+)\s*$')
|
||||
|
||||
# Location of the state file holding the counters seen by the previous run.
# It lives in a directory named after the account running the check.
# NOTE(review): predictable path under /tmp — confirm this is acceptable on
# multi-user hosts.
current_user = pwd.getpwuid(os.getuid()).pw_name
last_check_file = os.path.join('/tmp/.monitoring-' + current_user, 'proc_stat')
|
||||
|
||||
def read_proc_stat(stat):
    """Extract the CPU counter lines from /proc/stat-formatted content.

    Lines are skipped until the aggregate "cpu" line is found. When
    ``args.all_cpus`` is not set, reading stops right there; otherwise the
    "cpuN" lines that follow are collected until the first line that is not
    a per-core line.

    Args:
        stat: an iterable of text lines (for instance an open file).

    Returns:
        A ``(total, cores)`` tuple. ``total`` is the counters string of the
        aggregate line, or an empty list when no such line was found.
        ``cores`` maps each core name to its counters string.
    """
    total_counters = []
    per_core_counters = {}
    for line in stat:
        if not total_counters:
            # Still looking for the aggregate "cpu" line
            total_match = re_cpu.match(line)
            if total_match is None:
                continue
            total_counters = total_match.group(1)
            if not args.all_cpus:
                break
            continue
        # The aggregate line has been seen: only "cpuN" lines are of interest
        core_match = re_cpu_core.match(line)
        if core_match is None:
            # Nothing relevant comes after the per-core lines
            break
        core_name, core_counters = core_match.groups()
        per_core_counters[core_name] = core_counters
    return total_counters, per_core_counters
|
||||
|
||||
# Read the counters recorded by the previous check
prev_cpu_result = []
prev_cpu_cores_result = {}
previous_check_file_exists = False
try:
    with open(last_check_file, 'r') as stat:
        prev_cpu_result, prev_cpu_cores_result = read_proc_stat(stat)
    previous_check_file_exists = True
except IOError:
    # No readable state file: handled below
    pass

# Without a usable baseline (state file missing, or present but holding no
# "cpu" line, e.g. empty or truncated), sample /proc/stat now and wait
# 10 seconds so that the next reading has something to be compared with.
if not prev_cpu_result:
    with open('/proc/stat', 'r') as stat:
        prev_cpu_result, prev_cpu_cores_result = read_proc_stat(stat)
    time.sleep(10)
|
||||
|
||||
# Read the current counters from /proc/stat
cpu_result = []
cpu_cores_result = {}
with open('/proc/stat', 'r') as stat:
    cpu_result, cpu_cores_result = read_proc_stat(stat)
if not cpu_result:
    # Fail with an explicit message rather than with a TypeError when the
    # counters get concatenated below.
    raise RuntimeError("no 'cpu' line found in /proc/stat")

# Create the directory that will store the results for the next check
os.makedirs(os.path.dirname(last_check_file), exist_ok=True)

# Write the results for the next check, using the same "name counters"
# line layout as /proc/stat so that read_proc_stat() can parse them back.
state_lines = ['cpu ' + cpu_result + '\n']
state_lines.extend(
    core_name + ' ' + core_counters + '\n'
    for core_name, core_counters in cpu_cores_result.items()
)
with open(last_check_file, 'w') as stat:
    stat.writelines(state_lines)
|
||||
|
||||
# Names of the /proc/stat cpu counters, in column order
cpu_columns = (
    'user', 'nice', 'system', 'idle', 'iowait',
    'irq', 'softirq', 'steal', 'guest', 'guest_nice',
)


def _usage_percentages(current, previous):
    """Turn two readings of a /proc/stat cpu line into percentages.

    Args:
        current: whitespace-separated counters of the latest reading.
        previous: whitespace-separated counters of the earlier reading.

    Returns:
        A dict holding 'total_usage' (share of the elapsed time spent
        neither in idle nor in iowait) followed by one entry per name in
        ``cpu_columns``, all as integer percentages of the elapsed time.
        Every entry is 0 when no time elapsed between the two readings (or
        when the counters went backwards), which avoids a division by zero.
    """
    current_values = [int(value) for value in current.split()]
    previous_values = [int(value) for value in previous.split()]
    diff_values = list(map(operator.sub, current_values, previous_values))
    # Readings with fewer columns than expected count the missing ones as 0
    diff_values += [0] * (len(cpu_columns) - len(diff_values))

    # exclude guest and guest_nice because they are accounted in user and nice
    total_time = sum(diff_values[:8])

    percentages = {}
    if total_time > 0:
        busy_time = total_time - diff_values[3] - diff_values[4]  # minus idle and iowait
        percentages['total_usage'] = int((busy_time / total_time) * 100)
        for index, column in enumerate(cpu_columns):
            percentages[column] = int((diff_values[index] / total_time) * 100)
    else:
        percentages['total_usage'] = 0
        for column in cpu_columns:
            percentages[column] = 0
    return percentages


# For the whole CPU
result['values']['cpu'] = _usage_percentages(cpu_result, prev_cpu_result)
total_usage = result['values']['cpu']['total_usage']
if total_usage > args.warn:
    if total_usage > args.crit:
        result['rc'] = 2
    else:
        result['rc'] = 1
result['text'].append('CPU_USAGE=' + str(total_usage) + '%')

# For each core: only those present in both readings can be compared
if args.all_cpus:
    for cpu_core in cpu_cores_result:
        if cpu_core in prev_cpu_cores_result:
            result['values'][cpu_core] = _usage_percentages(
                cpu_cores_result[cpu_core],
                prev_cpu_cores_result[cpu_core],
            )
|
||||
|
||||
#
# PERFDATA
#

# One entry per reported metric, formatted as
# "<cpu>_<metric>=<value>%;<warn>;<crit>;0;100". The thresholds are only
# filled in for the total usage of the whole CPU; they stay empty elsewhere.
for cpu, keys in result['values'].items():
    if args.only_total:
        # Keep nothing but the total usage of each CPU
        keys = {'total_usage': keys['total_usage']}
    for key, value in keys.items():
        if cpu == 'cpu' and key == 'total_usage':
            thresholds = [str(args.warn), str(args.crit)]
        else:
            thresholds = ['', '']
        fields = [cpu + '_' + key + '=' + str(value) + '%'] + thresholds + ['0', '100']
        result['perfdata'].append(';'.join(fields))
|
||||
|
||||
#
# OUTPUT AND EXIT
#

# Status line, chosen from the return code computed above
if result['rc'] == 0:
    print("OK -", result['text'][0], end='')
elif result['rc'] == 1:
    print("WARNING:", " - ".join(result['text']), end='')
else:
    print("CRITICAL:", " - ".join(result['text']), end='')

# Perfdata goes on the same line, after the pipe separator
print(" |", " ".join(result['perfdata']))

# Report the state through the exit status as well: without this the
# process ends with status 0 whatever was printed. SystemExit is not a
# subclass of Exception, so an enclosing `except Exception` handler does
# not intercept it.
sys.exit(result['rc'])
|
||||
|
||||
except Exception:
|
||||
print("CRITICAL:", traceback.format_exc())
|
||||
print("\n".join(result['text']))
|
||||
sys.exit(2)
|
Loading…
Reference in New Issue
Block a user