From f81bae86bbd39b139d659ccb736aad9950037d51 Mon Sep 17 00:00:00 2001 From: Alban VIDAL Date: Sun, 29 Sep 2019 22:33:09 +0200 Subject: [PATCH] =?UTF-8?q?Add=20check:=20check=5Fcpu=20(from=20Aur=C3=A9l?= =?UTF-8?q?ien=20Grimal)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- conf/usr/local/bin/check_cpu | 275 +++++++++++++++++++++++++++++++++++ 1 file changed, 275 insertions(+) create mode 100755 conf/usr/local/bin/check_cpu diff --git a/conf/usr/local/bin/check_cpu b/conf/usr/local/bin/check_cpu new file mode 100755 index 0000000..3151639 --- /dev/null +++ b/conf/usr/local/bin/check_cpu @@ -0,0 +1,275 @@ +#!/usr/bin/env python3 + +# Copyright © 2019 Aurélien Grimal - aurelien.grimal@tech-tips.fr + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. 
#####

# Usage examples :
# 1) check_cpu
# 2) check_cpu --warn=50 --crit=75
# 3) check_cpu --all-cpus

#####

import argparse
import operator
import os
import pwd
import re
import sys
import time
import traceback

# Plugin state shared with the top-level error handler: exit code, status
# text, perfdata strings and the computed percentages keyed by cpu name.
result = {'rc': 0, 'text': [], 'perfdata': [], 'values': {}}
default_config_file = '/etc/stig/env_supervision'

# /proc/stat cpu columns, in file order
cpu_fields = (
    'user', 'nice', 'system', 'idle', 'iowait',
    'irq', 'softirq', 'steal', 'guest', 'guest_nice',
)
# Only the first 8 columns are summed into the total: guest and guest_nice
# are excluded because they are already accounted in user and nice.
accounted_fields = 8

# Raw strings: '\s' and '\d' are invalid escapes in a normal string literal.
re_cpu = re.compile(r'^cpu\s+((?:\d+\s)+\d+)\s*$')
re_cpu_core = re.compile(r'^(cpu\d+)\s+((?:\d+\s)+\d+)\s*$')

true_strings = ['1', 'true', 'True', 'yes', 'y', 'Yes']
false_strings = ['0', 'false', 'False', 'no', 'n', 'No']


def build_parser():
    """Return the argparse parser describing the command-line options."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--warn",
        help="Threshold percent for warning (default 60)",
        type=int,
        default=60,
    )
    parser.add_argument(
        "--crit",
        help="Threshold percent for critical (default 80)",
        type=int,
        default=80,
    )
    parser.add_argument(
        "--all-cpus",
        help="Enable perfdata for each core",
        action='store_true',
    )
    parser.add_argument(
        "--config-file",
        help="Configuration file with bash-style variables declared (default file is "
             + default_config_file + ")\nCHECK_CPU_WARN=integer ([0-100])\n\nCHECK_CPU_CRIT=integer ([0-100])\n"
             + "CHECK_CPU_ALL_CPUS=boolean ([0|1])\nCHECK_CPU_ONLY_TOTAL=boolean ([0|1])",
        nargs=1,
    )
    parser.add_argument(
        "--only-total",
        help="Display only the total usage for perfdata",
        action='store_true',
    )
    return parser


def check_boolean(string):
    """Convert a configuration string to a boolean.

    Returns True/False for the values listed in true_strings/false_strings.
    Raises ValueError for anything else.
    """
    if string in true_strings:
        return True
    if string in false_strings:
        return False
    # Single formatted message: passing two arguments to ValueError made the
    # error print as a tuple repr.
    raise ValueError('I can\'t make a boolean out of that : ' + repr(string))


def config_value(line):
    """Return the value part of a KEY=value line, without surrounding quotes."""
    return line.split('=', 1)[1].strip().strip('"\'')


def apply_config_file(args, config_file, explicit):
    """Override parsed arguments with the CHECK_CPU_* values found in config_file.

    Values from the file take precedence over the command line (the file is
    applied after argument parsing). A missing/unreadable file is an error
    only when it was requested explicitly with --config-file. Exits with
    status 2 on an unreadable explicit file or on an invalid value.
    """
    try:
        with open(config_file, 'r') as config:
            for line in config:
                line = line.rstrip()
                if line.startswith('CHECK_CPU_WARN='):
                    args.warn = float(config_value(line))
                elif line.startswith('CHECK_CPU_CRIT='):
                    args.crit = float(config_value(line))
                elif line.startswith('CHECK_CPU_ALL_CPUS='):
                    args.all_cpus = check_boolean(config_value(line))
                elif line.startswith('CHECK_CPU_ONLY_TOTAL='):
                    args.only_total = check_boolean(config_value(line))
    except IOError:
        if explicit:
            print("ERROR: the file '" + config_file + "' does not exist !")
            sys.exit(2)
    except ValueError as e:
        print("ERROR: reading the file '" + config_file + "',", e)
        sys.exit(2)


def thresholds_are_valid(warn, crit):
    """Print one error per invalid threshold and return False if any was found."""
    valid = True
    if warn < 0:
        print("ERROR: --warn can't be negative")
        valid = False
    elif warn > 100:
        print("ERROR: --warn value exceeds 100")
        valid = False
    if crit < 0:
        print("ERROR: --crit can't be negative")
        valid = False
    elif crit > 100:
        print("ERROR: --crit value exceeds 100")
        valid = False
    if crit < warn:
        print("ERROR: --crit value is less than --warn value")
        valid = False
    return valid


def read_proc_stat(stat, all_cpus=True):
    """Extract cpu counters from an iterable of /proc/stat-formatted lines.

    stat     -- iterable of lines (an open file or a list of strings)
    all_cpus -- when False, stop right after the aggregate 'cpu' line

    Returns (cpu_result, cpu_cores_result): the aggregate counters as a
    space-separated string (an empty list when no 'cpu' line was found) and
    a dict mapping 'cpuN' to that core's counters string.
    """
    cpu_result = []
    cpu_cores_result = {}
    for line in stat:
        if not cpu_result:
            # Still looking for the aggregate '^cpu ' line
            found = re_cpu.match(line)
            if found is not None:
                cpu_result = found.group(1)
                if not all_cpus:
                    break
            continue
        # Per-core lines directly follow the aggregate line
        found = re_cpu_core.match(line)
        if found is None:
            # Nothing of interest after the last '^cpu[0-9] ' line
            break
        cpu_cores_result[found.group(1)] = found.group(2)
    return cpu_result, cpu_cores_result


def compute_usage(current, previous):
    """Return usage percentages between two counter strings.

    current / previous -- space-separated jiffies counters in /proc/stat
    column order. Missing trailing columns (shorter lines) count as 0.

    Returns a dict with 'total_usage' (everything except idle and iowait)
    followed by one integer percentage per entry of cpu_fields. All values
    are 0 when no time elapsed between the two samples or when the counters
    went backwards (stale state file).
    """
    current_values = [int(value) for value in current.split()]
    previous_values = [int(value) for value in previous.split()]
    diff_values = list(map(operator.sub, current_values, previous_values))
    # Pad so that every field can be indexed even with fewer columns
    diff_values += [0] * (len(cpu_fields) - len(diff_values))

    total_time = sum(diff_values[:accounted_fields])
    if total_time <= 0:
        usage = {'total_usage': 0}
        for field in cpu_fields:
            usage[field] = 0
        return usage

    busy_time = total_time - diff_values[3] - diff_values[4]  # minus idle and iowait
    # Multiply before dividing: (29 / 100) * 100 is 28.999... and truncates to 28
    usage = {'total_usage': int(busy_time * 100 / total_time)}
    for field, delta in zip(cpu_fields, diff_values):
        usage[field] = int(delta * 100 / total_time)
    return usage


def load_previous_counters(last_check_file, all_cpus):
    """Return the counters saved by the previous run.

    When the state file is missing, unreadable or holds no 'cpu' line, take
    a sample from /proc/stat instead and wait 10 seconds so that the next
    sample has something to be compared with.
    """
    prev_cpu_result = []
    prev_cpu_cores_result = {}
    try:
        with open(last_check_file, 'r') as stat:
            prev_cpu_result, prev_cpu_cores_result = read_proc_stat(stat, all_cpus)
    except IOError:
        pass
    if not prev_cpu_result:
        with open('/proc/stat', 'r') as stat:
            prev_cpu_result, prev_cpu_cores_result = read_proc_stat(stat, all_cpus)
        time.sleep(10)
    return prev_cpu_result, prev_cpu_cores_result


def save_counters(last_check_file, cpu_result, cpu_cores_result):
    """Write the counters to last_check_file for the next check."""
    # NOTE(review): the state lives under a predictable path in /tmp and is
    # opened following symlinks - confirm this is acceptable on multi-user hosts.
    os.makedirs(os.path.dirname(last_check_file), exist_ok=True)
    with open(last_check_file, 'w') as stat:
        stat.write('cpu ' + cpu_result + '\n')
        for cpu_core, cpu_core_result in cpu_cores_result.items():
            stat.write(cpu_core + ' ' + cpu_core_result + '\n')


def build_perfdata(values, warn, crit, only_total):
    """Return the perfdata strings for every cpu entry of values.

    Thresholds are attached only to the aggregate 'cpu_total_usage' metric.
    With only_total, each cpu contributes its 'total_usage' metric only.
    """
    perfdata = []
    for cpu, keys in values.items():
        if only_total:
            keys = {'total_usage': keys['total_usage']}
        for key, value in keys.items():
            string = cpu + '_' + key + '=' + str(value) + '%;'
            if cpu == 'cpu' and key == 'total_usage':
                string += str(warn) + ';' + str(crit) + ';'
            else:
                string += ';;'
            string += '0;100'
            perfdata.append(string)
    return perfdata


def main():
    """Run the check, print the plugin output and exit with the plugin status."""
    try:
        args = build_parser().parse_args()

        # Define the config file to use
        explicit = args.config_file is not None
        config_file = args.config_file[0] if explicit else default_config_file
        apply_config_file(args, config_file, explicit)

        # Check arguments values
        if not thresholds_are_valid(args.warn, args.crit):
            sys.exit(2)

        # Define where we store the last check data
        current_user = pwd.getpwuid(os.getuid())[0]
        last_check_file = '/tmp/.monitoring-' + current_user + '/proc_stat'

        prev_cpu_result, prev_cpu_cores_result = load_previous_counters(
            last_check_file, args.all_cpus)

        # Read /proc/stat file and store it for the next check
        with open('/proc/stat', 'r') as stat:
            cpu_result, cpu_cores_result = read_proc_stat(stat, args.all_cpus)
        if not cpu_result:
            raise RuntimeError("no 'cpu' line found in /proc/stat")
        save_counters(last_check_file, cpu_result, cpu_cores_result)

        # For the whole CPU
        cpu_usage = compute_usage(cpu_result, prev_cpu_result)
        total_usage = cpu_usage['total_usage']
        if total_usage > args.warn:
            if total_usage > args.crit:
                result['rc'] = 2
            else:
                result['rc'] = 1
        result['text'].append('CPU_USAGE=' + str(total_usage) + '%')
        result['values']['cpu'] = cpu_usage

        # For each core known by both the previous and the current sample
        if args.all_cpus:
            for cpu_core in cpu_cores_result:
                if cpu_core in prev_cpu_cores_result:
                    result['values'][cpu_core] = compute_usage(
                        cpu_cores_result[cpu_core], prev_cpu_cores_result[cpu_core])

        #
        # PERFDATA
        #

        result['perfdata'].extend(
            build_perfdata(result['values'], args.warn, args.crit, args.only_total))

        #
        # OUTPUT AND EXIT
        #

        if result['rc'] == 0:
            print("OK -", result['text'][0], end='')
        elif result['rc'] == 1:
            print("WARNING:", " - ".join(result['text']), end='')
        else:
            print("CRITICAL:", " - ".join(result['text']), end='')

        print(" |", " ".join(result['perfdata']))

        # The monitoring system reads the state from the exit status, not
        # from the text: without this every result was reported as OK.
        sys.exit(result['rc'])

    except Exception:
        print("CRITICAL:", traceback.format_exc())
        print("\n".join(result['text']))
        sys.exit(2)


if __name__ == '__main__':
    main()