#!/usr/bin/env python3

# Copyright © 2019 Aurélien Grimal - aurelien.grimal@tech-tips.fr

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

#####
# Usage examples :
#   1) check_cpu
#   2) check_cpu --warn=50 --crit=75
#   3) check_cpu --all-cpus
#####

"""Monitoring check reporting CPU usage computed from /proc/stat.

The usage is the difference between the current /proc/stat counters and the
counters saved by the previous run (stored in a per-user file under /tmp).
When no usable previous sample exists, the script samples /proc/stat, waits
10 seconds and samples again.

Exit status: 0 = OK, 1 = WARNING, 2 = CRITICAL (or any error).
"""

import argparse
import operator
import os
import pwd
import re
import sys
import time
import traceback

result = {'rc': 0, 'text': [], 'perfdata': [], 'values': {}}
default_config_file = '/etc/zorval/env_check_cpu'

# Strings accepted as booleans in the configuration file
true_strings = ['1', 'true', 'True', 'yes', 'y', 'Yes']
false_strings = ['0', 'false', 'False', 'no', 'n', 'No']

# /proc/stat cpu columns, in file order
CPU_FIELDS = (
    'user', 'nice', 'system', 'idle', 'iowait',
    'irq', 'softirq', 'steal', 'guest', 'guest_nice',
)

# Raw strings: '\s' and '\d' are regex escapes, not Python string escapes
re_cpu = re.compile(r'^cpu\s+((?:\d+\s)+\d+)\s*$')
re_cpu_core = re.compile(r'^(cpu\d+)\s+((?:\d+\s)+\d+)\s*$')


def check_boolean(string):
    """Convert a configuration string to a boolean.

    Returns True/False when `string` is listed in `true_strings` /
    `false_strings`; raises ValueError for anything else.
    """
    if string in true_strings:
        return True
    if string in false_strings:
        return False
    # Single-argument exception so that printing it yields a readable message
    raise ValueError("I can't make a boolean out of that : " + repr(string))


def parse_arguments(argv=None):
    """Parse the command line (`argv` defaults to sys.argv[1:])."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--warn",
        help="Threshold percent for warning (default 60)",
        type=int,
        default=60
    )
    parser.add_argument(
        "--crit",
        help="Threshold percent for critical (default 80)",
        type=int,
        default=80
    )
    parser.add_argument(
        "--all-cpus",
        help="Enable perfdata for each core",
        action='store_true'
    )
    parser.add_argument(
        "--config-file",
        help="Configuration file with bash-style variables declared (default file is " +
             default_config_file + ")\nCHECK_CPU_WARN=integer ([0-100])\n\nCHECK_CPU_CRIT=integer ([0-100])\n" +
             "CHECK_CPU_ALL_CPUS=boolean ([0|1])\nCHECK_CPU_ONLY_TOTAL=boolean ([0|1])",
        nargs=1
    )
    parser.add_argument(
        "--only-total",
        help="Display only the total usage for perfdata",
        action='store_true'
    )
    return parser.parse_args(argv)


def load_config_file(args):
    """Override `args` in place with the values found in the config file.

    The file is read after the command line has been parsed, so a value set
    in the file wins over the matching command-line option.  A missing file
    is only an error when it was requested explicitly with --config-file.
    Prints an error and exits with status 2 on an unreadable explicit file
    or on an invalid value.
    """
    if args.config_file is not None:
        config_file = args.config_file[0]
    else:
        config_file = default_config_file

    try:
        with open(config_file, 'r') as config:
            for line in config:
                key, separator, value = line.rstrip().partition('=')
                if not separator:
                    continue
                if key == 'CHECK_CPU_WARN':
                    args.warn = float(value)
                elif key == 'CHECK_CPU_CRIT':
                    args.crit = float(value)
                elif key == 'CHECK_CPU_ALL_CPUS':
                    args.all_cpus = check_boolean(value)
                elif key == 'CHECK_CPU_ONLY_TOTAL':
                    args.only_total = check_boolean(value)
    except IOError:
        if args.config_file is not None:
            print("ERROR: the file '" + config_file + "' does not exist !")
            sys.exit(2)
    except ValueError as e:
        print("ERROR: reading the file '" + config_file + "',", e)
        sys.exit(2)


def validate_thresholds(args):
    """Print every threshold error found and exit with status 2 if any."""
    error = False
    if args.warn < 0:
        print("ERROR: --warn can't be negative")
        error = True
    elif args.warn > 100:
        print("ERROR: --warn value exceeds 100")
        error = True
    if args.crit < 0:
        print("ERROR: --crit can't be negative")
        error = True
    elif args.crit > 100:
        print("ERROR: --crit value exceeds 100")
        error = True
    if args.crit < args.warn:
        print("ERROR: --crit value is less than --warn value")
        error = True
    if error:
        sys.exit(2)


def read_proc_stat(stat, all_cpus=False):
    """Extract the cpu counters from an iterable of /proc/stat-style lines.

    Returns a tuple `(cpu, cores)`:
      - `cpu` is the counters of the aggregate 'cpu' line as one string, or
        None when no such line was found;
      - `cores` maps 'cpuN' to its counters string; it is only filled when
        `all_cpus` is true.
    """
    cpu_counters = None
    cores_counters = {}
    for line in stat:
        if cpu_counters is None:
            # Still looking for the aggregate '^cpu ' line
            match = re_cpu.match(line)
            if match is not None:
                cpu_counters = match.group(1)
                if not all_cpus:
                    break
            continue
        match = re_cpu_core.match(line)
        if match is None:
            # Stop at the first line that is not a '^cpu[0-9]+ ' line
            break
        cores_counters[match.group(1)] = match.group(2)
    return cpu_counters, cores_counters


def _percent(part, total):
    """Return part/total as a whole percentage truncated toward zero.

    Integer arithmetic avoids float artefacts such as
    int(29 / 100 * 100) == 28.  `total` must be positive.
    """
    quotient = abs(part) * 100 // total
    return quotient if part >= 0 else -quotient


def compute_usage(current, previous):
    """Compute usage percentages between two counters strings.

    `current` and `previous` are whitespace-separated counters as returned
    by read_proc_stat().  Returns a dict with 'total_usage' followed by one
    entry per name in CPU_FIELDS.  Columns missing from the input are
    reported as 0.  When the elapsed time is not positive (identical
    samples, or counters lower than the saved ones), every value is 0.
    """
    current_values = map(int, current.split())
    previous_values = map(int, previous.split())
    diff_values = list(map(operator.sub, current_values, previous_values))
    # Pad so that every known column can be indexed
    diff_values += [0] * (len(CPU_FIELDS) - len(diff_values))

    # exclude guest and guest_nice because they are accounted in user and nice
    total_time = sum(diff_values[:8])

    usage = {'total_usage': 0}
    if total_time <= 0:
        usage.update(dict.fromkeys(CPU_FIELDS, 0))
        return usage

    # Busy time is everything except idle and iowait
    busy_time = total_time - diff_values[3] - diff_values[4]
    usage['total_usage'] = _percent(busy_time, total_time)
    for name, diff in zip(CPU_FIELDS, diff_values):
        usage[name] = _percent(diff, total_time)
    return usage


def build_perfdata(values, warn, crit, only_total=False):
    """Format the perfdata entries for every cpu in `values`.

    Only the aggregate 'cpu_total_usage' entry carries the warn/crit
    thresholds.  With `only_total`, only the 'total_usage' key of each cpu
    is emitted.
    """
    perfdata = []
    for cpu, keys in values.items():
        if only_total:
            keys = {'total_usage': keys['total_usage']}
        for key, value in keys.items():
            if cpu == 'cpu' and key == 'total_usage':
                thresholds = str(warn) + ';' + str(crit)
            else:
                thresholds = ';'
            perfdata.append(
                cpu + '_' + key + '=' + str(value) + '%;' + thresholds + ';0;100'
            )
    return perfdata


def main(argv=None):
    """Run the check, print the status line and exit with the status code."""
    try:
        args = parse_arguments(argv)
        load_config_file(args)
        validate_thresholds(args)

        # Define where we store the last check data
        current_user = pwd.getpwuid(os.getuid())[0]
        last_check_file = '/tmp/.monitoring-' + current_user + '/proc_stat'

        # Read /proc/stat values registered from previous check
        prev_cpu_result = None
        prev_cpu_cores_result = {}
        try:
            with open(last_check_file, 'r') as stat:
                prev_cpu_result, prev_cpu_cores_result = read_proc_stat(stat, args.all_cpus)
        except IOError:
            pass

        # If the file is absent (or holds no cpu line), sample the current
        # /proc/stat file and wait 10 seconds
        if prev_cpu_result is None:
            with open('/proc/stat', 'r') as stat:
                prev_cpu_result, prev_cpu_cores_result = read_proc_stat(stat, args.all_cpus)
            time.sleep(10)

        # Read /proc/stat file
        with open('/proc/stat', 'r') as stat:
            cpu_result, cpu_cores_result = read_proc_stat(stat, args.all_cpus)
        if cpu_result is None or prev_cpu_result is None:
            raise RuntimeError("no 'cpu' line found in /proc/stat")

        # Create directory that will store the results for the next check
        os.makedirs(os.path.dirname(last_check_file), exist_ok=True)

        # Write results for the next check
        with open(last_check_file, 'w') as stat:
            stat.write('cpu ' + cpu_result + '\n')
            for cpu_core, cpu_core_result in cpu_cores_result.items():
                stat.write(cpu_core + ' ' + cpu_core_result + '\n')

        # For the whole CPU
        cpu_usage = compute_usage(cpu_result, prev_cpu_result)
        total_usage = cpu_usage['total_usage']
        if total_usage > args.warn:
            if total_usage > args.crit:
                result['rc'] = 2
            else:
                result['rc'] = 1
        result['text'].append('CPU_USAGE=' + str(total_usage) + '%')
        result['values']['cpu'] = cpu_usage

        # For each core (only those also present in the previous sample)
        if args.all_cpus:
            for cpu_core, cpu_core_result in cpu_cores_result.items():
                if cpu_core in prev_cpu_cores_result:
                    result['values'][cpu_core] = compute_usage(
                        cpu_core_result, prev_cpu_cores_result[cpu_core]
                    )

        #
        # PERFDATA
        #
        result['perfdata'].extend(
            build_perfdata(result['values'], args.warn, args.crit, args.only_total)
        )

        #
        # OUTPUT AND EXIT
        #
        if result['rc'] == 0:
            print("OK -", result['text'][0], end='')
        elif result['rc'] == 1:
            print("WARNING:", " - ".join(result['text']), end='')
        else:
            print("CRITICAL:", " - ".join(result['text']), end='')
        print(" |", " ".join(result['perfdata']))

        # Propagate the state to the monitoring system through the exit
        # status (SystemExit is not caught by 'except Exception' below)
        sys.exit(result['rc'])

    except Exception:
        print("CRITICAL:", traceback.format_exc())
        print("\n".join(result['text']))
        sys.exit(2)


if __name__ == '__main__':
    main()