conf-10-check-nrpe/conf/common/usr/local/bin/check_cpu

#!/usr/bin/env python3

# Copyright © 2019 Aurélien Grimal - aurelien.grimal@tech-tips.fr

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

#####

# Usage examples :
# 1) check_cpu
# 2) check_cpu --warn=50 --crit=75
# 3) check_cpu --all-cpus

#####

import sys

# Shared accumulator for the whole script: status code (0 OK, 1 warning,
# 2 critical), status text fragments, perfdata strings and computed values.
result = dict(rc=0, text=[], perfdata=[], values={})
# Configuration file read when --config-file is not given.
default_config_file = '/etc/env_check_nrpe'

try:

    import re, argparse, traceback, time, operator, os, pwd

    # Command-line interface. Options are registered in the order in which
    # they should appear in the --help output.
    parser = argparse.ArgumentParser()
    parser.add_argument("--warn", type=int, default=60,
                        help="Threshold percent for warning (default 60)")
    parser.add_argument("--crit", type=int, default=80,
                        help="Threshold percent for critical (default 80)")
    parser.add_argument("--all-cpus", action='store_true',
                        help="Enable perfdata for each core")
    config_file_help = "".join([
        "Configuration file with bash-style variables declared (default file is ",
        default_config_file,
        ")\nCHECK_CPU_WARN=integer ([0-100])\n\nCHECK_CPU_CRIT=integer ([0-100])\n",
        "CHECK_CPU_ALL_CPUS=boolean ([0|1])\nCHECK_CPU_ONLY_TOTAL=boolean ([0|1])",
    ])
    # nargs=1 makes argparse store the path in a one-element list
    parser.add_argument("--config-file", nargs=1, help=config_file_help)
    parser.add_argument("--only-total", action='store_true',
                        help="Display only the total usage for perfdata")

    args = parser.parse_args()

    # Define the config file to use: an explicit --config-file takes
    # precedence over the built-in default path
    config_file = default_config_file if args.config_file is None else args.config_file[0]

    # Check if value is boolean
    true_strings = ['1', 'true', 'True', 'yes', 'y', 'Yes']
    false_strings = ['0', 'false', 'False', 'no', 'n', 'No']
    def check_boolean(string):
        """Convert a textual flag into a bool.

        The value is accepted when it, or its lowercase form, appears in
        true_strings / false_strings, so any casing of the listed words
        works ('TRUE', 'Yes', 'N', ...).

        Raises ValueError, with a single human-readable message, when the
        text is not a recognised boolean spelling.
        """
        lowered = string.lower()
        if string in true_strings or lowered in true_strings:
            return True
        if string in false_strings or lowered in false_strings:
            return False
        # One formatted message (instead of two positional arguments) so that
        # printing the exception does not show a tuple representation.
        raise ValueError('I can\'t make a boolean out of that : ' + repr(string))

    # Apply the settings declared in the configuration file. Each entry maps
    # a 'NAME=' prefix to the option it sets and to the converter applied to
    # the text that remains once the prefix is removed.
    config_keys = (
        ('CHECK_CPU_WARN=', 'warn', float),
        ('CHECK_CPU_CRIT=', 'crit', float),
        ('CHECK_CPU_ALL_CPUS=', 'all_cpus', check_boolean),
        ('CHECK_CPU_ONLY_TOTAL=', 'only_total', check_boolean),
    )
    try:
        with open(config_file, 'r') as config:
            for raw_line in config:
                for prefix, option, convert in config_keys:
                    if raw_line.startswith(prefix):
                        value_text = raw_line.rstrip().replace(prefix, '')
                        setattr(args, option, convert(value_text))
    except IOError:
        # An unreadable default file is silently ignored; an unreadable file
        # that was explicitly requested with --config-file is an error
        if args.config_file is not None:
            print("ERROR: the file '" + config_file + "' does not exist !")
            sys.exit(2)
    except ValueError as e:
        # Raised by float() or check_boolean() on a malformed value
        print("ERROR: reading the file '" + config_file + "',", e)
        sys.exit(2)

    # Check arguments values: both thresholds must lie within [0, 100] and
    # the critical threshold must not be below the warning threshold.
    range_checks = (
        (args.warn, "ERROR: --warn can't be negative", "ERROR: --warn value exceeds 100"),
        (args.crit, "ERROR: --crit can't be negative", "ERROR: --crit value exceeds 100"),
    )
    problems = []
    for threshold, negative_message, too_high_message in range_checks:
        if threshold < 0:
            problems.append(negative_message)
        elif threshold > 100:
            problems.append(too_high_message)
    if args.crit < args.warn:
        problems.append("ERROR: --crit value is less than --warn value")

    # Report every problem found before giving up
    for problem in problems:
        print(problem)
    if problems:
        sys.exit(2)

    # /proc/stat cpu columns :
    # user, nice, system, idle, iowait, irq, softirq, steal, guest, guest_nice
    # Raw strings are required here: in a normal string literal '\s' and '\d'
    # are invalid escape sequences, which recent Python versions warn about.
    # Matches the aggregated 'cpu' line and captures its counters.
    re_cpu = re.compile(r'^cpu\s+((?:\d+\s)+\d+)\s*$')
    # Matches a per-core 'cpuN' line and captures its name and counters.
    re_cpu_core = re.compile(r'^(cpu\d+)\s+((?:\d+\s)+\d+)\s*$')

    # Define where we store the last check data (one directory per user
    # running the check)
    current_user = pwd.getpwuid(os.getuid())[0]
    last_check_file = '/tmp/.monitoring-' + current_user + '/proc_stat'

    def read_proc_stat(stat):
        """Extract the cpu counters from an iterable of /proc/stat-style lines.

        Returns a tuple (total, cores):
        - total is the counters text of the aggregated 'cpu' line, or an
          empty list when no such line was found;
        - cores maps each 'cpuN' name to its counters text. It is only
          filled when args.all_cpus is set.
        """
        total_counters = []
        cores_counters = {}
        for line in stat:
            if len(total_counters) != 0:
                # The 'cpu' line was already seen: only 'cpuN' lines are
                # expected now, anything else ends the scan
                core_match = re_cpu_core.match(line)
                if core_match is None:
                    break
                cores_counters[core_match.group(1)] = core_match.group(2)
                continue
            # Still looking for the aggregated 'cpu' line
            total_match = re_cpu.match(line)
            if total_match is None:
                continue
            total_counters = total_match.group(1)
            if not args.all_cpus:
                # Per-core lines are not needed
                break
        return total_counters, cores_counters

    # Read /proc/stat values registered from previous check
    prev_cpu_result = []
    prev_cpu_cores_result = {}
    previous_check_file_exists = False
    try:
        with open(last_check_file, 'r') as stat:
            prev_cpu_result, prev_cpu_cores_result = read_proc_stat(stat)
        # A file without a 'cpu' line (empty or truncated, for instance) is
        # unusable: handle it like a missing file instead of failing later
        # when the counters are parsed
        previous_check_file_exists = len(prev_cpu_result) != 0
    except IOError:
        pass
    # If no usable previous sample is available, do check on current
    # /proc/stat file and wait 10 seconds
    if not previous_check_file_exists:
        with open('/proc/stat', 'r') as stat:
            prev_cpu_result, prev_cpu_cores_result = read_proc_stat(stat)
        # Sleep once the file is closed; the pause gives the next read of
        # /proc/stat a measurable interval
        time.sleep(10)

    # Read /proc/stat file
    cpu_result = []
    cpu_cores_result = {}
    with open('/proc/stat', 'r') as stat:
        cpu_result, cpu_cores_result = read_proc_stat(stat)

    # Create directory that will store the results for the next check
    # NOTE(review): the directory lives under /tmp with a predictable name;
    # consider a dedicated state directory if untrusted local users exist.
    os.makedirs(os.path.dirname(last_check_file), exist_ok=True)

    # Write results for the next check, using the same line layout as
    # /proc/stat so that read_proc_stat() can parse the file back
    with open(last_check_file, 'w') as last_check:
        last_check.write('cpu ' + cpu_result + '\n')
        for cpu_core, cpu_core_result in cpu_cores_result.items():
            last_check.write(cpu_core + ' ' + cpu_core_result + '\n')

    # Names of the /proc/stat cpu columns, in file order
    cpu_columns = ('user', 'nice', 'system', 'idle', 'iowait', 'irq',
                   'softirq', 'steal', 'guest', 'guest_nice')

    def compute_cpu_usage(current, previous):
        """Return the usage percentages between two cpu counter samples.

        current / previous: space-separated counters text, as returned by
        read_proc_stat().

        Returns a dict whose keys are 'total_usage' followed by every name
        of cpu_columns; values are integer percentages of the elapsed time.
        All values are 0 when no time elapsed between the two samples or
        when the counters went backwards, since no meaningful ratio can be
        computed in that case.
        """
        current_values = [int(value) for value in current.split(' ')]
        previous_values = [int(value) for value in previous.split(' ')]
        diff_values = list(map(operator.sub, current_values, previous_values))
        # Samples with fewer columns than expected: count the missing ones
        # as zero instead of failing on an out-of-range index
        diff_values.extend([0] * (len(cpu_columns) - len(diff_values)))
        # exclude guest and guest_nice because they are accounted in user and nice
        total_time = sum(diff_values[:8])
        usage = {}
        if total_time > 0:
            busy_time = total_time - diff_values[3] - diff_values[4] # minus idle and iowait
            usage['total_usage'] = int((busy_time / total_time) * 100)
            for column, diff_value in zip(cpu_columns, diff_values):
                usage[column] = int((diff_value / total_time) * 100)
        else:
            # Avoid a division by zero (or a meaningless negative ratio)
            usage['total_usage'] = 0
            for column in cpu_columns:
                usage[column] = 0
        return usage

    # For the whole CPU
    result['values']['cpu'] = compute_cpu_usage(cpu_result, prev_cpu_result)
    total_usage = result['values']['cpu']['total_usage']
    if total_usage > args.warn:
        if total_usage > args.crit:
            result['rc'] = 2
        else:
            result['rc'] = 1
    result['text'].append('CPU_USAGE=' + str(total_usage) + '%')

    # For each core (only cores present in both samples can be compared)
    if args.all_cpus:
        for cpu_core in cpu_cores_result:
            if cpu_core in prev_cpu_cores_result:
                result['values'][cpu_core] = compute_cpu_usage(
                    cpu_cores_result[cpu_core],
                    prev_cpu_cores_result[cpu_core]
                )

    #
    # PERFDATA
    #

    # One 'label=value%;warn;crit;min;max' entry per metric; thresholds are
    # only filled in for the total usage of the whole CPU
    for cpu_name, metrics in result['values'].items():
        shown = {'total_usage': metrics['total_usage']} if args.only_total else metrics
        for metric, percent in shown.items():
            if cpu_name == 'cpu' and metric == 'total_usage':
                thresholds = str(args.warn) + ';' + str(args.crit) + ';'
            else:
                thresholds = ';;'
            result['perfdata'].append(
                cpu_name + '_' + metric + '=' + str(percent) + '%;' + thresholds + '0;100'
            )

    #
    # OUTPUT AND EXIT
    #

    # Status line: state label and text, then the perfdata after the pipe
    if result['rc'] == 0:
        print("OK -", result['text'][0], end='')
    elif result['rc'] == 1:
        print("WARNING:", " - ".join(result['text']), end='')
    else:
        print("CRITICAL:", " - ".join(result['text']), end='')

    print(" |", " ".join(result['perfdata']))

    # The monitoring daemon derives the service state from the exit status,
    # not from the printed label: without this the check would always end
    # with status 0. SystemExit is not an Exception subclass, so the
    # surrounding 'except Exception' handler lets it through.
    sys.exit(result['rc'])

except Exception:
    # Last-resort handler: any unexpected error raised in the try block is
    # reported as CRITICAL together with its traceback, followed by the
    # status text collected so far, and the script exits with status 2.
    # sys.exit() calls made inside the try block raise SystemExit, which is
    # not an Exception subclass and is therefore not intercepted here.
    print("CRITICAL:", traceback.format_exc())
    print("\n".join(result['text']))
    sys.exit(2)