def _ras_section_name(line):
    """Build the section key from a non-indented summary line.

    Words are concatenated (space separated) until a stop word is reached:

    * lines containing "No": every word equal to "No" is skipped and the
      name ends at the first word containing "errors";
    * all other lines: the name ends at the word "events" or at the first
      word containing "errors".

    When a stop word is found the trailing space is removed; when none is
    found the accumulated name (including its trailing space) is returned
    unchanged, exactly as the original inline loops did.
    """
    name = ""
    if "No" in line:
        for word in line.split(' '):
            if word == "No":
                continue
            if "errors" in word:
                return name[:-1]
            name += word + " "
        return name

    for word in line.split(' '):
        if word == "events" or "errors" in word:
            return name[:-1]
        name += word + " "
    return name


def convertToDict(RAS_dump):
    """Parse the text produced by ``ras-mc-ctl --summary`` into a dict.

    Args:
        RAS_dump: the complete summary text, lines separated by ``\\n``.

    Returns:
        ``{section: {"total_errors": int, label: {"corrected" | "uncorrected": int}}}``.
        A line without a tab starts a new section; a line containing a tab
        is a detail line that belongs to the most recent section.

    Raises:
        KeyError: a detail line appears before any section line.
        IndexError, ValueError: a detail line matches neither supported layout.
    """
    ras_status = {}
    current_driver = ""

    for line in RAS_dump.split('\n'):
        if line == '':
            continue

        if '\t' not in line:
            current_driver = _ras_section_name(line)
            ras_status[current_driver] = {"total_errors": 0}
            continue

        tokens = line.replace('\t', '').split(' ')
        try:
            # Primary layout: "<count> ... : <label>".
            label = line.split(': ')[1]
            count = int(tokens[0])
            kind = "corrected" if "Corrected" in line else "uncorrected"
        except (IndexError, ValueError):
            # Fallback layout: first token is the label, third token the count.
            label = tokens[0]
            count = int(tokens[2])
            kind = "uncorrected"

        # Mutate only after parsing succeeded so that a failed primary parse
        # cannot leave an empty placeholder entry behind.
        section = ras_status[current_driver]
        section[label] = {kind: count}
        section["total_errors"] += count

    return ras_status
def getAllDeviceAttributes():
    """Collect the attributes of every device returned by getDevices().

    Returns a dict keyed by each device's "name" entry; the value is whatever
    getAttributes() returns for that device entry.
    """
    scan = getDevices()
    return {entry["name"]: getAttributes(entry) for entry in scan["devices"]}
def getFrequency(path="/proc/cpuinfo"):
    """Return the current frequency of every CPU listed in a cpuinfo file.

    The file is read directly instead of spawning ``cat``.

    Args:
        path: cpuinfo-formatted file to parse (defaults to ``/proc/cpuinfo``).

    Returns:
        Dict mapping the running index of each "cpu MHz" line, as a string
        ("0", "1", ...), to the value after the last ":" as a float.
        An empty dict is returned when the file cannot be read.

    Raises:
        ValueError: a "cpu MHz" line does not end in a number.
    """
    freq = {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                if "cpu MHz" in line:
                    # Keys are the running index of "cpu MHz" lines, not the
                    # "processor" number from the file.
                    freq[str(len(freq))] = float(line.split(":")[-1])
    except OSError:
        return {}
    return freq
def _sizeConv(value):
    """Convert a human-readable size string (e.g. "1.5kB", "2GB") to bytes.

    The first number found in *value* is scaled by the unit that directly
    follows it. Units are matched case-insensitively, so Docker's "kB"
    spelling is handled as well as "KB".

    Args:
        value: size text such as "512MB", "1.2GB (50%)" or "0B".

    Returns:
        The size in bytes as a float. A number with no or an unknown unit is
        returned unscaled. Text containing no number at all (e.g. "N/A" or
        an empty string) yields 0.0 instead of raising.
    """
    import re  # local import: this module does not import re at file level

    multipliers = {
        "b": 1,
        "kb": 10**3,
        "mb": 10**6,
        "gb": 10**9,
        "tb": 10**12,
        "pb": 10**15,
        "kib": 2**10,
        "mib": 2**20,
        "gib": 2**30,
        "tib": 2**40,
    }

    # Only the first number is used; gathering every digit in the string
    # would merge unrelated numbers such as the "(50%)" suffix.
    match = re.search(r"(\d+(?:\.\d*)?|\.\d+)\s*([A-Za-z]*)", str(value))
    if match is None:
        return 0.0

    number = float(match.group(1))
    return number * multipliers.get(match.group(2).lower(), 1)
_sizeConv(image["Size"]) + docker["images"][image["ID"]]["unique_size"] = image["UniqueSize"] + + for container in info["Containers"]: + docker["containers"][container["ID"]] = {} + docker["containers"][container["ID"]]["created"] = container["CreatedAt"] + docker["containers"][container["ID"]]["image"] = container["Image"] + docker["containers"][container["ID"]]["volumes"] = container["LocalVolumes"] + docker["containers"][container["ID"]]["mounts"] = container["Mounts"] + try: + docker["containers"][container["ID"]]["name"] = container["Names"] + except: + docker["containers"][container["ID"]]["name"] = "" + try: + docker["containers"][container["ID"]]["networks"] = container["Networks"] + except: + docker["containers"][container["ID"]]["networks"] = "" + docker["containers"][container["ID"]]["runtime"] = container["RunningFor"] + docker["containers"][container["ID"]]["state"] = container["State"] + docker["containers"][container["ID"]]["size"] = _sizeConv(container["Size"]) + docker["containers"][container["ID"]]["status"] = container["Status"] + + for volume in info["Volumes"]: + docker["volumes"][volume["Name"]] = {} + docker["volumes"][volume["Name"]]["driver"] = volume["Driver"] + docker["volumes"][volume["Name"]]["links"] = volume["Links"] + docker["volumes"][volume["Name"]]["mountpoint"] = volume["Mountpoint"] + docker["volumes"][volume["Name"]]["size"] = _sizeConv(volume["Size"]) + + for build in info["BuildCache"]: + docker["buildcache"][build["ID"]] = {} + docker["buildcache"][build["ID"]]["type"] = build["CacheType"] + docker["buildcache"][build["ID"]]["created"] = build["CreatedSince"] + docker["buildcache"][build["ID"]]["in_use"] = build["InUse"] + docker["buildcache"][build["ID"]]["last_use"] = build["LastUsedSince"] + docker["buildcache"][build["ID"]]["shared"] = build["Shared"] + docker["buildcache"][build["ID"]]["size"] = _sizeConv(build["Size"]) + docker["buildcache"][build["ID"]]["use_count"] = build["UsageCount"] + + return docker + +if 
def _run_rocm_smi():
    """Run ``rocm-smi -a --json`` and return its output parsed from JSON.

    ``rocm-smi`` is first looked up on PATH. If it cannot be launched, the
    binary at ``/opt/rocm/bin/rocm-smi`` is tried instead.

    Raises:
        OSError: neither executable could be launched.
        ValueError: the output is not valid UTF-8 or not valid JSON.
    """
    args = ['-a', '--json']
    try:
        proc = subprocess.run(['rocm-smi'] + args, stdout=subprocess.PIPE)
    except OSError:
        # Only a failure to launch triggers the fallback; other exceptions
        # (including KeyboardInterrupt) are no longer swallowed.
        proc = subprocess.run(['/opt/rocm/bin/rocm-smi'] + args, stdout=subprocess.PIPE)

    return json.loads(proc.stdout.decode('utf-8'))
def _getAmdGpuMemSize(drm_root="/sys/class/drm/"):
    """Read the total VRAM size of every card found under *drm_root*.

    Only entries named exactly ``card<N>`` are inspected, so connector
    entries such as ``card0-DP-1`` are ignored and multi-digit card numbers
    are not truncated to their first digit.

    Args:
        drm_root: directory containing the ``card<N>`` entries.

    Returns:
        Dict mapping the stripped content of ``<card>/device/device`` to the
        stripped content of ``<card>/device/mem_info_vram_total`` (both kept
        as strings). Cards for which either file cannot be read are skipped.

    Raises:
        OSError: *drm_root* itself cannot be listed.
    """
    cards = {}

    for entry in os.listdir(drm_root):
        if re.fullmatch(r"card[0-9]+", entry) is None:
            continue
        device_dir = os.path.join(drm_root, entry, "device")
        try:
            with open(os.path.join(device_dir, "mem_info_vram_total"), "r") as f:
                mem = f.read().strip()
            with open(os.path.join(device_dir, "device"), "r") as f:
                device = f.read().strip()
        except OSError:
            # Card does not expose these files; skip it.
            continue
        cards[device] = mem

    return cards
data["gpu"][uuid]["util"] = smi["nvidia_smi_log"]["gpu"]["utilization"] + data["gpu"][uuid]["temp"] = float(smi["nvidia_smi_log"]["gpu"]["temperature"]["gpu_temp"].split(" ")[0]) + data["gpu"][uuid]["power"] = float(smi["nvidia_smi_log"]["gpu"]["gpu_power_readings"]["instant_power_draw"].split(" ")[0]) + data["gpu"][uuid]["power_limit"] = float(smi["nvidia_smi_log"]["gpu"]["gpu_power_readings"]["current_power_limit"].split(" ")[0]) + data["gpu"][uuid]["gpu_clock"] = float(smi["nvidia_smi_log"]["gpu"]["clocks"]["graphics_clock"].split(" ")[0]) + data["gpu"][uuid]["mem_clock"] = float(smi["nvidia_smi_log"]["gpu"]["clocks"]["mem_clock"].split(" ")[0]) + data["gpu"][uuid]["sm_clock"] = float(smi["nvidia_smi_log"]["gpu"]["clocks"]["sm_clock"].split(" ")[0]) + data["gpu"][uuid]["video_clock"] = float(smi["nvidia_smi_log"]["gpu"]["clocks"]["video_clock"].split(" ")[0]) + data["gpu"][uuid]["gpu_max_clock"] = float(smi["nvidia_smi_log"]["gpu"]["max_clocks"]["graphics_clock"].split(" ")[0]) + data["gpu"][uuid]["mem_max_clock"] = float(smi["nvidia_smi_log"]["gpu"]["max_clocks"]["mem_clock"].split(" ")[0]) + + data["gpu"][uuid]["fan_speed"] = float(data["gpu"][uuid]["fan_speed"].split(" ")[0]) + for i in data["gpu"][uuid]["util"]: + data["gpu"][uuid]["util"][i] = float(data["gpu"][uuid]["util"][i].split(" ")[0]) + + elif vendor == "AMD": + smi = _run_rocm_smi() + memsize = _getAmdGpuMemSize() + + for card in smi: + if card == "system": + data["about"]["driver_version"] = smi["system"]["Driver version"] + else: + data["gpu"][smi[card]["Device ID"]] = {} + try: + data["gpu"][smi[card]["Device ID"]]["fan_speed"] = 100 * float(smi[card]["current_fan_speed (rpm)"]) / _getAmdGpuMaxFanspeed() + except: + data["gpu"][smi[card]["Device ID"]]["fan_speed"] = 0.0 + data["gpu"][smi[card]["Device ID"]]["vbios_version"] = smi[card]["VBIOS version"] + data["gpu"][smi[card]["Device ID"]]["product_name"] = smi[card]["Device Name"] + data["gpu"][smi[card]["Device ID"]]["uuid"] = 
smi[card]["Unique ID"] + data["gpu"][smi[card]["Device ID"]]["memory_total"] = int(memsize[smi[card]["Device ID"]]) + data["gpu"][smi[card]["Device ID"]]["memory_used"] = (int(memsize[smi[card]["Device ID"]]) / 100) * float(smi[card]["GPU Memory Allocated (VRAM%)"]) + data["gpu"][smi[card]["Device ID"]]["temp"] = float(smi[card]["Temperature (Sensor edge) (C)"]) + try: + data["gpu"][smi[card]["Device ID"]]["power"] = float(smi[card]["Average Graphics Package Power (W)"]) + except: + try: + data["gpu"][smi[card]["Device ID"]]["power"] = float(smi[card]["Current Socket Graphics Package Power (W)"]) + except: + data["gpu"][smi[card]["Device ID"]]["power"] = 0.0 + try: + data["gpu"][smi[card]["Device ID"]]["power_limit"] = float(smi[card]["Max Graphics Package Power (W)"]) + except: + data["gpu"][smi[card]["Device ID"]]["power_limit"] = 65.0 + data["gpu"][smi[card]["Device ID"]]["gpu_clock"] = float(smi[card]["sclk clock speed:"].replace("(","").replace("Mhz)","")) * 10**6 + data["gpu"][smi[card]["Device ID"]]["mem_clock"] = float(smi[card]["mclk clock speed:"].replace("(","").replace("Mhz)","")) * 10**6 + try: + data["gpu"][smi[card]["Device ID"]]["sm_clock"] = float(smi[card]["current_dclk0 (MHz)"]) * 10**6 + data["gpu"][smi[card]["Device ID"]]["video_clock"] = float(smi[card]["current_vclk0 (MHz)"]) * 10**6 + except: + data["gpu"][smi[card]["Device ID"]]["sm_clock"] = 0.0 + data["gpu"][smi[card]["Device ID"]]["video_clock"] = 0.0 + data["gpu"][smi[card]["Device ID"]]["gpu_max_clock"] = float(smi[card]["Valid sclk range"].replace("Mhz","").split(" - ")[1]) * 10**6 + try: + data["gpu"][smi[card]["Device ID"]]["mem_max_clock"] = float(smi[card]["Valid mclk range"].replace("Mhz","").split(" - ")[1]) * 10**6 + except: + data["gpu"][smi[card]["Device ID"]]["mem_max_clock"] = float(smi[card]["mclk clock speed:"].replace("(","").replace("Mhz)","")) * 10**6 + + util = {} + util["gpu_util"] = float(smi[card]["GPU use (%)"]) + util["memory_util"] = float(smi[card]["GPU Memory 
Allocated (VRAM%)"]) + + data["gpu"][smi[card]["Device ID"]]["throttle"] = {} + try: + data["gpu"][smi[card]["Device ID"]]["throttle"]["status"] = float(smi[card]["throttle_status"]) + except: + data["gpu"][smi[card]["Device ID"]]["throttle"]["status"] = 1 + + data["gpu"][smi[card]["Device ID"]]["util"] = util + data["gpu"][smi[card]["Device ID"]]["performance_state"] = "N/A" + + elif vendor == "Intel": + try: + intel = _intel() + except: + intel = {'uuid': 'Error', 'max_freq': 0.0, 'min_freq': 0.0, 'cur_freq': 0.0, 'power': 0.0, 'engine_3d': 0.0, 'engine_video': 0.0, 'usage': 0.0, 'model': 'Error', 'driver': 'Error'} + + uuid = intel["uuid"] + data["gpu"][uuid] = {} + data["gpu"][uuid]["throttle"] = {} + data["gpu"][uuid]["util"] = {} + + data["about"]["driver_version"] = intel["driver"] + data["gpu"][uuid]["throttle"]["status"] = 0 + util = {} + util["gpu_util"] = intel["usage"] + util["memory_util"] = 0 + data["gpu"][uuid]["util"] = util + data["gpu"][uuid]["fan_speed"] = 0 + data["gpu"][uuid]["vbios_version"] = "0.0.0" + data["gpu"][uuid]["product_name"] = intel["model"] + data["gpu"][uuid]["uuid"] = uuid + data["gpu"][uuid]["memory_total"] = 1 + data["gpu"][uuid]["memory_used"] = 0 + data["gpu"][uuid]["temp"] = 0 + data["gpu"][uuid]["power"] = intel["power"] + data["gpu"][uuid]["power_limit"] = 15 + data["gpu"][uuid]["gpu_clock"] = intel["cur_freq"] + data["gpu"][uuid]["mem_clock"] = 0 + data["gpu"][uuid]["sm_clock"] = 0 + data["gpu"][uuid]["video_clock"] = 0 + data["gpu"][uuid]["mem_max_clock"] = 1 + data["gpu"][uuid]["gpu_max_clock"] = intel["max_freq"] + data["gpu"][uuid]["performance_state"] = "N/A" + + else: + uuid = "unsupported" + data["gpu"][uuid] = {} + data["gpu"][uuid]["throttle"] = {} + data["gpu"][uuid]["util"] = {} + + data["about"]["driver_version"] = "Unknown" + data["gpu"][uuid]["throttle"]["status"] = 0 + util = {} + util["gpu_util"] = 0 + util["memory_util"] = 0 + data["gpu"][uuid]["util"] = util + data["gpu"][uuid]["fan_speed"] = 0 + 
class IntelPower:
    """Estimate CPU package and core power from the RAPL energy MSRs.

    Energy counters are read through ``/dev/cpu/<n>/msr`` for every CPU that
    has a ``topology/physical_package_id`` entry in sysfs. Power is derived
    from the counter difference between two consecutive calls to
    :meth:`measure_nonblocking`.
    """

    # MSR addresses used as file offsets into /dev/cpu/<n>/msr.
    RAPL_MSR_POWER_UNIT = 0x606
    RAPL_MSR_ENERGY = 0x611
    RAPL_MSR_PP0_ENERGY = 0x639

    # The energy status counters are 32 bits wide and wrap around
    # (see the Intel SDM RAPL chapter), so differences are taken modulo 2**32.
    _ENERGY_COUNTER_MASK = 0xFFFFFFFF

    def __init__(self):
        """Read the energy unit and enumerate CPUs; no measurement is taken yet."""
        self._energy_unit = self._get_power_units()
        self._package_topology = self._detect_physical_package_topology()
        self._cores = sorted(self._package_topology.keys())
        # State of the previous measurement; -1 marks "no measurement yet".
        self.timestamp = 0
        self.package_energy = -1
        self.core_energy = -1

    def _read_msr(self, cpu_id, offset):
        """Return the 64-bit value of MSR *offset* on CPU *cpu_id*.

        Raises:
            PermissionError: the msr device cannot be opened by this user.
            FileNotFoundError: the msr device does not exist.
        """
        msr_file = "/dev/cpu/{}/msr".format(cpu_id)
        try:
            # Unbuffered, so exactly the 8 requested bytes are read from the
            # device instead of a whole buffer's worth.
            with open(msr_file, "rb", buffering=0) as f:
                f.seek(offset)
                return self._decode_int64(f.read(8))
        except PermissionError as err:
            raise PermissionError("root privilege is required to read model-specific registers") from err
        except FileNotFoundError as err:
            raise FileNotFoundError("msr driver is not loaded, try \"sudo modprobe msr\" to load msr module") from err

    @staticmethod
    def _decode_int64(buffer):
        """Decode 8 little-endian bytes as an unsigned 64-bit integer."""
        return unpack("<Q", buffer)[0]

    @staticmethod
    def _detect_physical_package_topology():
        """Map cpu id -> physical package id from sysfs.

        CPUs are probed as cpu0, cpu1, ... and enumeration stops at the first
        id whose ``physical_package_id`` file does not exist.
        """
        cpu_package_mapping = {}
        for cpu_id in count():
            filename = "/sys/devices/system/cpu/cpu{}/topology/physical_package_id".format(cpu_id)
            if not os.path.isfile(filename):
                return cpu_package_mapping
            with open(filename, "r") as f:
                package_id = int(f.read())
            logger.debug("detected cpu {} in socket {}".format(cpu_id, package_id))
            cpu_package_mapping[cpu_id] = package_id

    def _get_power_units(self):
        """Return the energy unit (joules per counter tick) read from CPU 0.

        The exponent is taken from bits 12:8 of the power-unit MSR and the
        unit is ``0.5 ** exponent``.
        """
        raw_msr = self._read_msr(0, self.RAPL_MSR_POWER_UNIT)
        raw_unit = (raw_msr >> 8) & 0x1F
        logger.debug("CPU energy unit is 1/2^{}".format(raw_unit))
        return 0.5 ** raw_unit

    def _read_package_energy(self, cpu_id):
        """Return the raw package energy counter as seen from *cpu_id*."""
        energy = self._read_msr(cpu_id, self.RAPL_MSR_ENERGY)
        logger.debug("CPU {} current package energy {} J".format(
            cpu_id, (energy & self._ENERGY_COUNTER_MASK) * self._energy_unit))
        return energy

    def _read_core_energy(self, cpu_id):
        """Return the raw PP0 (core) energy counter as seen from *cpu_id*."""
        energy = self._read_msr(cpu_id, self.RAPL_MSR_PP0_ENERGY)
        logger.debug("CPU {} current core energy {} J".format(
            cpu_id, (energy & self._ENERGY_COUNTER_MASK) * self._energy_unit))
        return energy

    def _calc_power_wtime(self, before, after, duration):
        """Convert two raw counter readings taken *duration* seconds apart to watts.

        The difference is taken modulo 2**32 so a counter wrap-around between
        the two readings does not produce a negative value. A non-positive
        *duration* yields 0.0 instead of dividing by zero.
        """
        if duration <= 0:
            return 0.0
        delta = (after - before) & self._ENERGY_COUNTER_MASK
        return delta * self._energy_unit / duration

    def measure_nonblocking(self):
        """Return ``(package_power, core_power)`` averaged over all CPUs, in watts.

        The values cover the interval since the previous call; the first call
        only records a baseline and returns zeros. Zeros are also returned
        when no CPU was detected.
        """
        timestamp = datetime.now()
        package_energy = {c: self._read_package_energy(c) for c in self._cores}
        core_energy = {c: self._read_core_energy(c) for c in self._cores}

        if self.package_energy != -1:
            time_delta = (timestamp - self.timestamp).total_seconds()
            package_power = {
                c: self._calc_power_wtime(self.package_energy[c], package_energy[c], time_delta)
                for c in self._cores
            }
            core_power = {
                c: self._calc_power_wtime(self.core_energy[c], core_energy[c], time_delta)
                for c in self._cores
            }
        else:
            package_power = dict.fromkeys(self._cores, 0)
            core_power = dict.fromkeys(self._cores, 0)

        self.timestamp = timestamp
        self.package_energy = package_energy
        self.core_energy = core_energy

        core_count = len(self._cores)
        if core_count == 0:
            return 0.0, 0.0

        avg_pp = sum(package_power.values()) / core_count
        avg_cp = sum(core_power.values()) / core_count
        return avg_pp, avg_cp

    def read(self):
        """Return the raw power-unit MSR value read from CPU 1."""
        return self._read_msr(1, self.RAPL_MSR_POWER_UNIT)
+ try: + if data[1] != "na": + if data[1] != "0x0": +# print(data) + if fans(data) != None: + fandata[str(fanindex)] = fans(data) + fanindex += 1 + if voltage(data) != None: + voltdata[str(voltindex)] = voltage(data) + voltindex += 1 + if temp(data) != None: + tempdata[str(tempindex)] = temp(data) + tempindex += 1 + + except: # Exception as e: print(e) + pass + retdata["fans"] = fandata + retdata["volt"] = voltdata + retdata["temp"] = tempdata + return retdata + +if __name__ == "__main__": + print(readSensors()) diff --git a/main.py b/main.py new file mode 100644 index 0000000..b84c34e --- /dev/null +++ b/main.py @@ -0,0 +1,690 @@ +from time import sleep +import datetime +import sys +import json +import os + +import promMon +import mon_pkg_update + +#===INIT======================================================================== +filedir = os.path.dirname(os.path.abspath(sys.argv[0])) + +vols_to_scan = {} +vols_to_scan["zfs"] = [] +vols_to_scan["non_zfs"] = [] +nics_to_scan = [] +SMART_data_update_period_seconds = 60 +last_SMART_read_timestamp = 0 +packages_data_update_period_seconds = 3600 +last_packages_read_timestamp = 0 +docker_data_update_period_seconds = 3600 +last_docker_read_timestamp = 0 +update_scan_period_seconds = 300 +last_update_scan_timestamp = 0 +update_from_git_tags = True +gpu_vendor = "nVidia" + +# load config.json file +try: + with open(filedir+"/config.json", "r") as f: + config = json.loads(f.read()) + for module in config["modules"]: + if "ryzenPower" in module: + import ryzenPower + if "kvmSensors" in module: + import kvmSensors as ks + if "sysinfo" in module: + import sysinfo as si + if "docker" in module: + import docker as do + if "SMART" in module: + import SMART as sm + if "sensors" in module: + import sensors as lmsn + if "intelPower" in module: + import intelPower + if "procMon" in module: + import procMon as pr + if "RAS" in module: + import RAS + if "cpuinfo" in module: + import cpuinfo as cinfo + if "gpu" in module: + import gpu + 
if "packages" in module: + import packages + try: + if len(config["volumes"]["zfs"]) != 0: + for i in config["volumes"]["zfs"]: + vols_to_scan["zfs"].append(i) + if len(config["volumes"]["non_zfs"]) != 0: + for i in config["volumes"]["non_zfs"]: + vols_to_scan["non_zfs"].append(i) + except: + print("No volumes object detected") + try: + if len(config["network"]["nics"]) != 0: + for nic in config["network"]["nics"]: + nics_to_scan.append(nic) + except: + print("No network object detected") + try: + gpu_vendor = config["gpu"] + except: + pass +except: + print("No config file found, exitting") + exit(255) + + +proc_last = 0 + +if "promMon" not in sys.modules: + print("Prometheus exporter helper not inported, exitting") + exit(1) + +pm = promMon.prometheus(port=9339, name="hw-monitor") + +pm.add_monitor("self_loop_time", "us") +pm.add_monitor("version", "", tags=("version",)) + +if "docker" in sys.modules: + pm.add_monitor("docker_version", "") + pm.add_monitor("docker_overall_info", "", tags=("module","type")) + pm.add_monitor("docker_overall_size", "B", tags=("module","type")) + pm.add_monitor("docker_images", "", tags=("id","containers","created","repository","unique_size")) + pm.add_monitor("docker_containers", "", tags=("id","created","image","volumes","mounts","name","networks","runtime","state","status")) + pm.add_monitor("docker_volumes", "", tags=("name","driver","links","mountpoint")) + pm.add_monitor("docker_build_cache", "", tags=("id","type","created","in_use","last_use","shared","use_count")) + +if "ryzenPower" in sys.modules: + pm.add_monitor("package_power", "W") + pm.add_monitor("core_total", "W") + pm.add_monitor("core", "W") + rp = ryzenPower.RyzenPower() + +if "intelPower" in sys.modules: + ip = intelPower.IntelPower() + +if "kvmSensors" in sys.modules: + pm.add_monitor("fan_rpm", "RPM") + pm.add_monitor("fan_ok", "") + pm.add_monitor("temp_celsius", "C") + pm.add_monitor("temp_ok", "") + pm.add_monitor("voltage", "V") + pm.add_monitor("voltage_ok", 
"") + +if "cpuinfo" in sys.modules: + pm.add_monitor("cpu_info", "", tags=("vendor","model","cpus")) + +if "sysinfo" in sys.modules: + pm.add_monitor("disk_read", "B/s") + pm.add_monitor("disk_write", "B/s") + pm.add_monitor("disk_io_read", "iops") + pm.add_monitor("disk_io_write", "iops") + pm.add_monitor("disk_io_read_time", "ms") + pm.add_monitor("disk_io_write_time", "ms") + pm.add_monitor("disk_io_read_merged", "") + pm.add_monitor("disk_io_write_merged", "") + pm.add_monitor("disk_busy", "ms") + pm.add_monitor("cpu_count", "") + pm.add_monitor("cpu_frequency", "Hz") + pm.add_monitor("cpu_usage", "%") + pm.add_monitor("uptime", "s") + pm.add_monitor("system_info", "", tags=("hostname","kernel","board")) + pm.add_monitor("ip_addrs", "", tags=("interface","ip",)) + pm.add_monitor("user_sessions", "") + pm.add_monitor("users", "", tags=("user","from")) + +if "SMART" in sys.modules: + pm.add_monitor("smart_attributes", "", tags=("serial","device","attribute","id","value","thres","worst","raw")) + +if "gpu" in sys.modules: + pm.add_monitor("gpu_info", "", tags=("name","vbios","driver","pstate","uuid")) + pm.add_monitor("gpu_util", "%", tags=("name","uuid","stat")) + pm.add_monitor("gpu_throttle", "", tags=("name","uuid","stat")) + pm.add_monitor("gpu_memory_used", "MB", tags=("name","uuid")) + pm.add_monitor("gpu_memory_total", "MB", tags=("name","uuid")) + pm.add_monitor("gpu_power", "W", tags=("name","uuid")) + pm.add_monitor("gpu_power_limit", "W", tags=("name","uuid")) + pm.add_monitor("gpu_temp", "C", tags=("name","uuid")) + pm.add_monitor("gpu_fan_speed", "%", tags=("name","uuid")) + pm.add_monitor("gpu_gpu_clock", "Hz", tags=("name","uuid")) + pm.add_monitor("gpu_mem_clock", "Hz", tags=("name","uuid")) + pm.add_monitor("gpu_sm_clock", "Hz", tags=("name","uuid")) + pm.add_monitor("gpu_video_clock", "Hz", tags=("name","uuid")) + pm.add_monitor("gpu_gpu_clock_max", "Hz", tags=("name","uuid")) + pm.add_monitor("gpu_mem_clock_max", "Hz", tags=("name","uuid")) + 
def self_monitoring(name, start):
    """Export how long the task *name* took and return a fresh timestamp.

    The elapsed time between *start* and now is published, in whole
    microseconds, on the ``self_loop_time`` monitor under the label *name*.

    Args:
        name: Label identifying the measured task.
        start: ``datetime.datetime`` taken when the task began.

    Returns:
        A new ``datetime.datetime.now()`` value, sampled after the export, for
        the caller to use as the start of the next measurement.
    """
    elapsed = datetime.datetime.now() - start
    # Floor-divide by one microsecond so the whole interval is counted.
    # Combining only .seconds and .microseconds drops .days, which truncates
    # intervals over a day and turns a negative interval (wall clock stepped
    # backwards, normalised to days=-1) into a bogus ~86,400 s reading.
    elapsed_us = elapsed // datetime.timedelta(microseconds=1)
    pm.monitor("self_loop_time", (name,), elapsed_us)

    return datetime.datetime.now()
self_monitoring("systeminfo", loop_start if mon_time == 0 else mon_time) + users = si.users() + mon_time = self_monitoring("users", loop_start if mon_time == 0 else mon_time) + + if "SMART" in sys.modules: + if last_SMART_read_timestamp == 0: + smart = sm.getAllDeviceAttributes() + last_SMART_read_timestamp = datetime.datetime.now() + else: + timedelta = (datetime.datetime.now() - last_SMART_read_timestamp).total_seconds() + if timedelta >= SMART_data_update_period_seconds: + smart = sm.getAllDeviceAttributes() + last_SMART_read_timestamp = datetime.datetime.now() + mon_time = self_monitoring("smart", loop_start if mon_time == 0 else mon_time) + + if "docker" in sys.modules: + containers = do.health() + try: + if last_docker_read_timestamp == 0: + docker_info = do.getInfo() + docker_size = do.getSize() + last_docker_read_timestamp = datetime.datetime.now() + else: + timedelta = (datetime.datetime.now() - last_docker_read_timestamp).total_seconds() + if timedelta >= docker_data_update_period_seconds: + docker_info = do.getInfo() + docker_size = do.getSize() + last_docker_read_timestamp = datetime.datetime.now() + except Exception as e: + print(e) + mon_time = self_monitoring("containers", loop_start if mon_time == 0 else mon_time) + + if "sensors" in sys.modules: + temperatures = lmsn.getTemps() + mon_time = self_monitoring("temperatures", loop_start if mon_time == 0 else mon_time) + fans = lmsn.getFans() + mon_time = self_monitoring("fans", loop_start if mon_time == 0 else mon_time) + voltages = lmsn.getVoltages() + mon_time = self_monitoring("voltage", loop_start if mon_time == 0 else mon_time) + + if "intelPower" in sys.modules: + package, cores = ip.measure_nonblocking() + mon_time = self_monitoring("cpu_power", loop_start if mon_time == 0 else mon_time) + + if "RAS" in sys.modules: + ras = RAS.readRAS() + mon_time = self_monitoring("ras", loop_start if mon_time == 0 else mon_time) + + if "procMon" in sys.modules: + proc = pr.exportProcesses(proc_last) + 
proc_last = proc + mon_time = self_monitoring("proc", loop_start if mon_time == 0 else mon_time) + + if "cpuinfo" in sys.modules: + cpuinfo = cinfo.getCpuInfo() + mon_time = self_monitoring("cpuinfo", loop_start if mon_time == 0 else mon_time) + + if "gpu" in sys.modules: + gpuinfo = gpu.readGpu(gpu_vendor) + mon_time = self_monitoring("gpu", loop_start if mon_time == 0 else mon_time) + + if "packages" in sys.modules: + if last_packages_read_timestamp == 0: + up = packages.getPackages() + last_packages_read_timestamp = datetime.datetime.now() + else: + timedelta = (datetime.datetime.now() - last_packages_read_timestamp).total_seconds() + if timedelta >= packages_data_update_period_seconds: + up = packages.getPackages() + last_packages_read_timestamp = datetime.datetime.now() + mon_time = self_monitoring("packages", loop_start if mon_time == 0 else mon_time) + + if last_update_scan_timestamp == 0: + version = mon_pkg_update.getCurrentTag() + if update_from_git_tags: + mon_pkg_update.update() + last_update_scan_timestamp = datetime.datetime.now() + else: + timedelta = (datetime.datetime.now() - last_update_scan_timestamp).total_seconds() + if timedelta >= update_scan_period_seconds: + version = mon_pkg_update.getCurrentTag() + if update_from_git_tags: + mon_pkg_update.update() + last_update_scan_timestamp = datetime.datetime.now() + mon_time = self_monitoring("version_check", loop_start if mon_time == 0 else mon_time) + +#===MOVE VARS TO PROMETHEUS EXPORTER============================================ + + pm.monitor("version", (version,), 1) + + if "sysinfo" in sys.modules: + pm.monitor("uptime", ("sensors",), uptime) + pm.monitor("cpu_count", ("sensors",), cpu["cpu_count"]) + pm.monitor("cpu_usage", ("sensors",), cpu["usage"]) + for disk in disks: + pm.monitor("disk_read", (disk,), disks[disk]["read"]) + pm.monitor("disk_write", (disk,), disks[disk]["write"]) + pm.monitor("disk_io_read", (disk,), disks[disk]["io_read"]) + pm.monitor("disk_io_write", (disk,), 
disks[disk]["io_write"]) + pm.monitor("disk_io_read_time", (disk,), disks[disk]["io_read_time"]) + pm.monitor("disk_io_write_time", (disk,), disks[disk]["io_write_time"]) + pm.monitor("disk_io_read_merged", (disk,), disks[disk]["io_read_merged"]) + pm.monitor("disk_io_write_merged", (disk,), disks[disk]["io_write_merged"]) + pm.monitor("disk_busy", (disk,), disks[disk]["busy"]) + + for core in cpu["frequency"]: + pm.monitor("cpu_frequency", (str(core),), cpu["frequency"][core]) + + for t in cpu["time_percent"]: + try: + pm.monitor("cpu_time_percent", (t,), cpu["time_percent"][t]) + except: + pm.add_monitor("cpu_time_percent", "%") + pm.monitor("cpu_time_percent", (t,), cpu["time_percent"][t]) + + for i in memory: + try: + pm.monitor("memory_"+i, ("memory",), memory[i]) + except: + pm.add_monitor("memory_"+i, "%") + pm.monitor("memory_"+i, ("memory",), memory[i]) + + if zfs != 0: + for pool in zfs: + if len(vols_to_scan["zfs"]) != 0: + if pool not in vols_to_scan["zfs"]: + continue + try: + pm.monitor("zfs_state", (pool,), zfs[pool]["state"]) + pm.monitor("zfs_size", (pool,), zfs[pool]["size"]) + pm.monitor("zfs_used", (pool,), zfs[pool]["used"]) + pm.monitor("zfs_free", (pool,), zfs[pool]["free"]) + pm.monitor("zfs_fragmentation", (pool,), zfs[pool]["fragmentation"]) + pm.monitor("zfs_dedup", (pool,), zfs[pool]["dedup"]) + except: + pm.add_monitor("zfs_state", "") + pm.add_monitor("zfs_size", "B") + pm.add_monitor("zfs_used", "B") + pm.add_monitor("zfs_free", "B") + pm.add_monitor("zfs_fragmentation", "%") + pm.add_monitor("zfs_dedup", "") + pm.monitor("zfs_state", (pool,), zfs[pool]["state"]) + pm.monitor("zfs_size", (pool,), zfs[pool]["size"]) + pm.monitor("zfs_used", (pool,), zfs[pool]["used"]) + pm.monitor("zfs_free", (pool,), zfs[pool]["free"]) + pm.monitor("zfs_fragmentation", (pool,), zfs[pool]["fragmentation"]) + pm.monitor("zfs_dedup", (pool,), zfs[pool]["dedup"]) + + for nic in network: + if len(nics_to_scan) != 0: + if nic not in nics_to_scan: + continue 
+ try: + pm.monitor("network_rx", (nic,), network[nic]["rx"]) + pm.monitor("network_tx", (nic,), network[nic]["tx"]) + pm.monitor("network_err_rx", (nic,), network[nic]["err_rx"]) + pm.monitor("network_err_tx", (nic,), network[nic]["err_tx"]) + pm.monitor("network_drop_rx", (nic,), network[nic]["drop_rx"]) + pm.monitor("network_drop_tx", (nic,), network[nic]["drop_tx"]) + pm.monitor("network_packet_rx", (nic,), network[nic]["packet_rx"]) + pm.monitor("network_packet_tx", (nic,), network[nic]["packet_tx"]) + except: + pm.add_monitor("network_rx", "B") + pm.add_monitor("network_tx", "B") + pm.add_monitor("network_err_rx", "") + pm.add_monitor("network_err_tx", "") + pm.add_monitor("network_drop_rx", "") + pm.add_monitor("network_drop_tx", "") + pm.add_monitor("network_packet_rx", "") + pm.add_monitor("network_packet_tx", "") + pm.monitor("network_rx", (nic,), network[nic]["rx"]) + pm.monitor("network_tx", (nic,), network[nic]["tx"]) + pm.monitor("network_err_rx", (nic,), network[nic]["err_rx"]) + pm.monitor("network_err_tx", (nic,), network[nic]["err_tx"]) + pm.monitor("network_drop_rx", (nic,), network[nic]["drop_rx"]) + pm.monitor("network_drop_tx", (nic,), network[nic]["drop_tx"]) + pm.monitor("network_packet_rx", (nic,), network[nic]["packet_rx"]) + pm.monitor("network_packet_tx", (nic,), network[nic]["packet_tx"]) + + if partitions != 0: + for part in partitions: + if len(vols_to_scan["non_zfs"]) != 0: + if part not in vols_to_scan["non_zfs"]: + continue + try: + pm.monitor("partition_size", (part,), partitions[part]["size"]) + pm.monitor("partition_used", (part,), partitions[part]["used"]) + pm.monitor("partition_free", (part,), partitions[part]["free"]) + except: + pm.add_monitor("partition_size", "B") + pm.add_monitor("partition_used", "B") + pm.add_monitor("partition_free", "B") + pm.monitor("partition_size", (part,), partitions[part]["size"]) + pm.monitor("partition_used", (part,), partitions[part]["used"]) + pm.monitor("partition_free", (part,), 
partitions[part]["free"]) + + pm.monitor("system_info", (systeminfo["hostname"],systeminfo["kernel"],systeminfo["board"]), 1) + + pm.delete_monitor("user_sessions") + pm.delete_monitor("users") + + try: + for user in users: + pm.monitor("user_sessions", (user,), users[user]["sessions"]) + for session in users[user]["session"]: + pm.monitor("users", (user,users[user]["session"][session]["from"]), 1) + except: + pass + + pm.delete_monitor("ip_addrs") + for interface in ipaddrs: + pm.monitor("ip_addrs", (interface,ipaddrs[interface]), 1) + + if "ryzenPower" in sys.modules: + pm.monitor("package_power", ("sensors",), package) + core_total = 0 + for core in cores: + pm.monitor("core", (str(int(core/2)),), cores[core]) + core_total += cores[core] + pm.monitor("core_total", ("sensors",), core_total) + + if "kvmSensors" in sys.modules: + for fan in kvm["fans"]: + pm.monitor("fan_rpm", (kvm["fans"][fan]["id"],), kvm["fans"][fan]["rpm"]) + if kvm["fans"][fan]["status"] == "ok": + pm.monitor("fan_ok", (kvm["fans"][fan]["id"],), 1) + else: + pm.monitor("fan_ok", (kvm["fans"][fan]["id"],), 0) + + for temp in kvm["temp"]: + pm.monitor("temp_celsius", (kvm["temp"][temp]["id"],), kvm["temp"][temp]["temp"]) + if kvm["temp"][temp]["status"] == "ok": + pm.monitor("temp_ok", (kvm["temp"][temp]["id"],), 1) + else: + pm.monitor("temp_ok", (kvm["temp"][temp]["id"],), 0) + + for volt in kvm["volt"]: + pm.monitor("voltage", (kvm["volt"][volt]["id"],), kvm["volt"][volt]["voltage"]) + if kvm["volt"][volt]["status"] == "ok": + pm.monitor("voltage_ok", (kvm["volt"][volt]["id"],), 1) + else: + pm.monitor("voltage_ok", (kvm["volt"][volt]["id"],), 0) + + if "docker" in sys.modules: + pm.delete_monitor("docker_version") + pm.delete_monitor("docker_images") + pm.delete_monitor("docker_containers") + pm.delete_monitor("docker_volumes") + pm.delete_monitor("docker_build_cache") + + for container in containers: + try: + pm.monitor("docker_status", (container,), containers[container]["status"]) + 
pm.monitor("docker_health", (container,), containers[container]["health"]) + except: + pm.add_monitor("docker_status", "") + pm.add_monitor("docker_health", "") + pm.monitor("docker_status", (container,), containers[container]["status"]) + pm.monitor("docker_health", (container,), containers[container]["health"]) + + try: + pm.monitor("docker_version", (docker_info["version"],), 1) + for module in docker_size: + pm.monitor("docker_overall_info", (module,"count_total"), int(docker_size[module]["count_total"])) + pm.monitor("docker_overall_info", (module,"count_active"), int(docker_size[module]["count_active"])) + pm.monitor("docker_overall_size", (module,"used"), docker_size[module]["used"]) + pm.monitor("docker_overall_size", (module,"reclaimable"), docker_size[module]["reclaimable"]) + + for image in docker_info["images"]: + pm.monitor("docker_images", (image,docker_info["images"][image]["containers"],docker_info["images"][image]["created"],docker_info["images"][image]["repository"],docker_info["images"][image]["unique_size"]), docker_info["images"][image]["size"]) + + for container in docker_info["containers"]: + pm.monitor("docker_containers", (container,docker_info["containers"][container]["created"],docker_info["containers"][container]["image"],docker_info["containers"][container]["volumes"],docker_info["containers"][container]["mounts"],docker_info["containers"][container]["name"],docker_info["containers"][container]["networks"],docker_info["containers"][container]["runtime"],docker_info["containers"][container]["state"],docker_info["containers"][container]["status"]), docker_info["containers"][container]["size"]) + + for volume in docker_info["volumes"]: + pm.monitor("docker_volumes", (volume,docker_info["volumes"][volume]["driver"],docker_info["volumes"][volume]["links"],docker_info["volumes"][volume]["mountpoint"]), docker_info["volumes"][volume]["size"]) + + for cache in docker_info["buildcache"]: + pm.monitor("docker_build_cache", 
(cache,docker_info["buildcache"][cache]["type"],docker_info["buildcache"][cache]["created"],docker_info["buildcache"][cache]["in_use"],docker_info["buildcache"][cache]["last_use"],docker_info["buildcache"][cache]["shared"],docker_info["buildcache"][cache]["use_count"]), docker_info["buildcache"][cache]["size"]) + except Exception as e: + print(e) + + if "sensors" in sys.modules: + for temp in temperatures: + try: + for sensor in temperatures[temp]: + if "coretemp" in temp: + pm.monitor("temp_celsius_lm", (str(sensor),), temperatures[temp][sensor]) + else: + pm.monitor("temp_celsius_lm", (str(temp)+'_'+str(sensor),), temperatures[temp][sensor]) + except: + pm.add_monitor("temp_celsius_lm", "C") + for sensor in temperatures[temp]: + if "coretemp" in temp: + pm.monitor("temp_celsius_lm", (str(sensor),), temperatures[temp][sensor]) + else: + pm.monitor("temp_celsius_lm", (str(temp)+'_'+str(sensor),), temperatures[temp][sensor]) + + for fan in fans: + try: + for sensor in fans[fan]: + pm.monitor("fans_lm", (str(sensor),), fans[fan][sensor]) + except: + pm.add_monitor("fans_lm", "RPM") + for sensor in fans[fan]: + pm.monitor("fans_lm", (str(sensor),), fans[fan][sensor]) + + for voltage in voltages: + try: + for sensor in voltages[voltage]: + pm.monitor("voltages_lm", (str(sensor),), voltages[voltage][sensor]) + except: + pm.add_monitor("voltages_lm", "V") + for sensor in voltages[voltage]: + pm.monitor("voltages_lm", (str(sensor),), voltages[voltage][sensor]) + + if "intelPower" in sys.modules: + try: + pm.monitor("intel_cpu_power", ("core",), cores) + pm.monitor("intel_cpu_power", ("package",), package) + except: + pm.add_monitor("intel_cpu_power", "W") + pm.monitor("intel_cpu_power", ("core",), cores) + pm.monitor("intel_cpu_power", ("package",), package) + + if "RAS" in sys.modules: + for item in ras: + try: + for tag in ras[item]: + if tag == "total_errors": + pm.monitor("ras_total", (item.replace(' ','_'),), ras[item][tag]) + else: + for error in ras[item][tag]: + 
pm.monitor("ras_"+error, (item.replace(' ','_')+"_"+tag,), ras[item][tag][error]) + except: + for tag in ras[item]: + if tag == "total_errors": + try: + pm.add_monitor("ras_total", "") + pm.monitor("ras_total", (item.replace(' ','_'),), ras[item][tag]) + except: + pass + else: + for error in ras[item][tag]: + try: + pm.add_monitor("ras_"+error, "") + pm.monitor("ras_"+error, (item.replace(' ','_')+"_"+tag,), ras[item][tag][error]) + except: + pass + + if "SMART" in sys.modules: + try: + pm.add_monitor("smart_bytes_written", "B", tags=("serial","instance")) + except: + pass + + pm.delete_monitor("smart_attributes") + + for device in smart: + serial = smart[device]["serial_number"] + + pm.monitor("smart_bytes_written", (serial,device), smart[device]["bytes_written"]) + + for attr in smart[device]["data"]: + if smart[device]["type"] == "ATA": + try: + pm.monitor("smart_raw_"+attr, (serial,device), smart[device]["data"][attr]["raw"]) + pm.monitor("smart_value_"+attr, (serial,device), smart[device]["data"][attr]["value"]) + pm.monitor("smart_thr_"+attr, (serial,device), smart[device]["data"][attr]["thr"]) + pm.monitor("smart_worst_"+attr, (serial,device), smart[device]["data"][attr]["worst"]) + pm.monitor("smart_attributes", (serial,device,attr,smart[device]["data"][attr]["id"],smart[device]["data"][attr]["value"],smart[device]["data"][attr]["thr"],smart[device]["data"][attr]["worst"],smart[device]["data"][attr]["raw"]), smart[device]["data"][attr]["raw"]) + except: + try: + pm.add_monitor("smart_raw_"+attr, "", tags=("serial","instance")) + pm.add_monitor("smart_value_"+attr, "", tags=("serial","instance")) + pm.add_monitor("smart_thr_"+attr, "", tags=("serial","instance")) + pm.add_monitor("smart_worst_"+attr, "", tags=("serial","instance")) + pm.monitor("smart_raw_"+attr, (serial,device), smart[device]["data"][attr]["raw"]) + pm.monitor("smart_value_"+attr, (serial,device), smart[device]["data"][attr]["value"]) + pm.monitor("smart_thr_"+attr, (serial,device), 
smart[device]["data"][attr]["thr"]) + pm.monitor("smart_worst_"+attr, (serial,device), smart[device]["data"][attr]["worst"]) + except: + pass + else: + try: + pm.monitor("smart_raw_"+attr, (serial,device), smart[device]["data"][attr]) + pm.monitor("smart_attributes", (serial,device,attr,"","","","",smart[device]["data"][attr]), smart[device]["data"][attr]) + except: + try: + pm.add_monitor("smart_raw_"+attr, "", tags=("serial","instance")) + pm.monitor("smart_raw_"+attr, (serial,device), smart[device]["data"][attr]) + except: + pass + + if "procMon" in sys.modules: + try: + pm.add_monitor("proc_summary", "", tags=("PID","CPU","VIRT","RAM","% RAM","THR","STARTTIME","RUNTIME","PARENT","STATE","COMM")) + pm.add_monitor("proc_cpu", "%") + pm.add_monitor("proc_memory_used", "B") + pm.add_monitor("proc_memory_virt", "B") + pm.add_monitor("proc_memory_percent", "%") + pm.add_monitor("proc_page_fault_minor", "") + pm.add_monitor("proc_page_fault_major", "") + except: + pass + + pm.delete_monitor("proc_summary") + pm.delete_monitor("proc_cpu") + pm.delete_monitor("proc_memory_used") + pm.delete_monitor("proc_memory_virt") + pm.delete_monitor("proc_memory_percent") + pm.delete_monitor("proc_page_fault_minor") + pm.delete_monitor("proc_page_fault_major") + + for pid in proc: + try: + pm.monitor("proc_summary", (proc[pid]["id"],proc[pid]["cpu"],str(int(proc[pid]["virt"]) / 1000)+" kB",str(int(proc[pid]["memory"]) / 1000)+" kB",round(float(proc[pid]["memory_percent"]),2),proc[pid]["threadcnt"],proc[pid]["starttime"],proc[pid]["runtime_seconds"],proc[pid]["parent_pid"],proc[pid]["state"],proc[pid]["comm"]), 1) + pm.monitor("proc_cpu", (str(proc[pid]["id"])+"_"+proc[pid]["comm"],), proc[pid]["cpu"]) + pm.monitor("proc_memory_used", (str(proc[pid]["id"])+"_"+proc[pid]["comm"],), proc[pid]["memory"]) + pm.monitor("proc_memory_virt", (str(proc[pid]["id"])+"_"+proc[pid]["comm"],), proc[pid]["virt"]) + pm.monitor("proc_memory_percent", (str(proc[pid]["id"])+"_"+proc[pid]["comm"],), 
proc[pid]["memory_percent"]) + pm.monitor("proc_page_fault_minor", (str(proc[pid]["id"])+"_"+proc[pid]["comm"],), proc[pid]["page_fault_minor"]) + pm.monitor("proc_page_fault_major", (str(proc[pid]["id"])+"_"+proc[pid]["comm"],), proc[pid]["page_fault_major"]) + except: + pass + + if "cpuinfo" in sys.modules: + pm.monitor("cpu_info", (cpuinfo["vendor"],cpuinfo["model"],cpuinfo["cpus"]), 1) + + if "gpu" in sys.modules: + pm.delete_monitor("gpu_info") + pm.delete_monitor("gpu_util") + pm.delete_monitor("gpu_throttle") + + for device in gpuinfo["gpu"]: + pm.monitor("gpu_info", (gpuinfo["gpu"][device]["product_name"], gpuinfo["gpu"][device]["vbios_version"], gpuinfo["about"]["driver_version"], gpuinfo["gpu"][device]["performance_state"], device), 1) + + for item in gpuinfo["gpu"][device]["util"]: + pm.monitor("gpu_util", (gpuinfo["gpu"][device]["product_name"], device, item), gpuinfo["gpu"][device]["util"][item]) + for item in gpuinfo["gpu"][device]["throttle"]: + pm.monitor("gpu_throttle", (gpuinfo["gpu"][device]["product_name"], device, item), gpuinfo["gpu"][device]["throttle"][item]) + + pm.monitor("gpu_fan_speed", (gpuinfo["gpu"][device]["product_name"], device), gpuinfo["gpu"][device]["fan_speed"]) + pm.monitor("gpu_memory_used", (gpuinfo["gpu"][device]["product_name"], device), gpuinfo["gpu"][device]["memory_used"]) + pm.monitor("gpu_memory_total", (gpuinfo["gpu"][device]["product_name"], device), gpuinfo["gpu"][device]["memory_total"]) + pm.monitor("gpu_temp", (gpuinfo["gpu"][device]["product_name"], device), gpuinfo["gpu"][device]["temp"]) + pm.monitor("gpu_power", (gpuinfo["gpu"][device]["product_name"], device), gpuinfo["gpu"][device]["power"]) + pm.monitor("gpu_power_limit", (gpuinfo["gpu"][device]["product_name"], device), gpuinfo["gpu"][device]["power_limit"]) + pm.monitor("gpu_gpu_clock", (gpuinfo["gpu"][device]["product_name"], device), gpuinfo["gpu"][device]["gpu_clock"]) + pm.monitor("gpu_mem_clock", (gpuinfo["gpu"][device]["product_name"], device), 
def pullTags():
    """Run ``git fetch --tags`` in the current working directory.

    The command's standard output is captured and discarded; nothing is
    returned and the exit status is not inspected.
    """
    fetch_cmd = ['git', 'fetch', '--tags']
    subprocess.run(fetch_cmd, stdout=subprocess.PIPE)
# Size units printed by ``pacman -Qi``, expressed in KiB.
# NOTE(review): dpkg's Installed-Size is assumed to be KiB as well, so both
# back ends report the same unit -- confirm against the dashboards.
_SIZE_MULTIPLIERS = {
    "B": 1 / 1024,
    "KiB": 1,
    "MiB": 1024,
    "GiB": 1024**2,
    "TiB": 1024**3,
}


def _getDist():
    """Return the distribution ID read from ``/etc/os-release``.

    Quotes around the value are stripped.  When the file has no ``ID=`` line
    the os-release default ``"linux"`` is returned instead of failing on an
    unbound variable.
    """
    with open("/etc/os-release", "r") as f:
        for line in f:
            key, sep, value = line.strip().partition("=")
            if sep and key == "ID":
                return value.strip("\"'")

    return "linux"


def _isProxmox():
    """Report whether ``pveversion`` can be executed on this host.

    Returns:
        ``{"ispve": False}`` when the binary is missing, otherwise
        ``{"ispve": True, "string": <pveversion output>}``.
    """
    ispve = {"ispve": False}
    try:
        result = subprocess.run(['pveversion'], stdout=subprocess.PIPE)
    except FileNotFoundError:
        return ispve

    ispve["ispve"] = True
    ispve["string"] = result.stdout.decode('utf-8')
    return ispve


def _sizeMultiplier(unit):
    """Return the factor converting a pacman size *unit* to KiB (0 if unknown)."""
    return _SIZE_MULTIPLIERS.get(unit, 0)


def _runText(cmd):
    """Run *cmd* (an argument list) and return its decoded standard output."""
    return subprocess.run(cmd, stdout=subprocess.PIPE).stdout.decode('utf-8')


def _parseAptList(text):
    """Parse ``apt list`` output into ``{name: {"version", "repository"}}``.

    Lines without a ``name/repository version`` shape (the ``Listing...``
    header, blank lines) are skipped.
    """
    parsed = {}
    for line in text.splitlines():
        fields = line.split(" ")
        token = fields[0].split("/")
        if len(token) < 2 or len(fields) < 2:
            continue
        parsed[token[0]] = {"version": fields[1], "repository": token[1]}
    return parsed


def _parsePacmanUpdates(text):
    """Parse ``repo,name,version`` lines; empty or malformed lines are skipped."""
    parsed = {}
    for line in text.splitlines():
        fields = line.split(",")
        if len(fields) < 3:
            continue
        parsed[fields[1]] = {"version": fields[2], "repository": fields[0]}
    return parsed


def _parsePacmanQuery(text, repository):
    """Parse ``name version`` lines, tagging every entry with *repository*."""
    parsed = {}
    for line in text.splitlines():
        fields = line.split(" ")
        if len(fields) < 2:
            continue
        parsed[fields[0]] = {"version": fields[1], "repository": repository}
    return parsed


def _parsePacmanInfo(text):
    """Return ``{name: installed_size_in_KiB}`` from ``pacman -Qi`` output.

    Fields are matched on their exact key, so a description that merely
    contains the word "Name" cannot overwrite the package name.
    """
    sizes = {}
    for record in text.split("\n\n"):
        name = None
        size = None
        for line in record.splitlines():
            key, sep, value = line.partition(":")
            if not sep:
                continue
            key = key.strip()
            if key == "Name":
                name = value.strip()
            elif key == "Installed Size":
                size = value.strip().split(" ")
        if name is None or size is None or len(size) < 2:
            continue
        try:
            sizes[name] = float(size[0]) * _sizeMultiplier(size[1])
        except ValueError:
            continue
    return sizes


def getPackages():
    """Collect installed and upgradable packages from the system package manager.

    ``pacman`` is used when the distribution ID is ``arch``; every other
    distribution is queried through ``apt`` / ``dpkg-query``.

    Returns:
        dict with the keys
        ``updatable``   -- ``{name: {"version", "repository"}}``
        ``installed``   -- ``{name: {"version", "repository", "size"}}``
                           (on pacman ``size`` may be absent for a package)
        ``package_mgr`` -- ``"apt"`` or ``"pacman"``
        ``total_size``  -- sum of all known ``size`` values
    """
    dist = _getDist()
    pkgs = {"updatable": {}, "installed": {}}

    if dist == "arch":
        pkgs["package_mgr"] = "pacman"
        # Empty output (nothing to upgrade, no foreign packages) yields {}.
        pkgs["updatable"] = _parsePacmanUpdates(
            _runText(["pacman", "-Sup", "--print-format", "%r,%n,%v"]))
        installed = _parsePacmanQuery(_runText(["pacman", "-Qn"]), "pacman.conf")
        installed.update(_parsePacmanQuery(_runText(["pacman", "-Qm"]), "user/AUR"))
        for name, size in _parsePacmanInfo(_runText(["pacman", "-Qi"])).items():
            if name in installed:
                installed[name]["size"] = size
        pkgs["installed"] = installed
    else:
        pkgs["package_mgr"] = "apt"
        # Refresh the package index before asking what is upgradable.
        subprocess.run(["apt", "update"], stdout=subprocess.PIPE)
        pkgs["updatable"] = _parseAptList(_runText(["apt", "list", "--upgradable"]))
        known = _parseAptList(_runText(["apt", "list"]))
        # The format string carries literal quotes (no shell strips them), so
        # they are removed from the output before splitting into lines.
        dpkg = _runText(["dpkg-query", "-W", "--showformat='${Package} ${Installed-Size}\n'"])
        for line in dpkg.replace("'", "").splitlines():
            fields = line.split(" ")
            if fields[0] not in known:
                continue
            try:
                size = float(fields[1])
            except (IndexError, ValueError):
                size = 0.0
            known[fields[0]]["size"] = size
        # `apt list` also shows packages that are not installed; only those
        # dpkg reported a size for are kept.
        pkgs["installed"] = {name: info for name, info in known.items() if "size" in info}

    pkgs["total_size"] = sum(info.get("size", 0) for info in pkgs["installed"].values())

    return pkgs
def byteMult(value):
    """Return how many bytes one unit named *value* represents.

    The sizes this module reads from ``/proc`` are labelled ``kB`` but are
    counted in 1024-byte units, so the whole ladder is binary.

    Args:
        value: Unit label, one of ``"B"``, ``"kB"``, ``"MB"``, ``"GB"``, ``"TB"``.

    Returns:
        The multiplier as an ``int``, or ``None`` for an unrecognised label.
    """
    multipliers = {
        "B": 1,
        "kB": 1024,
        "MB": 1024**2,
        "GB": 1024**3,
        "TB": 1024**4,
    }
    return multipliers.get(value)
class prometheus:
    """Small wrapper around prometheus_client metrics keyed by a short name.

    Creating an instance starts the HTTP endpoint that Prometheus scrapes.

    Args:
        name: Name of the monitored application, e.g. ``hw-monitor``; it is
            prefixed to every metric name.
        port: Port of the exported endpoint.
    """

    def __init__(self, name="promMon", port=8000):
        self.name = name
        self.port = port
        self.monitors = {}

        start_http_server(self.port)

    def add_monitor(self, name, unit, tags=("instance",), type="Gauge"):
        """Create a metric and register it under *name*.

        Args:
            name: Name of the monitored value; the metric is created as
                ``<app name>_<name>``.
            unit: Unit of measurement; passed as the second positional
                argument of the metric constructor.
            tags: Label names of the metric.  The default is a tuple so no
                mutable object is shared between calls.
            type: ``"Gauge"`` or ``"Counter"``.  Any other value is ignored
                and nothing is registered.
        """
        if type == "Gauge":
            self.monitors[name] = Gauge(self.name+"_"+name, unit, tags)
        elif type == "Counter":
            self.monitors[name] = Counter(self.name+"_"+name, unit, tags)

    def delete_monitor(self, name):
        """Drop every label set of the metric *name*; the metric itself stays.

        Useful when data is stored in the labels (SMART attributes,
        processes) and stale label combinations must disappear.
        """
        self.monitors[name].clear()

    def monitor(self, name, tags, value):
        """Set the metric *name* to *value* for the label values *tags*.

        Args:
            name: Name of the monitored value, as given to ``add_monitor``.
            tags: Label values, in the order of the metric's label names.
            value: New value of the monitored variable.
        """
        # NOTE(review): this always calls .set(); presumably only Gauge
        # monitors are updated through here -- confirm Counter usage.
        self.monitors[name].labels(*tags).set(value)
class RyzenPower:
    """Measure AMD Ryzen CPU power draw from the energy MSRs.

    Energy counters are read from ``/dev/cpu/<n>/msr`` and converted to
    watts by dividing the energy difference between two samples by the
    time between them.  Reading the MSR device requires the ``msr`` kernel
    module and root privileges (see ``_read_msr``).
    """

    # MSR addresses of the unit register and the two energy counters.
    AMD_MSR_PWR_UNIT_OFFSET = 0xC0010299
    AMD_MSR_CORE_ENERGY_OFFSET = 0xC001029A
    AMD_MSR_PACKAGE_ENERGY_OFFSET = 0xC001029B
    # Bit fields inside the unit register; only the energy field is used here.
    AMD_TIME_UNIT_MASK = 0xF0000
    AMD_ENERGY_UNIT_MASK = 0x1F00
    AMD_POWER_UNIT_MASK = 0xF

    def __init__(self, duration=1.0):
        """Detect CPU topology and prepare for sampling.

        :param duration: seconds ``measure()`` sleeps between its two samples.
        """
        self._energy_unit = self._get_energy_units()
        self._is_smt = self._detect_smt()
        self._package_topology = self._detect_physical_package_topology()
        self._duration = duration
        self._cores = list(self._package_topology.keys())
        if self._is_smt:
            # Keep one logical CPU per physical core.
            # NOTE(review): assumes SMT siblings are numbered 2k / 2k+1 - confirm.
            self._cores = [c for c in self._cores if c % 2 == 0]
        self._cores = sorted(self._cores)
        self._msr_fd_cache = {}
        # State of the previous measure_nonblocking() sample; -1 means "none yet".
        self.timestamp = 0
        self.package_energy = -1
        self.core_energy = -1

    @staticmethod
    def _read(filename):
        """Return the whole text content of *filename*."""
        with open(filename, "r") as f:
            return f.read()

    def _detect_smt(self):
        """Return True when sysfs reports SMT as "on" (or cannot be queried)."""
        try:
            smt_status = self._read("/sys/devices/system/cpu/smt/control").strip()
            logger.debug("CPU smt status is %s", smt_status)
            return smt_status == "on"
        except FileNotFoundError:
            warn("unable to detect CPU SMT status, assume SMT is on")
            return True

    @staticmethod
    def _detect_physical_package_topology():
        """Return ``{cpu_id: physical_package_id}`` for cpu0, cpu1, ...

        Enumeration stops at the first CPU id without a topology file.
        """
        cpu_package_mapping = {}
        for cpu_id in count():
            filename = "/sys/devices/system/cpu/cpu{}/topology/physical_package_id".format(cpu_id)
            if os.path.isfile(filename):
                with open(filename, "r") as f:
                    package_id = int(f.read())
                logger.debug("detected cpu %s in socket %s", cpu_id, package_id)
                cpu_package_mapping[cpu_id] = package_id
            else:
                return cpu_package_mapping

    def _read_msr(self, cpu_id, offset):
        """Read the 64-bit MSR at *offset* through ``/dev/cpu/<cpu_id>/msr``.

        :raises PermissionError: the device cannot be opened without root.
        :raises FileNotFoundError: the msr device node does not exist.
        """
        msr_file = "/dev/cpu/{}/msr".format(cpu_id)
        try:
            with open(msr_file, "rb", buffering=8192) as f:
                f.seek(offset)
                # MSR value is always 64 bits
                # https://manpages.debian.org/buster/manpages/msr.4.en.html
                return self._decode_int64(f.read(8))
        except PermissionError as err:
            raise PermissionError("root privilege is required to read model-specific registers") from err
        except FileNotFoundError as err:
            raise FileNotFoundError("msr driver is not loaded, try \"sudo modprobe msr\" to load msr module") from err

    @staticmethod
    def _decode_int64(buffer):
        """Decode 8 raw bytes as an unsigned 64-bit integer (native byte order)."""
        # A register is a bit pattern, not a signed number: decode unsigned so
        # a set top bit never turns the reading negative.
        return unpack("Q", buffer)[0]

    def _read_all_units(self):
        """Return the raw unit register as read from CPU 0."""
        return self._read_msr(0, self.AMD_MSR_PWR_UNIT_OFFSET)

    def _get_energy_units(self):
        """Return joules per counter tick: ``0.5 ** n`` with n taken from bits 8-12."""
        energy_unit = (self._read_all_units() & self.AMD_ENERGY_UNIT_MASK) >> 8
        logger.debug("CPU energy unit is 1/2^%s", energy_unit)
        energy_unit = 0.5 ** energy_unit
        return energy_unit

    def _read_package_energy(self, cpu_id):
        """Return the raw package energy counter as seen from *cpu_id*."""
        energy = self._read_msr(cpu_id, self.AMD_MSR_PACKAGE_ENERGY_OFFSET)
        logger.debug("CPU %s current package energy %s * %s J", cpu_id, energy, self._energy_unit)
        return energy

    def _read_core_energy(self, cpu_id):
        """Return the raw core energy counter of *cpu_id*."""
        energy = self._read_msr(cpu_id, self.AMD_MSR_CORE_ENERGY_OFFSET)
        logger.debug("CPU %s current core energy %s * %s J", cpu_id, energy, self._energy_unit)
        return energy

    def _calc_power(self, before, after):
        """Convert a counter difference over ``self._duration`` seconds to watts."""
        return (after - before) * self._energy_unit / self._duration

    def _calc_power_wtime(self, before, after, duration):
        """Convert a counter difference over *duration* seconds to watts."""
        return (after - before) * self._energy_unit / duration

    def measure(self):
        """Sample, sleep ``self._duration`` seconds, sample again.

        :returns: ``(package_power, core_power)``, two dicts keyed by core id
            holding watts.
        """
        package_energy_before = {c: self._read_package_energy(c) for c in self._cores}
        core_energy_before = {c: self._read_core_energy(c) for c in self._cores}
        logger.debug("sleep for %s seconds", self._duration)
        sleep(self._duration)
        package_energy_after = {c: self._read_package_energy(c) for c in self._cores}
        core_energy_after = {c: self._read_core_energy(c) for c in self._cores}
        package_power = {c: self._calc_power(package_energy_before[c], package_energy_after[c]) for c in self._cores}
        core_power = {c: self._calc_power(core_energy_before[c], core_energy_after[c]) for c in self._cores}
        return package_power, core_power

    def measure_nonblocking(self):
        """Take one sample and report power since the previous call.

        The first call (no previous sample) reports zero power everywhere.

        :returns: ``(package_power, core_power)`` where *package_power* is
            the package reading averaged over all sampled cores (a single
            number) and *core_power* is a dict of watts keyed by core id.
        """
        timestamp = datetime.now()
        package_energy = {c: self._read_package_energy(c) for c in self._cores}
        core_energy = {c: self._read_core_energy(c) for c in self._cores}

        time_delta = 0.0
        if self.package_energy != -1:
            time_delta = (timestamp - self.timestamp).total_seconds()

        if time_delta > 0:
            package_power = {c: self._calc_power_wtime(self.package_energy[c], package_energy[c], time_delta) for c in self._cores}
            core_power = {c: self._calc_power_wtime(self.core_energy[c], core_energy[c], time_delta) for c in self._cores}
        else:
            # No previous sample, or no measurable time passed: report zero
            # instead of dividing by a zero/negative interval.
            package_power = {c: 0 for c in self._cores}
            core_power = {c: 0 for c in self._cores}

        self.package_energy = package_energy
        self.core_energy = core_energy
        self.timestamp = timestamp

        # Average the package reading over every sampled core (the previous
        # code summed the reading of core 0 once per core).
        if self._cores:
            package_power = sum(package_power[c] for c in self._cores) / len(self._cores)
        else:
            package_power = 0

        return package_power, core_power

    @staticmethod
    def _format_table(table, widths, units):
        """Render *table* as left-justified fixed-width text.

        Float cells are printed with two decimals followed by the column's
        unit; every other cell is printed with ``str()``.
        """
        buffer = []
        for row in table:
            row_buffer = []
            for col, width, unit in zip(row, widths, units):
                if isinstance(col, float):
                    row_buffer.append("{:.2f}{}".format(col, unit).ljust(width))
                else:
                    row_buffer.append(str(col).ljust(width))
            buffer.append("".join(row_buffer))
        return "\n".join(buffer)

    def _format_result(self, package_power, core_power):
        """Format per-core dicts (as returned by ``measure()``) as a text table."""
        sockets = sorted(set(self._package_topology.values()))
        table = [["", "Cores Power", "Package Power"]]
        for socket in sockets:
            socket_total_cores_power = 0
            socket_package_power = 0
            # The socket row is appended first and completed after its cores
            # have been summed.
            socket_power_entry = ["SOCKET {: 2}:".format(socket)]
            table.append(socket_power_entry)
            for core in self._cores:
                if self._package_topology[core] == socket:
                    socket_total_cores_power += core_power[core]
                    socket_package_power = package_power[core]
                    table.append([
                        " CORE {: 2}:".format(core // 2 if self._is_smt else core),
                        core_power[core],
                        ""
                    ])
            socket_power_entry.append(socket_total_cores_power)
            socket_power_entry.append(socket_package_power)
        return self._format_table(table, (16, 16, 16), ("", "W", "W"))
def getDisk():
    """Return psutil's cumulative I/O counters for whole disks only.

    A device is reported when its name contains no digit at all, or when
    it contains "nvme" but no "p"; every other device is skipped.

    @retval dict mapping device name to a dict of counter values
    """
    counters = psutil.disk_io_counters(perdisk=True)
    whole_disks = {}

    for name, io in counters.items():
        nvme_without_p = "nvme" in name and "p" not in name
        digit_free = not any(ch.isdigit() for ch in name)
        if not (nvme_without_p or digit_free):
            continue

        whole_disks[name] = {
            "read": io.read_bytes,
            "write": io.write_bytes,
            "io_read": io.read_count,
            "io_write": io.write_count,
            "io_read_time": io.read_time,
            "io_write_time": io.write_time,
            "io_read_merged": io.read_merged_count,
            "io_write_merged": io.write_merged_count,
            "busy": io.busy_time,
        }

    return whole_disks
def getPartitions():
    """Return size, used and free bytes for every mounted partition.

    Devices whose name contains "loop" are skipped.  The result is keyed
    by the last path component of the device (e.g. "sda1"); if one device
    is listed several times the last entry wins.

    @retval dict mapping partition name to {"size", "used", "free"}
    """
    part_dict = {}

    for part in psutil.disk_partitions():
        name = part.device.split('/')[-1]
        if "loop" in name:
            continue

        # Query the filesystem once so all three numbers describe the same
        # instant (and the mountpoint is only stat'ed once).
        usage = psutil.disk_usage(part.mountpoint)
        part_dict[name] = {
            "size": usage.total,
            "used": usage.used,
            "free": usage.free,
        }

    return part_dict
def getNetwork():
    """Return cumulative traffic counters for each monitored interface.

    The loopback interface ("lo") and every interface whose name contains
    "fw" or "br" are left out.

    @retval dict mapping interface name to a dict of counters
    """
    net_dict = {}

    for nic, counters in psutil.net_io_counters(pernic=True).items():
        # Loopback is matched exactly: a substring test for "lo" would also
        # drop real interfaces such as "wlo1".
        if nic == "lo" or "fw" in nic or "br" in nic:
            continue

        net_dict[nic] = {
            "rx": counters.bytes_recv,
            "tx": counters.bytes_sent,
            "err_rx": counters.errin,
            "err_tx": counters.errout,
            "drop_rx": counters.dropin,
            "drop_tx": counters.dropout,
            "packet_tx": counters.packets_sent,
            "packet_rx": counters.packets_recv,
        }

    return net_dict
def getTemps():
    """Collect temperature input readings from the getSensors() output.

    For every device and sensor, the value of each field whose name
    contains both "temp" and "input" is stored under the sensor name.

    @retval dict of {device: {sensor: value}}
    """
    readings = getSensors()

    temps = {}
    for chip in readings:
        chip_temps = {}
        temps[chip] = chip_temps
        for sensor in readings[chip]:
            fields = readings[chip][sensor]
            for field in fields:
                if "temp" in field and "input" in field:
                    chip_temps[sensor] = fields[field]
    return temps