initial commit

This commit is contained in:
2026-04-27 11:35:05 +02:00
parent b8e85624b9
commit feda943270
19 changed files with 2381 additions and 2 deletions

61
RAS.py Normal file
View File

@@ -0,0 +1,61 @@
import subprocess
def _driver_name(line):
    """Extract the driver/section name from a header line of the RAS summary.

    For lines containing "No" the word "No" is skipped and the name ends at
    the first word containing "errors".  For all other lines the name ends at
    the word "events" or the first word containing "errors".  When no
    terminator word is found the accumulated name keeps its trailing space,
    matching the historical behaviour.
    """
    skip_no = "No" in line
    parts = []
    for word in line.split(' '):
        if skip_no:
            if word == "No":
                continue
        elif word == "events":
            return " ".join(parts)
        if "errors" in word:
            return " ".join(parts)
        parts.append(word)
    return "".join(word + " " for word in parts)


def convertToDict(RAS_dump):
    """Parse the text printed by ``ras-mc-ctl --summary`` into a nested dict.

    Lines without a tab start a new section (driver); lines containing a tab
    are error records belonging to the most recent section.

    Args:
        RAS_dump: the summary text as a single string.

    Returns:
        ``{driver: {"total_errors": int, record_name: {"corrected"|"uncorrected": int}}}``

    Raises:
        IndexError, ValueError: if a record line matches neither of the two
            supported layouts.
    """
    ras_status = {}
    current_driver = ""
    for line in RAS_dump.split('\n'):
        if line == '':
            continue
        if '\t' not in line:
            current_driver = _driver_name(line)
            ras_status[current_driver] = {"total_errors": 0}
            continue

        fields = line.replace('\t', '').split(' ')
        try:
            # Primary layout: "<count> ... : <name>"
            name = line.split(': ')[1]
            count = int(fields[0])
            kind = "corrected" if "Corrected" in line else "uncorrected"
        except (IndexError, ValueError):
            # Fallback layout: "<name> <word> <count> ..."
            name = fields[0]
            count = int(fields[2])
            kind = "uncorrected"

        # A record that appears before any header is filed under the current
        # (empty) driver name instead of raising KeyError.
        section = ras_status.setdefault(current_driver, {"total_errors": 0})
        # Parse first, store afterwards: a failed primary parse no longer
        # leaves an empty entry behind.
        section[name] = {kind: count}
        section["total_errors"] += count
    return ras_status
def readRAS():
    """Run ``ras-mc-ctl --summary`` and return its output parsed by convertToDict."""
    completed = subprocess.run(['ras-mc-ctl', '--summary'], stdout=subprocess.PIPE)
    summary_text = completed.stdout.decode('utf-8')
    return convertToDict(summary_text)
# Script entry point: print the parsed RAS summary.
if __name__ == "__main__":
    print(readRAS())

View File

@@ -1,3 +1,9 @@
# monitoring # Hardware monitoring suite
Monitoring suite for system and other stuff. Set of Python scripts for asserting health and stats of a Linux system
# Configuration
See config-example.json
test

65
SMART.py Normal file
View File

@@ -0,0 +1,65 @@
import json
import subprocess
def getDevices():
    """Return the decoded JSON document printed by ``smartctl --scan-open -j``."""
    scan = subprocess.run(['smartctl', '--scan-open', '-j'], stdout=subprocess.PIPE)
    return json.loads(scan.stdout.decode('utf-8'))
def getAttributes(device, data=None):
    """Collect SMART attributes for one device.

    Args:
        device: an entry from the smartctl scan; ``device["name"]`` and
            ``device["protocol"]`` are read.
        data: optional, already decoded ``smartctl -a -j`` output for the
            device.  When omitted, smartctl is invoked.

    Returns:
        A dict with a ``"data"`` sub-dict and, for NVMe and ATA devices,
        ``"sector_size"``, ``"serial_number"``, ``"type"`` and
        ``"bytes_written"`` (``-1`` when it cannot be determined).  For any
        other protocol only ``{"data": {}}`` is returned.
    """
    attributes = {"data": {}}
    if data is None:
        completed = subprocess.run(['smartctl', device["name"], '-a', '-j'], stdout=subprocess.PIPE)
        data = json.loads(completed.stdout.decode('utf-8'))

    protocol = device["protocol"]
    if protocol == "NVMe":
        attributes["sector_size"] = data["nvme_namespaces"][0]["formatted_lba_size"]
        attributes["serial_number"] = data["serial_number"]
        attributes["type"] = "NVME"
        attributes["data"].update(data["nvme_smart_health_information_log"])
        # NVMe reports "data units" in thousands of 512-byte units, independent
        # of the namespace's formatted LBA size.
        attributes["bytes_written"] = attributes["data"]["data_units_written"] * 512 * 1000
    elif protocol == "ATA":
        attributes["sector_size"] = data["physical_block_size"]
        attributes["serial_number"] = data["serial_number"]
        attributes["type"] = "ATA"
        for attribute in data["ata_smart_attributes"]["table"]:
            try:
                raw = int(attribute["raw"]["string"].split(' ')[0])
            except (KeyError, TypeError, ValueError, AttributeError):
                raw = -1  # raw string is missing or not numeric
            attributes["data"][attribute["name"]] = {
                "raw": raw,
                "id": int(attribute["id"]),
                "value": int(attribute["value"]),
                "worst": int(attribute["worst"]),
                "thr": int(attribute["thresh"]),
            }
        # LBAs are logical blocks, so prefer the logical block size; fall back
        # to the physical size when smartctl does not report one.
        lba_size = data.get("logical_block_size", attributes["sector_size"])
        try:
            lbas_written = attributes["data"]["Total_LBAs_Written"]["raw"]
            attributes["bytes_written"] = lbas_written * lba_size if lbas_written >= 0 else -1
        except (KeyError, TypeError):
            attributes["bytes_written"] = -1
    return attributes
def getAllDeviceAttributes():
    """Return ``{device name: getAttributes(device)}`` for every scanned device."""
    scanned = getDevices()
    return {entry["name"]: getAttributes(entry) for entry in scanned["devices"]}
# Script entry point: dump all collected attributes, then each serial number.
if __name__ == "__main__":
    smart = getAllDeviceAttributes()
    print(smart)
    for device in smart:
        # NOTE(review): getAttributes only sets "serial_number" for the NVMe
        # and ATA protocols, so any other device would raise KeyError here.
        print(smart[device]["serial_number"])

11
config-example.json Normal file
View File

@@ -0,0 +1,11 @@
{
"modules": ["sysinfo","docker","procMon"],
"volumes": {
"zfs": [],
"non_zfs": ["sda"]
},
"network": {
"nics": ["ens18"]
},
"gpu": "nVidia"
}

37
cpuinfo.py Normal file
View File

@@ -0,0 +1,37 @@
import subprocess
import json
def getFrequency(path="/proc/cpuinfo"):
    """Return the current clock of every CPU listed in a cpuinfo file.

    Args:
        path: file to parse; defaults to ``/proc/cpuinfo``.

    Returns:
        ``{"0": mhz, "1": mhz, ...}`` with one entry per ``cpu MHz`` line, in
        file order.  An unreadable file yields an empty dict, which is what
        the former ``cat`` based implementation produced in that case.
    """
    freq = {}
    try:
        # Read the file directly instead of spawning ``cat``.
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                if "cpu MHz" in line:
                    freq[str(len(freq))] = float(line.split(":")[-1])
    except OSError:
        return {}
    return freq
def getCpuInfo():
    """Return vendor, model, CPU count (from ``lscpu -J``) and per-CPU frequency."""
    raw = subprocess.run(['lscpu', '-J'], stdout=subprocess.PIPE).stdout
    entries = json.loads(raw.decode('utf-8'))["lscpu"]
    cpuinfo = {}
    for entry in entries:
        field = entry['field']
        # Substring matches: a later matching field overwrites an earlier one.
        if 'Vendor ID' in field:
            cpuinfo["vendor"] = entry["data"]
        if 'Model name' in field:
            cpuinfo["model"] = entry["data"]
        if field == 'CPU(s):':
            cpuinfo["cpus"] = entry["data"]
    cpuinfo["frequency"] = getFrequency()
    return cpuinfo
# Script entry point: print the collected CPU information.
if __name__ == "__main__":
    # print(getFrequency())
    print(getCpuInfo())

138
docker.py Normal file
View File

@@ -0,0 +1,138 @@
import subprocess
import json
def health():
    """Return run state and health of every container known to Docker.

    Runs ``docker ps -a --format json --no-trunc`` and parses its output as
    one JSON object per line.

    Returns:
        ``{container name: {"status": 0|1, "health": code}}`` where status is
        1 for a running container and health is 3 (unhealthy), 2 (restarting),
        0 (healthy), 1 (starting) or -1 (no health information).
    """
    completed = subprocess.run(['docker', 'ps', '-a', '--format', 'json', '--no-trunc'], stdout=subprocess.PIPE)
    # Decode and parse properly instead of editing the repr() of the bytes
    # object, which corrupted escaped characters and non-ASCII names.
    text = completed.stdout.decode('utf-8', errors='replace')
    ret_dict = {}
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        data = json.loads(line)
        status_text = data["Status"]
        # "unhealthy" must be tested before "healthy" (substring).
        if "unhealthy" in status_text:
            health_code = 3
        elif "Restarting" in status_text:
            health_code = 2
        elif "healthy" in status_text:
            health_code = 0
        elif "starting" in status_text:
            health_code = 1
        else:
            health_code = -1
        ret_dict[data["Names"]] = {
            "status": 1 if data["State"] == "running" else 0,
            "health": health_code,
        }
    return ret_dict
def _sizeConv(value):
number = ""
for d in value:
if d.isdigit():
number += d
elif d == '.':
number += d
else:
continue
number = float(number)
if "TB" in value:
return number * 10**12
elif "GB" in value:
return number * 10**9
elif "MB" in value:
return number * 10**6
elif "KB" in value:
return number * 10**3
else:
return number
def getSize():
    """Return per-type disk usage reported by ``docker system df --format json``.

    Returns:
        ``{type: {"used": bytes, "reclaimable": bytes, "count_total": ...,
        "count_active": ...}}`` with one entry per output line.
    """
    completed = subprocess.run(['docker', 'system', 'df', '--format', 'json'], stdout=subprocess.PIPE)
    size = {}
    # splitlines() copes with a missing trailing newline; the old blind
    # "[:-1]" chop would have cut the closing brace in that case.
    for line in completed.stdout.decode('utf-8').splitlines():
        if not line.strip():
            continue
        entry = json.loads(line)
        size[entry["Type"]] = {
            "used": _sizeConv(entry["Size"]),
            # "Reclaimable" carries a trailing percentage; keep the size only.
            "reclaimable": _sizeConv(entry["Reclaimable"].split(" ")[0]),
            "count_total": entry["TotalCount"],
            "count_active": entry["Active"],
        }
    return size
def getInfo():
    """Collect the Docker server version and a detailed disk-usage inventory.

    Runs ``docker system info --format json`` for the version and
    ``docker system df -v --format json`` for images, containers, volumes and
    build cache.

    Returns:
        A dict with the keys ``"version"``, ``"images"``, ``"containers"``,
        ``"volumes"`` and ``"buildcache"``; the last four map an ID (or volume
        name) to a dict of details.
    """
    docker = {}
    completed = subprocess.run(['docker', 'system', 'info', '--format', 'json'], stdout=subprocess.PIPE)
    # json.loads ignores surrounding whitespace, so no manual trimming needed.
    info = json.loads(completed.stdout.decode('utf-8'))
    docker["version"] = info["ServerVersion"]

    completed = subprocess.run(['docker', 'system', 'df', '-v', '--format', 'json'], stdout=subprocess.PIPE)
    info = json.loads(completed.stdout.decode('utf-8'))
    docker["images"] = {}
    docker["containers"] = {}
    docker["volumes"] = {}
    docker["buildcache"] = {}

    # A missing or null section is treated as empty.
    for image in info.get("Images") or []:
        docker["images"][image["ID"]] = {
            "containers": image["Containers"],
            "created": image["CreatedSince"],
            "repository": image["Repository"],
            "size": _sizeConv(image["Size"]),
            "unique_size": image["UniqueSize"],
        }
    for container in info.get("Containers") or []:
        docker["containers"][container["ID"]] = {
            "created": container["CreatedAt"],
            "image": container["Image"],
            "volumes": container["LocalVolumes"],
            "mounts": container["Mounts"],
            # "Names" and "Networks" are optional: default to "".
            "name": container.get("Names", ""),
            "networks": container.get("Networks", ""),
            "runtime": container["RunningFor"],
            "state": container["State"],
            "size": _sizeConv(container["Size"]),
            "status": container["Status"],
        }
    for volume in info.get("Volumes") or []:
        docker["volumes"][volume["Name"]] = {
            "driver": volume["Driver"],
            "links": volume["Links"],
            "mountpoint": volume["Mountpoint"],
            "size": _sizeConv(volume["Size"]),
        }
    for build in info.get("BuildCache") or []:
        docker["buildcache"][build["ID"]] = {
            "type": build["CacheType"],
            "created": build["CreatedSince"],
            "in_use": build["InUse"],
            "last_use": build["LastUsedSince"],
            "shared": build["Shared"],
            "size": _sizeConv(build["Size"]),
            "use_count": build["UsageCount"],
        }
    return docker
# Script entry point: print disk usage and the detailed inventory as JSON.
if __name__ == "__main__":
    print(json.dumps(getSize()))
    print(json.dumps(getInfo()))
    #print(health())

266
gpu.py Normal file
View File

@@ -0,0 +1,266 @@
import xmltodict
import json
import subprocess
import os
import re
def _run_nvidia_smi():
    """Return the XML text printed by ``nvidia-smi -q -x``."""
    completed = subprocess.run(['nvidia-smi', '-q', '-x'], stdout=subprocess.PIPE)
    return completed.stdout.decode('utf-8')
def _run_rocm_smi():
    """Return the decoded JSON output of ``rocm-smi -a --json``.

    ``rocm-smi`` is looked up on PATH first; if it cannot be started the
    binary at ``/opt/rocm/bin/rocm-smi`` is used instead.
    """
    arguments = ['-a', '--json']
    try:
        completed = subprocess.run(['rocm-smi'] + arguments, stdout=subprocess.PIPE)
    except OSError:
        # Only a failure to launch the binary triggers the fallback; other
        # errors are no longer silently swallowed.
        completed = subprocess.run(['/opt/rocm/bin/rocm-smi'] + arguments, stdout=subprocess.PIPE)
    return json.loads(completed.stdout.decode('utf-8'))
def _read_file_number(path):
with open(path, "r") as f:
number = f.read().split("\n")[0]
return float(number)
def _intel():
    """Collect statistics for the GPU exposed as ``/sys/class/drm/card0``.

    Combines the sysfs device id and frequency files, the first display entry
    reported by ``lshw -c display -json`` and a short sample of
    ``intel_gpu_top -J``.

    Returns:
        A dict with the keys uuid, max_freq, min_freq, cur_freq, power,
        engine_3d, engine_video, usage, model and driver.
    """
    from time import sleep
    stats = {}
    # The sysfs device id is stored under the "uuid" key.
    with open("/sys/class/drm/card0/device/device", "r") as f:
        stats["uuid"] = f.read().split("\n")[0]
    result = subprocess.run(['lshw', '-c', 'display', '-json'], stdout=subprocess.PIPE)
    result = result.stdout.decode('utf-8')
    a = json.loads(result)
    # intel_gpu_top is started, given 0.2 s to produce output, then killed.
    result = subprocess.Popen(['intel_gpu_top', '-J'], stdout=subprocess.PIPE)
    sleep(0.2)
    result.kill()
    c,d = result.communicate()
    # The first three characters are dropped before parsing; presumably they
    # precede the first JSON object in the stream -- TODO confirm.
    c = c.decode('utf-8')[3:]
    c = json.loads(c)
    # gt_*_freq_mhz values are scaled by 10**6.
    stats["max_freq"] = _read_file_number("/sys/class/drm/card0/gt_max_freq_mhz") * 10**6
    stats["min_freq"] = _read_file_number("/sys/class/drm/card0/gt_min_freq_mhz") * 10**6
    stats["cur_freq"] = _read_file_number("/sys/class/drm/card0/gt_cur_freq_mhz") * 10**6
    stats["power"] = c["power"]["GPU"]
    stats["engine_3d"] = c["engines"]["Render/3D"]["busy"]
    stats["engine_video"] = c["engines"]["Video"]["busy"]
    # Overall usage is the mean of the 3D and video engine busy values.
    stats["usage"] = (stats["engine_3d"] + stats["engine_video"]) / 2
    stats["model"] = a[0]["product"]
    stats["driver"] = a[0]["configuration"]["driver"]
    return stats
def _getAmdGpuMemSize():
devs = os.listdir("/sys/class/drm/")
cards = {}
for i in devs:
card = re.findall("card[0-9]",i)
if card != []:
try:
with open("/sys/class/drm/"+card[0]+"/device/mem_info_vram_total", "r") as f:
mem = f.read()[:-1]
with open("/sys/class/drm/"+card[0]+"/device/device", "r") as f:
device = f.read()[:-1]
cards[device] = mem
except:
pass
return cards
def _getAmdGpuMaxFanspeed():
    """Return the first "max" reading of an amdgpu fan from ``sensors -j -A``.

    Falls back to 1 when no such reading is reported.
    """
    completed = subprocess.run(['sensors', '-j', '-A'], stdout=subprocess.PIPE)
    chips = json.loads(completed.stdout.decode('utf-8'))
    for chip_name in chips:
        if "amdgpu" not in chip_name:
            continue
        chip = chips[chip_name]
        for label in chip:
            if "fan" not in label:
                continue
            readings = chip[label]
            for key in readings:
                if "max" in key:
                    return readings[key]
    return 1
def readGpu(vendor="nVidia"):
    """Collect GPU statistics for the given vendor in a common layout.

    Args:
        vendor: "nVidia", "AMD" or "Intel"; any other value produces a
            placeholder entry keyed "unsupported".

    Returns:
        ``{"about": {...}, "gpu": {key: {...}}}``.  The per-GPU dict holds
        "throttle", "util", fan speed, memory, temperature, power and clock
        values.  The key is the uuid for nVidia/Intel and the rocm-smi
        "Device ID" for AMD.
    """
    data = {}
    data["about"] = {}
    data["gpu"] = {}
    if vendor == "nVidia":
        whitelist = ["driver_version","cuda_version","product_name","uuid","vbios_version","fan_speed","performance_state"] #amd: valid?,not_valid,valid,valid,valid,valid,not_valid
        smi = xmltodict.parse(_run_nvidia_smi())
        # NOTE(review): "gpu" is indexed as a single mapping here; presumably
        # a multi-GPU system yields a list instead -- confirm.
        uuid = smi["nvidia_smi_log"]["gpu"]["uuid"]
        data["gpu"][uuid] = {}
        data["gpu"][uuid]["throttle"] = {}
        data["gpu"][uuid]["util"] = {}
        # Copy whitelisted top-level fields into "about" ...
        for i in smi["nvidia_smi_log"]:
            if i in whitelist:
                data["about"][i] = smi["nvidia_smi_log"][i]
        # ... and whitelisted per-GPU fields into the GPU entry.
        for i in smi["nvidia_smi_log"]["gpu"]:
            if i in whitelist:
                data["gpu"][uuid][i] = smi["nvidia_smi_log"]["gpu"][i]
        # Each clock event reason becomes 0 ("Not Active") or 1 (anything else).
        for i in smi["nvidia_smi_log"]["gpu"]["clocks_event_reasons"]:
            active = 1
            if "Not Active" in smi["nvidia_smi_log"]["gpu"]["clocks_event_reasons"][i]:
                active = 0
            data["gpu"][uuid]["throttle"][i.replace("clocks_event_reason_", "")] = active
        # Values are "<number> <unit>" strings; only the number is kept.
        # NOTE(review): clocks are stored as parsed here, while the AMD and
        # Intel paths scale theirs by 10**6 -- confirm the intended units.
        data["gpu"][uuid]["memory_total"] = float(smi["nvidia_smi_log"]["gpu"]["fb_memory_usage"]["total"].split(" ")[0])
        data["gpu"][uuid]["memory_used"] = float(smi["nvidia_smi_log"]["gpu"]["fb_memory_usage"]["used"].split(" ")[0])
        data["gpu"][uuid]["util"] = smi["nvidia_smi_log"]["gpu"]["utilization"]
        data["gpu"][uuid]["temp"] = float(smi["nvidia_smi_log"]["gpu"]["temperature"]["gpu_temp"].split(" ")[0])
        data["gpu"][uuid]["power"] = float(smi["nvidia_smi_log"]["gpu"]["gpu_power_readings"]["instant_power_draw"].split(" ")[0])
        data["gpu"][uuid]["power_limit"] = float(smi["nvidia_smi_log"]["gpu"]["gpu_power_readings"]["current_power_limit"].split(" ")[0])
        data["gpu"][uuid]["gpu_clock"] = float(smi["nvidia_smi_log"]["gpu"]["clocks"]["graphics_clock"].split(" ")[0])
        data["gpu"][uuid]["mem_clock"] = float(smi["nvidia_smi_log"]["gpu"]["clocks"]["mem_clock"].split(" ")[0])
        data["gpu"][uuid]["sm_clock"] = float(smi["nvidia_smi_log"]["gpu"]["clocks"]["sm_clock"].split(" ")[0])
        data["gpu"][uuid]["video_clock"] = float(smi["nvidia_smi_log"]["gpu"]["clocks"]["video_clock"].split(" ")[0])
        data["gpu"][uuid]["gpu_max_clock"] = float(smi["nvidia_smi_log"]["gpu"]["max_clocks"]["graphics_clock"].split(" ")[0])
        data["gpu"][uuid]["mem_max_clock"] = float(smi["nvidia_smi_log"]["gpu"]["max_clocks"]["mem_clock"].split(" ")[0])
        # "fan_speed" was copied by the whitelist loop above; convert it in place.
        data["gpu"][uuid]["fan_speed"] = float(data["gpu"][uuid]["fan_speed"].split(" ")[0])
        for i in data["gpu"][uuid]["util"]:
            data["gpu"][uuid]["util"][i] = float(data["gpu"][uuid]["util"][i].split(" ")[0])
    elif vendor == "AMD":
        smi = _run_rocm_smi()
        memsize = _getAmdGpuMemSize()
        # Every key except "system" is treated as one card.
        for card in smi:
            if card == "system":
                data["about"]["driver_version"] = smi["system"]["Driver version"]
            else:
                data["gpu"][smi[card]["Device ID"]] = {}
                # Fan speed as a percentage of the maximum RPM; any failure
                # (missing key, bad value, zero maximum) yields 0.0.
                try:
                    data["gpu"][smi[card]["Device ID"]]["fan_speed"] = 100 * float(smi[card]["current_fan_speed (rpm)"]) / _getAmdGpuMaxFanspeed()
                except:
                    data["gpu"][smi[card]["Device ID"]]["fan_speed"] = 0.0
                data["gpu"][smi[card]["Device ID"]]["vbios_version"] = smi[card]["VBIOS version"]
                data["gpu"][smi[card]["Device ID"]]["product_name"] = smi[card]["Device Name"]
                data["gpu"][smi[card]["Device ID"]]["uuid"] = smi[card]["Unique ID"]
                data["gpu"][smi[card]["Device ID"]]["memory_total"] = int(memsize[smi[card]["Device ID"]])
                # Used memory is derived from the total and the allocated percentage.
                data["gpu"][smi[card]["Device ID"]]["memory_used"] = (int(memsize[smi[card]["Device ID"]]) / 100) * float(smi[card]["GPU Memory Allocated (VRAM%)"])
                data["gpu"][smi[card]["Device ID"]]["temp"] = float(smi[card]["Temperature (Sensor edge) (C)"])
                # Power: try two alternative key names, then fall back to 0.0.
                try:
                    data["gpu"][smi[card]["Device ID"]]["power"] = float(smi[card]["Average Graphics Package Power (W)"])
                except:
                    try:
                        data["gpu"][smi[card]["Device ID"]]["power"] = float(smi[card]["Current Socket Graphics Package Power (W)"])
                    except:
                        data["gpu"][smi[card]["Device ID"]]["power"] = 0.0
                # Hard-coded 65.0 when the limit is not reported.
                try:
                    data["gpu"][smi[card]["Device ID"]]["power_limit"] = float(smi[card]["Max Graphics Package Power (W)"])
                except:
                    data["gpu"][smi[card]["Device ID"]]["power_limit"] = 65.0
                # Clock strings look like "(<n>Mhz)"; values are scaled by 10**6.
                data["gpu"][smi[card]["Device ID"]]["gpu_clock"] = float(smi[card]["sclk clock speed:"].replace("(","").replace("Mhz)","")) * 10**6
                data["gpu"][smi[card]["Device ID"]]["mem_clock"] = float(smi[card]["mclk clock speed:"].replace("(","").replace("Mhz)","")) * 10**6
                try:
                    data["gpu"][smi[card]["Device ID"]]["sm_clock"] = float(smi[card]["current_dclk0 (MHz)"]) * 10**6
                    data["gpu"][smi[card]["Device ID"]]["video_clock"] = float(smi[card]["current_vclk0 (MHz)"]) * 10**6
                except:
                    data["gpu"][smi[card]["Device ID"]]["sm_clock"] = 0.0
                    data["gpu"][smi[card]["Device ID"]]["video_clock"] = 0.0
                # The upper bound of the "<min> - <max>" range is the maximum clock.
                data["gpu"][smi[card]["Device ID"]]["gpu_max_clock"] = float(smi[card]["Valid sclk range"].replace("Mhz","").split(" - ")[1]) * 10**6
                # Without a valid mclk range the current mclk is used as the maximum.
                try:
                    data["gpu"][smi[card]["Device ID"]]["mem_max_clock"] = float(smi[card]["Valid mclk range"].replace("Mhz","").split(" - ")[1]) * 10**6
                except:
                    data["gpu"][smi[card]["Device ID"]]["mem_max_clock"] = float(smi[card]["mclk clock speed:"].replace("(","").replace("Mhz)","")) * 10**6
                util = {}
                util["gpu_util"] = float(smi[card]["GPU use (%)"])
                util["memory_util"] = float(smi[card]["GPU Memory Allocated (VRAM%)"])
                data["gpu"][smi[card]["Device ID"]]["throttle"] = {}
                # Defaults to 1 when "throttle_status" is missing or not numeric.
                try:
                    data["gpu"][smi[card]["Device ID"]]["throttle"]["status"] = float(smi[card]["throttle_status"])
                except:
                    data["gpu"][smi[card]["Device ID"]]["throttle"]["status"] = 1
                data["gpu"][smi[card]["Device ID"]]["util"] = util
                data["gpu"][smi[card]["Device ID"]]["performance_state"] = "N/A"
    elif vendor == "Intel":
        # Any failure in _intel() is replaced by a zeroed "Error" record.
        try:
            intel = _intel()
        except:
            intel = {'uuid': 'Error', 'max_freq': 0.0, 'min_freq': 0.0, 'cur_freq': 0.0, 'power': 0.0, 'engine_3d': 0.0, 'engine_video': 0.0, 'usage': 0.0, 'model': 'Error', 'driver': 'Error'}
        uuid = intel["uuid"]
        data["gpu"][uuid] = {}
        data["gpu"][uuid]["throttle"] = {}
        data["gpu"][uuid]["util"] = {}
        data["about"]["driver_version"] = intel["driver"]
        data["gpu"][uuid]["throttle"]["status"] = 0
        util = {}
        util["gpu_util"] = intel["usage"]
        util["memory_util"] = 0
        data["gpu"][uuid]["util"] = util
        # Fields _intel() does not provide are filled with fixed placeholders.
        data["gpu"][uuid]["fan_speed"] = 0
        data["gpu"][uuid]["vbios_version"] = "0.0.0"
        data["gpu"][uuid]["product_name"] = intel["model"]
        data["gpu"][uuid]["uuid"] = uuid
        data["gpu"][uuid]["memory_total"] = 1
        data["gpu"][uuid]["memory_used"] = 0
        data["gpu"][uuid]["temp"] = 0
        data["gpu"][uuid]["power"] = intel["power"]
        data["gpu"][uuid]["power_limit"] = 15
        data["gpu"][uuid]["gpu_clock"] = intel["cur_freq"]
        data["gpu"][uuid]["mem_clock"] = 0
        data["gpu"][uuid]["sm_clock"] = 0
        data["gpu"][uuid]["video_clock"] = 0
        data["gpu"][uuid]["mem_max_clock"] = 1
        data["gpu"][uuid]["gpu_max_clock"] = intel["max_freq"]
        data["gpu"][uuid]["performance_state"] = "N/A"
    else:
        # Unknown vendor: emit a placeholder entry with the same keys.
        uuid = "unsupported"
        data["gpu"][uuid] = {}
        data["gpu"][uuid]["throttle"] = {}
        data["gpu"][uuid]["util"] = {}
        data["about"]["driver_version"] = "Unknown"
        data["gpu"][uuid]["throttle"]["status"] = 0
        util = {}
        util["gpu_util"] = 0
        util["memory_util"] = 0
        data["gpu"][uuid]["util"] = util
        data["gpu"][uuid]["fan_speed"] = 0
        data["gpu"][uuid]["vbios_version"] = "0.0.0"
        data["gpu"][uuid]["product_name"] = "Unknown"
        data["gpu"][uuid]["uuid"] = uuid
        data["gpu"][uuid]["memory_total"] = 1
        data["gpu"][uuid]["memory_used"] = 0
        data["gpu"][uuid]["temp"] = 0
        data["gpu"][uuid]["power"] = 0
        data["gpu"][uuid]["power_limit"] = 1
        data["gpu"][uuid]["gpu_clock"] = 0
        data["gpu"][uuid]["mem_clock"] = 0
        data["gpu"][uuid]["sm_clock"] = 0
        data["gpu"][uuid]["video_clock"] = 0
        data["gpu"][uuid]["mem_max_clock"] = 1
        data["gpu"][uuid]["gpu_max_clock"] = 1
        data["gpu"][uuid]["performance_state"] = "N/A"
    return data
# Script entry point: only exercises the fan-speed lookup; its result is not
# printed.  The commented lines are alternative manual checks.
if __name__ == "__main__":
    #print(json.dumps(_run_rocm_smi(), indent=1))
    #print(json.dumps(_getAmdGpuMemSize(), indent=1))
    #print(json.dumps(readGpu(vendor="AMD"), indent=1))
    _getAmdGpuMaxFanspeed()

8
hardware-monitor.service Normal file
View File

@@ -0,0 +1,8 @@
[Unit]
Description=Hardware monitor service
[Service]
ExecStart=/opt/monitoring/bin/python /opt/monitoring/main.py
[Install]
WantedBy=multi-user.target

115
intelPower.py Normal file
View File

@@ -0,0 +1,115 @@
import logging
import os.path
import argparse
from itertools import count
from struct import unpack
from time import sleep
from warnings import warn
from datetime import datetime
logger = logging.getLogger("intel-master")
class IntelPower:
    """Measure CPU package and core power from the RAPL model-specific registers.

    Energy counters are read from ``/dev/cpu/<id>/msr``; power is the counter
    difference between two calls of :meth:`measure_nonblocking`, scaled by the
    energy unit and divided by the elapsed time.
    """

    RAPL_MSR_POWER_UNIT = 0x606
    RAPL_MSR_ENERGY = 0x611
    RAPL_MSR_PP0_ENERGY = 0x639
    # The energy status counters occupy the low 32 bits and wrap around.
    _ENERGY_COUNTER_MASK = 0xFFFFFFFF

    def __init__(self):
        self._energy_unit = self._get_power_units()
        self._package_topology = self._detect_physical_package_topology()
        # Despite the name these are the logical CPU ids found in sysfs.
        self._cores = sorted(self._package_topology.keys())
        # Sentinels meaning "no previous measurement yet".
        self.timestamp = 0
        self.package_energy = -1
        self.core_energy = -1

    def _read_msr(self, cpu_id, offset):
        """Return the 64-bit register at *offset* for logical CPU *cpu_id*.

        Raises:
            PermissionError: the msr device is not readable by this user.
            FileNotFoundError: the msr device does not exist.
        """
        msr_file = "/dev/cpu/{}/msr".format(cpu_id)
        try:
            with open(msr_file, "rb", buffering=8192) as f:
                f.seek(offset)
                return self._decode_int64(f.read(8))
        except PermissionError as err:
            raise PermissionError("root privilege is required to read model-specific registers") from err
        except FileNotFoundError as err:
            raise FileNotFoundError("msr driver is not loaded, try \"sudo modprobe msr\" to load msr module") from err

    @staticmethod
    def _decode_int64(buffer):
        """Decode 8 little-endian bytes as an unsigned 64-bit integer."""
        return unpack("<Q", buffer)[0]

    @staticmethod
    def _detect_physical_package_topology():
        """Return ``{cpu id: physical package id}`` read from sysfs.

        Enumeration stops at the first CPU id without a topology file.
        """
        cpu_package_mapping = {}
        for cpu_id in count():
            filename = "/sys/devices/system/cpu/cpu{}/topology/physical_package_id".format(cpu_id)
            if not os.path.isfile(filename):
                return cpu_package_mapping
            with open(filename, "r") as f:
                package_id = int(f.read())
            logger.debug("detected cpu {} in socket {}".format(cpu_id, package_id))
            cpu_package_mapping[cpu_id] = package_id

    def _get_power_units(self):
        """Return the energy unit: ``0.5 ** n`` with n taken from bits 12:8."""
        power_unit = self._read_msr(0, self.RAPL_MSR_POWER_UNIT)
        raw_unit = (power_unit >> 8) & 0x1F
        # Log the exponent, not the whole register value.
        logger.debug("CPU energy unit is 1/2^{}".format(raw_unit))
        return 0.5 ** raw_unit

    def _read_package_energy(self, cpu_id):
        """Return the raw package energy counter (in energy units)."""
        energy = self._read_msr(cpu_id, self.RAPL_MSR_ENERGY) & self._ENERGY_COUNTER_MASK
        logger.debug("CPU {} current package energy counter {}".format(cpu_id, energy))
        return energy

    def _read_core_energy(self, cpu_id):
        """Return the raw core (PP0) energy counter (in energy units)."""
        energy = self._read_msr(cpu_id, self.RAPL_MSR_PP0_ENERGY) & self._ENERGY_COUNTER_MASK
        logger.debug("CPU {} current core energy counter {}".format(cpu_id, energy))
        return energy

    def _calc_power_wtime(self, before, after, duration):
        """Return the power for a counter change over *duration* seconds.

        A counter that wrapped around between the two readings is handled by
        taking the difference modulo the counter width.  A non-positive
        duration yields 0.0 instead of dividing by zero.
        """
        if duration <= 0:
            return 0.0
        delta = (after - before) % (self._ENERGY_COUNTER_MASK + 1)
        return delta * self._energy_unit / duration

    def measure_nonblocking(self):
        """Return ``(package power, core power)`` averaged over all CPUs.

        The values cover the time since the previous call; the first call
        only records a baseline and returns zeros.
        """
        timestamp = datetime.now()
        package_energy = {c: self._read_package_energy(c) for c in self._cores}
        core_energy = {c: self._read_core_energy(c) for c in self._cores}
        if self.package_energy != -1:
            time_delta = (timestamp - self.timestamp).total_seconds()
            package_power = {c: self._calc_power_wtime(self.package_energy[c], package_energy[c], time_delta) for c in self._cores}
            core_power = {c: self._calc_power_wtime(self.core_energy[c], core_energy[c], time_delta) for c in self._cores}
        else:
            package_power = dict.fromkeys(self._cores, 0)
            core_power = dict.fromkeys(self._cores, 0)
        self.timestamp = timestamp
        self.package_energy = package_energy
        self.core_energy = core_energy
        if not self._cores:
            return 0.0, 0.0  # nothing detected: avoid dividing by zero
        cpu_count = len(self._cores)
        avg_pp = sum(package_power.values()) / cpu_count
        avg_cp = sum(core_power.values()) / cpu_count
        return avg_pp, avg_cp

    def read(self):
        """Return the raw power-unit register as read from logical CPU 1."""
        return self._read_msr(1, self.RAPL_MSR_POWER_UNIT)
# Script entry point: print the raw power-unit register and the detected
# topology, then take two measurements one second apart.
if __name__ == "__main__":
    print(IntelPower().read())
    print(IntelPower()._detect_physical_package_topology())
    ip = IntelPower()
    # The first call only establishes the baseline.
    pp, cp = ip.measure_nonblocking()
    print(pp, cp)
    sleep(1)
    pp, cp = ip.measure_nonblocking()
    print(pp, cp)

65
kvmSensors.py Normal file
View File

@@ -0,0 +1,65 @@
import subprocess
def fans(data):
    """Return id/rpm/status for a row whose first field contains "FAN", else None."""
    if "FAN" not in data[0]:
        return None
    return {"id": data[0], "rpm": data[1], "status": data[3]}
def voltage(data):
    """Return id/voltage/status for a row whose first field contains "VOLT", else None."""
    if "VOLT" not in data[0]:
        return None
    return {"id": data[0], "voltage": data[1], "status": data[3]}
def temp(data):
    """Return id/temp/status for a row whose first field contains "TEMP", else None."""
    if "TEMP" not in data[0]:
        return None
    return {"id": data[0], "temp": data[1], "status": data[3]}
def readSensors():
    """Read ``ipmitool sensor`` and group the rows into fans, voltages and temps.

    Rows whose reading is ``na`` or ``0x0`` and rows with fewer than four
    ``|``-separated fields are ignored.

    Returns:
        ``{"fans": {...}, "volt": {...}, "temp": {...}}`` where each group maps
        a running index (as a string) to the dict built by fans(), voltage()
        or temp().
    """
    completed = subprocess.run(['ipmitool', 'sensor'], stdout=subprocess.PIPE)
    # Decode the bytes instead of editing their repr().  Spaces are removed so
    # the fields need no trimming; apostrophes are still removed to keep the
    # ids identical to those produced before.
    text = completed.stdout.decode('utf-8', errors='replace')
    rows = text.replace(" ", "").replace("'", "").split("\n")

    retdata = {"fans": {}, "volt": {}, "temp": {}}
    readers = (("fans", fans), ("volt", voltage), ("temp", temp))
    for row in rows:
        data = row.split("|")
        # The readers access data[3]; shorter rows were previously dropped by
        # a blanket "except: pass".
        if len(data) < 4 or data[1] in ("na", "0x0"):
            continue
        for group, reader in readers:
            entry = reader(data)  # call each reader once per row
            if entry is not None:
                retdata[group][str(len(retdata[group]))] = entry
    return retdata
# Script entry point: print the grouped sensor readings.
if __name__ == "__main__":
    print(readSensors())

690
main.py Normal file
View File

@@ -0,0 +1,690 @@
from time import sleep
import datetime
import sys
import json
import os
import promMon
import mon_pkg_update
#===INIT========================================================================
# Directory of the running script; config.json is loaded from here.
filedir = os.path.dirname(os.path.abspath(sys.argv[0]))
# Volumes and network interfaces to scan, filled from config.json.
vols_to_scan = {}
vols_to_scan["zfs"] = []
vols_to_scan["non_zfs"] = []
nics_to_scan = []
# Refresh periods (seconds) and last-read timestamps; 0 means "never read".
SMART_data_update_period_seconds = 60
last_SMART_read_timestamp = 0
packages_data_update_period_seconds = 3600
last_packages_read_timestamp = 0
docker_data_update_period_seconds = 3600
last_docker_read_timestamp = 0
update_scan_period_seconds = 300
last_update_scan_timestamp = 0
update_from_git_tags = True
# Default GPU vendor, overridden by the "gpu" key of config.json.
gpu_vendor = "nVidia"
# Load config.json from the script directory, import the optional modules it
# lists and read the volume, network and GPU settings.
# NOTE(review): the outer bare "except" also catches failures of the imports
# below, which are then reported as "No config file found" -- confirm whether
# that is intended.
try:
    with open(filedir+"/config.json", "r") as f:
        config = json.loads(f.read())
    # Module names are matched by substring; each match imports the module.
    for module in config["modules"]:
        if "ryzenPower" in module:
            import ryzenPower
        if "kvmSensors" in module:
            import kvmSensors as ks
        if "sysinfo" in module:
            import sysinfo as si
        if "docker" in module:
            import docker as do
        if "SMART" in module:
            import SMART as sm
        if "sensors" in module:
            import sensors as lmsn
        if "intelPower" in module:
            import intelPower
        if "procMon" in module:
            import procMon as pr
        if "RAS" in module:
            import RAS
        if "cpuinfo" in module:
            import cpuinfo as cinfo
        if "gpu" in module:
            import gpu
        if "packages" in module:
            import packages
    # The "volumes" object is optional.
    try:
        if len(config["volumes"]["zfs"]) != 0:
            for i in config["volumes"]["zfs"]:
                vols_to_scan["zfs"].append(i)
        if len(config["volumes"]["non_zfs"]) != 0:
            for i in config["volumes"]["non_zfs"]:
                vols_to_scan["non_zfs"].append(i)
    except:
        print("No volumes object detected")
    # The "network" object is optional.
    try:
        if len(config["network"]["nics"]) != 0:
            for nic in config["network"]["nics"]:
                nics_to_scan.append(nic)
    except:
        print("No network object detected")
    # Without a "gpu" key the default vendor set earlier is kept.
    try:
        gpu_vendor = config["gpu"]
    except:
        pass
except:
    print("No config file found, exitting")
    exit(255)
proc_last = 0
# The Prometheus helper is mandatory; stop if it is not loaded.
if "promMon" not in sys.modules:
    print("Prometheus exporter helper not inported, exitting")
    exit(1)
# Create the exporter and register the monitors of every imported module.
pm = promMon.prometheus(port=9339, name="hw-monitor")
pm.add_monitor("self_loop_time", "us")
pm.add_monitor("version", "", tags=("version",))
if "docker" in sys.modules:
    pm.add_monitor("docker_version", "")
    pm.add_monitor("docker_overall_info", "", tags=("module","type"))
    pm.add_monitor("docker_overall_size", "B", tags=("module","type"))
    pm.add_monitor("docker_images", "", tags=("id","containers","created","repository","unique_size"))
    pm.add_monitor("docker_containers", "", tags=("id","created","image","volumes","mounts","name","networks","runtime","state","status"))
    pm.add_monitor("docker_volumes", "", tags=("name","driver","links","mountpoint"))
    pm.add_monitor("docker_build_cache", "", tags=("id","type","created","in_use","last_use","shared","use_count"))
if "ryzenPower" in sys.modules:
    pm.add_monitor("package_power", "W")
    pm.add_monitor("core_total", "W")
    pm.add_monitor("core", "W")
    rp = ryzenPower.RyzenPower()
# No monitors are registered for intelPower here; only the reader is created.
if "intelPower" in sys.modules:
    ip = intelPower.IntelPower()
if "kvmSensors" in sys.modules:
    pm.add_monitor("fan_rpm", "RPM")
    pm.add_monitor("fan_ok", "")
    pm.add_monitor("temp_celsius", "C")
    pm.add_monitor("temp_ok", "")
    pm.add_monitor("voltage", "V")
    pm.add_monitor("voltage_ok", "")
if "cpuinfo" in sys.modules:
    pm.add_monitor("cpu_info", "", tags=("vendor","model","cpus"))
if "sysinfo" in sys.modules:
    pm.add_monitor("disk_read", "B/s")
    pm.add_monitor("disk_write", "B/s")
    pm.add_monitor("disk_io_read", "iops")
    pm.add_monitor("disk_io_write", "iops")
    pm.add_monitor("disk_io_read_time", "ms")
    pm.add_monitor("disk_io_write_time", "ms")
    pm.add_monitor("disk_io_read_merged", "")
    pm.add_monitor("disk_io_write_merged", "")
    pm.add_monitor("disk_busy", "ms")
    pm.add_monitor("cpu_count", "")
    pm.add_monitor("cpu_frequency", "Hz")
    pm.add_monitor("cpu_usage", "%")
    pm.add_monitor("uptime", "s")
    pm.add_monitor("system_info", "", tags=("hostname","kernel","board"))
    pm.add_monitor("ip_addrs", "", tags=("interface","ip",))
    pm.add_monitor("user_sessions", "")
    pm.add_monitor("users", "", tags=("user","from"))
if "SMART" in sys.modules:
    pm.add_monitor("smart_attributes", "", tags=("serial","device","attribute","id","value","thres","worst","raw"))
if "gpu" in sys.modules:
    pm.add_monitor("gpu_info", "", tags=("name","vbios","driver","pstate","uuid"))
    pm.add_monitor("gpu_util", "%", tags=("name","uuid","stat"))
    pm.add_monitor("gpu_throttle", "", tags=("name","uuid","stat"))
    pm.add_monitor("gpu_memory_used", "MB", tags=("name","uuid"))
    pm.add_monitor("gpu_memory_total", "MB", tags=("name","uuid"))
    pm.add_monitor("gpu_power", "W", tags=("name","uuid"))
    pm.add_monitor("gpu_power_limit", "W", tags=("name","uuid"))
    pm.add_monitor("gpu_temp", "C", tags=("name","uuid"))
    pm.add_monitor("gpu_fan_speed", "%", tags=("name","uuid"))
    pm.add_monitor("gpu_gpu_clock", "Hz", tags=("name","uuid"))
    pm.add_monitor("gpu_mem_clock", "Hz", tags=("name","uuid"))
    pm.add_monitor("gpu_sm_clock", "Hz", tags=("name","uuid"))
    pm.add_monitor("gpu_video_clock", "Hz", tags=("name","uuid"))
    pm.add_monitor("gpu_gpu_clock_max", "Hz", tags=("name","uuid"))
    pm.add_monitor("gpu_mem_clock_max", "Hz", tags=("name","uuid"))
if "packages" in sys.modules:
    pm.add_monitor("update_pkg_count", "", tags=("package_mgr",))
    pm.add_monitor("update_pkg_updatable", "", tags=("package","version","repository"))
    pm.add_monitor("installed_pkg_count", "", tags=("package_mgr",))
    pm.add_monitor("installed_pkgs", "", tags=("package","version","repository"))
    pm.add_monitor("installed_pkg_size", "", tags=("package_mgr",))
def self_monitoring(name, start):
    """Publish how long the step *name* took and return a fresh timestamp.

    The time elapsed since *start* is written to the ``self_loop_time``
    monitor, labelled with *name*. The value is the timedelta's
    ``seconds * 1000000 + microseconds`` (its ``days`` part is not included).

    Returns the current time, read again after the sample was published, so
    the caller can use it as the start of the next step.
    """
    elapsed = datetime.datetime.now() - start
    elapsed_us = (elapsed.seconds * 1000000) + elapsed.microseconds
    pm.monitor("self_loop_time", (name,), elapsed_us)
    return datetime.datetime.now()
# Main polling loop. Every pass first collects a reading from each optional
# module that is present in sys.modules and afterwards publishes the values.
# mon_time is 0 until the first self_monitoring() call of the pass; from then
# on it holds the timestamp that call returned, so each "self_loop_time"
# sample covers a single step.
# NOTE(review): indentation was lost in this view -- confirm the nesting of the
# statements below against the real file before editing.
while True:
mon_time = 0
loop_start = datetime.datetime.now()
#===VARIABLE INIT===============================================================
# CPU power via the ryzenPower module (aliased rp).
if "ryzenPower" in sys.modules:
package, cores = rp.measure_nonblocking()
mon_time = self_monitoring("zenpower", loop_start if mon_time == 0 else mon_time)
# Sensor readings via the kvmSensors module (aliased ks).
if "kvmSensors" in sys.modules:
kvm = ks.readSensors()
mon_time = self_monitoring("kvm", loop_start if mon_time == 0 else mon_time)
# General system statistics via the sysinfo module (aliased si).
if "sysinfo" in sys.modules:
disks = si.getDisk()
mon_time = self_monitoring("disks", loop_start if mon_time == 0 else mon_time)
cpu = si.getCPU()
mon_time = self_monitoring("cpu", loop_start if mon_time == 0 else mon_time)
memory = si.getMemory()
mon_time = self_monitoring("memory", loop_start if mon_time == 0 else mon_time)
# partitions / zfs fall back to 0 on any exception; the export code later
# compares them against 0 to decide whether there is anything to publish.
try:
partitions = si.getPartitions()
mon_time = self_monitoring("partitions", loop_start if mon_time == 0 else mon_time)
except:
partitions = 0
try:
zfs = si.getZFS()
mon_time = self_monitoring("zfs", loop_start if mon_time == 0 else mon_time)
except:
zfs = 0
uptime = si.getUptime()
mon_time = self_monitoring("uptime", loop_start if mon_time == 0 else mon_time)
# Interface counters and IP addresses are timed together as "network".
network = si.getNetwork()
ipaddrs = si.getIP()
mon_time = self_monitoring("network", loop_start if mon_time == 0 else mon_time)
systeminfo = si.getSysInfo()
mon_time = self_monitoring("systeminfo", loop_start if mon_time == 0 else mon_time)
users = si.users()
mon_time = self_monitoring("users", loop_start if mon_time == 0 else mon_time)
# SMART attributes are rate limited: they are read on the first pass
# (timestamp still 0) and afterwards only once
# SMART_data_update_period_seconds have elapsed since the last read.
# Between reads the previous `smart` value is reused.
if "SMART" in sys.modules:
if last_SMART_read_timestamp == 0:
smart = sm.getAllDeviceAttributes()
last_SMART_read_timestamp = datetime.datetime.now()
else:
timedelta = (datetime.datetime.now() - last_SMART_read_timestamp).total_seconds()
if timedelta >= SMART_data_update_period_seconds:
smart = sm.getAllDeviceAttributes()
last_SMART_read_timestamp = datetime.datetime.now()
mon_time = self_monitoring("smart", loop_start if mon_time == 0 else mon_time)
# Docker: container health is read every pass; the info/size queries use the
# same rate-limiting scheme with docker_data_update_period_seconds.
# Any exception raised by those queries is printed and otherwise ignored.
if "docker" in sys.modules:
containers = do.health()
try:
if last_docker_read_timestamp == 0:
docker_info = do.getInfo()
docker_size = do.getSize()
last_docker_read_timestamp = datetime.datetime.now()
else:
timedelta = (datetime.datetime.now() - last_docker_read_timestamp).total_seconds()
if timedelta >= docker_data_update_period_seconds:
docker_info = do.getInfo()
docker_size = do.getSize()
last_docker_read_timestamp = datetime.datetime.now()
except Exception as e:
print(e)
mon_time = self_monitoring("containers", loop_start if mon_time == 0 else mon_time)
# Temperatures, fan speeds and voltages via the sensors module (aliased lmsn).
if "sensors" in sys.modules:
temperatures = lmsn.getTemps()
mon_time = self_monitoring("temperatures", loop_start if mon_time == 0 else mon_time)
fans = lmsn.getFans()
mon_time = self_monitoring("fans", loop_start if mon_time == 0 else mon_time)
voltages = lmsn.getVoltages()
mon_time = self_monitoring("voltage", loop_start if mon_time == 0 else mon_time)
# NOTE(review): writes the same `package` / `cores` names as the ryzenPower
# step of this loop; if both modules were ever loaded the later result would
# win -- confirm they are mutually exclusive.
if "intelPower" in sys.modules:
package, cores = ip.measure_nonblocking()
mon_time = self_monitoring("cpu_power", loop_start if mon_time == 0 else mon_time)
if "RAS" in sys.modules:
ras = RAS.readRAS()
mon_time = self_monitoring("ras", loop_start if mon_time == 0 else mon_time)
# The previous process snapshot is handed back to exportProcesses and then
# replaced by the new one.
if "procMon" in sys.modules:
proc = pr.exportProcesses(proc_last)
proc_last = proc
mon_time = self_monitoring("proc", loop_start if mon_time == 0 else mon_time)
if "cpuinfo" in sys.modules:
cpuinfo = cinfo.getCpuInfo()
mon_time = self_monitoring("cpuinfo", loop_start if mon_time == 0 else mon_time)
if "gpu" in sys.modules:
gpuinfo = gpu.readGpu(gpu_vendor)
mon_time = self_monitoring("gpu", loop_start if mon_time == 0 else mon_time)
# Package inventory, rate limited by packages_data_update_period_seconds
# (read on the first pass while the timestamp is still 0).
if "packages" in sys.modules:
if last_packages_read_timestamp == 0:
up = packages.getPackages()
last_packages_read_timestamp = datetime.datetime.now()
else:
timedelta = (datetime.datetime.now() - last_packages_read_timestamp).total_seconds()
if timedelta >= packages_data_update_period_seconds:
up = packages.getPackages()
last_packages_read_timestamp = datetime.datetime.now()
mon_time = self_monitoring("packages", loop_start if mon_time == 0 else mon_time)
# Version check, rate limited by update_scan_period_seconds: the current git
# tag is read and, when update_from_git_tags is set, mon_pkg_update.update()
# is called as well.
if last_update_scan_timestamp == 0:
version = mon_pkg_update.getCurrentTag()
if update_from_git_tags:
mon_pkg_update.update()
last_update_scan_timestamp = datetime.datetime.now()
else:
timedelta = (datetime.datetime.now() - last_update_scan_timestamp).total_seconds()
if timedelta >= update_scan_period_seconds:
version = mon_pkg_update.getCurrentTag()
if update_from_git_tags:
mon_pkg_update.update()
last_update_scan_timestamp = datetime.datetime.now()
mon_time = self_monitoring("version_check", loop_start if mon_time == 0 else mon_time)
#===MOVE VARS TO PROMETHEUS EXPORTER============================================
# The running version is published as a label with the constant value 1.
pm.monitor("version", (version,), 1)
# Uptime, CPU, per-disk I/O and memory figures collected from sysinfo.
if "sysinfo" in sys.modules:
pm.monitor("uptime", ("sensors",), uptime)
pm.monitor("cpu_count", ("sensors",), cpu["cpu_count"])
pm.monitor("cpu_usage", ("sensors",), cpu["usage"])
for disk in disks:
pm.monitor("disk_read", (disk,), disks[disk]["read"])
pm.monitor("disk_write", (disk,), disks[disk]["write"])
pm.monitor("disk_io_read", (disk,), disks[disk]["io_read"])
pm.monitor("disk_io_write", (disk,), disks[disk]["io_write"])
pm.monitor("disk_io_read_time", (disk,), disks[disk]["io_read_time"])
pm.monitor("disk_io_write_time", (disk,), disks[disk]["io_write_time"])
pm.monitor("disk_io_read_merged", (disk,), disks[disk]["io_read_merged"])
pm.monitor("disk_io_write_merged", (disk,), disks[disk]["io_write_merged"])
pm.monitor("disk_busy", (disk,), disks[disk]["busy"])
for core in cpu["frequency"]:
pm.monitor("cpu_frequency", (str(core),), cpu["frequency"][core])
# Register-on-first-use: the sample is written first; if that raises, the
# monitor is created with add_monitor() and the write is repeated.
for t in cpu["time_percent"]:
try:
pm.monitor("cpu_time_percent", (t,), cpu["time_percent"][t])
except:
pm.add_monitor("cpu_time_percent", "%")
pm.monitor("cpu_time_percent", (t,), cpu["time_percent"][t])
# One monitor per memory key, named "memory_<key>", same register-on-first-use
# scheme.
for i in memory:
try:
pm.monitor("memory_"+i, ("memory",), memory[i])
except:
pm.add_monitor("memory_"+i, "%")
pm.monitor("memory_"+i, ("memory",), memory[i])
# ZFS pools. zfs is 0 when collecting it failed earlier in the loop.
# When vols_to_scan["zfs"] is non-empty, only the pools listed there are
# published. On any exception while writing, all six zfs_* monitors are
# registered and the writes are repeated.
if zfs != 0:
for pool in zfs:
if len(vols_to_scan["zfs"]) != 0:
if pool not in vols_to_scan["zfs"]:
continue
try:
pm.monitor("zfs_state", (pool,), zfs[pool]["state"])
pm.monitor("zfs_size", (pool,), zfs[pool]["size"])
pm.monitor("zfs_used", (pool,), zfs[pool]["used"])
pm.monitor("zfs_free", (pool,), zfs[pool]["free"])
pm.monitor("zfs_fragmentation", (pool,), zfs[pool]["fragmentation"])
pm.monitor("zfs_dedup", (pool,), zfs[pool]["dedup"])
except:
pm.add_monitor("zfs_state", "")
pm.add_monitor("zfs_size", "B")
pm.add_monitor("zfs_used", "B")
pm.add_monitor("zfs_free", "B")
pm.add_monitor("zfs_fragmentation", "%")
pm.add_monitor("zfs_dedup", "")
pm.monitor("zfs_state", (pool,), zfs[pool]["state"])
pm.monitor("zfs_size", (pool,), zfs[pool]["size"])
pm.monitor("zfs_used", (pool,), zfs[pool]["used"])
pm.monitor("zfs_free", (pool,), zfs[pool]["free"])
pm.monitor("zfs_fragmentation", (pool,), zfs[pool]["fragmentation"])
pm.monitor("zfs_dedup", (pool,), zfs[pool]["dedup"])
# Per-interface network counters. When nics_to_scan is non-empty, only the
# interfaces listed there are published. On any exception while writing, all
# eight network_* monitors are registered and the writes are repeated.
for nic in network:
if len(nics_to_scan) != 0:
if nic not in nics_to_scan:
continue
try:
pm.monitor("network_rx", (nic,), network[nic]["rx"])
pm.monitor("network_tx", (nic,), network[nic]["tx"])
pm.monitor("network_err_rx", (nic,), network[nic]["err_rx"])
pm.monitor("network_err_tx", (nic,), network[nic]["err_tx"])
pm.monitor("network_drop_rx", (nic,), network[nic]["drop_rx"])
pm.monitor("network_drop_tx", (nic,), network[nic]["drop_tx"])
pm.monitor("network_packet_rx", (nic,), network[nic]["packet_rx"])
pm.monitor("network_packet_tx", (nic,), network[nic]["packet_tx"])
except:
pm.add_monitor("network_rx", "B")
pm.add_monitor("network_tx", "B")
pm.add_monitor("network_err_rx", "")
pm.add_monitor("network_err_tx", "")
pm.add_monitor("network_drop_rx", "")
pm.add_monitor("network_drop_tx", "")
pm.add_monitor("network_packet_rx", "")
pm.add_monitor("network_packet_tx", "")
pm.monitor("network_rx", (nic,), network[nic]["rx"])
pm.monitor("network_tx", (nic,), network[nic]["tx"])
pm.monitor("network_err_rx", (nic,), network[nic]["err_rx"])
pm.monitor("network_err_tx", (nic,), network[nic]["err_tx"])
pm.monitor("network_drop_rx", (nic,), network[nic]["drop_rx"])
pm.monitor("network_drop_tx", (nic,), network[nic]["drop_tx"])
pm.monitor("network_packet_rx", (nic,), network[nic]["packet_rx"])
pm.monitor("network_packet_tx", (nic,), network[nic]["packet_tx"])
# Non-ZFS partitions. partitions is 0 when collecting it failed earlier in the
# loop. When vols_to_scan["non_zfs"] is non-empty, only listed partitions are
# published.
if partitions != 0:
for part in partitions:
if len(vols_to_scan["non_zfs"]) != 0:
if part not in vols_to_scan["non_zfs"]:
continue
try:
pm.monitor("partition_size", (part,), partitions[part]["size"])
pm.monitor("partition_used", (part,), partitions[part]["used"])
pm.monitor("partition_free", (part,), partitions[part]["free"])
except:
pm.add_monitor("partition_size", "B")
pm.add_monitor("partition_used", "B")
pm.add_monitor("partition_free", "B")
pm.monitor("partition_size", (part,), partitions[part]["size"])
pm.monitor("partition_used", (part,), partitions[part]["used"])
pm.monitor("partition_free", (part,), partitions[part]["free"])
# Host identity is carried in the labels; the value is the constant 1.
pm.monitor("system_info", (systeminfo["hostname"],systeminfo["kernel"],systeminfo["board"]), 1)
# delete_monitor() is called before re-publishing the user data.
pm.delete_monitor("user_sessions")
pm.delete_monitor("users")
# NOTE(review): reads both users[user]["sessions"] and users[user]["session"];
# any exception here is silently ignored, so a wrong key would go unnoticed --
# verify against what si.users() returns.
try:
for user in users:
pm.monitor("user_sessions", (user,), users[user]["sessions"])
for session in users[user]["session"]:
pm.monitor("users", (user,users[user]["session"][session]["from"]), 1)
except:
pass
pm.delete_monitor("ip_addrs")
for interface in ipaddrs:
pm.monitor("ip_addrs", (interface,ipaddrs[interface]), 1)
# Ryzen power: package power, one "core" sample per entry of `cores` (labelled
# with int(core/2)), and the sum of all core values as "core_total".
if "ryzenPower" in sys.modules:
pm.monitor("package_power", ("sensors",), package)
core_total = 0
for core in cores:
pm.monitor("core", (str(int(core/2)),), cores[core])
core_total += cores[core]
pm.monitor("core_total", ("sensors",), core_total)
# kvmSensors: for fans, temperatures and voltages the reading is published
# together with an *_ok flag that is 1 when the sensor status is "ok", else 0.
if "kvmSensors" in sys.modules:
for fan in kvm["fans"]:
pm.monitor("fan_rpm", (kvm["fans"][fan]["id"],), kvm["fans"][fan]["rpm"])
if kvm["fans"][fan]["status"] == "ok":
pm.monitor("fan_ok", (kvm["fans"][fan]["id"],), 1)
else:
pm.monitor("fan_ok", (kvm["fans"][fan]["id"],), 0)
for temp in kvm["temp"]:
pm.monitor("temp_celsius", (kvm["temp"][temp]["id"],), kvm["temp"][temp]["temp"])
if kvm["temp"][temp]["status"] == "ok":
pm.monitor("temp_ok", (kvm["temp"][temp]["id"],), 1)
else:
pm.monitor("temp_ok", (kvm["temp"][temp]["id"],), 0)
for volt in kvm["volt"]:
pm.monitor("voltage", (kvm["volt"][volt]["id"],), kvm["volt"][volt]["voltage"])
if kvm["volt"][volt]["status"] == "ok":
pm.monitor("voltage_ok", (kvm["volt"][volt]["id"],), 1)
else:
pm.monitor("voltage_ok", (kvm["volt"][volt]["id"],), 0)
# Docker: delete_monitor() is called on the descriptive monitors first, then
# per-container status/health and the image, container, volume and build-cache
# inventories are published. Descriptive fields travel as labels; the sample
# value is the object's size.
if "docker" in sys.modules:
pm.delete_monitor("docker_version")
pm.delete_monitor("docker_images")
pm.delete_monitor("docker_containers")
pm.delete_monitor("docker_volumes")
pm.delete_monitor("docker_build_cache")
for container in containers:
try:
pm.monitor("docker_status", (container,), containers[container]["status"])
pm.monitor("docker_health", (container,), containers[container]["health"])
except:
pm.add_monitor("docker_status", "")
pm.add_monitor("docker_health", "")
pm.monitor("docker_status", (container,), containers[container]["status"])
pm.monitor("docker_health", (container,), containers[container]["health"])
# Any failure while publishing the inventory is printed and otherwise ignored.
try:
pm.monitor("docker_version", (docker_info["version"],), 1)
for module in docker_size:
pm.monitor("docker_overall_info", (module,"count_total"), int(docker_size[module]["count_total"]))
pm.monitor("docker_overall_info", (module,"count_active"), int(docker_size[module]["count_active"]))
pm.monitor("docker_overall_size", (module,"used"), docker_size[module]["used"])
pm.monitor("docker_overall_size", (module,"reclaimable"), docker_size[module]["reclaimable"])
for image in docker_info["images"]:
pm.monitor("docker_images", (image,docker_info["images"][image]["containers"],docker_info["images"][image]["created"],docker_info["images"][image]["repository"],docker_info["images"][image]["unique_size"]), docker_info["images"][image]["size"])
for container in docker_info["containers"]:
pm.monitor("docker_containers", (container,docker_info["containers"][container]["created"],docker_info["containers"][container]["image"],docker_info["containers"][container]["volumes"],docker_info["containers"][container]["mounts"],docker_info["containers"][container]["name"],docker_info["containers"][container]["networks"],docker_info["containers"][container]["runtime"],docker_info["containers"][container]["state"],docker_info["containers"][container]["status"]), docker_info["containers"][container]["size"])
for volume in docker_info["volumes"]:
pm.monitor("docker_volumes", (volume,docker_info["volumes"][volume]["driver"],docker_info["volumes"][volume]["links"],docker_info["volumes"][volume]["mountpoint"]), docker_info["volumes"][volume]["size"])
for cache in docker_info["buildcache"]:
pm.monitor("docker_build_cache", (cache,docker_info["buildcache"][cache]["type"],docker_info["buildcache"][cache]["created"],docker_info["buildcache"][cache]["in_use"],docker_info["buildcache"][cache]["last_use"],docker_info["buildcache"][cache]["shared"],docker_info["buildcache"][cache]["use_count"]), docker_info["buildcache"][cache]["size"])
except Exception as e:
print(e)
# sensors module readings. Temperature labels are the bare sensor name for
# chips whose name contains "coretemp" and "<chip>_<sensor>" otherwise; fan and
# voltage labels are the bare sensor name. Each group registers its monitor on
# the first failed write and then repeats the writes.
if "sensors" in sys.modules:
for temp in temperatures:
try:
for sensor in temperatures[temp]:
if "coretemp" in temp:
pm.monitor("temp_celsius_lm", (str(sensor),), temperatures[temp][sensor])
else:
pm.monitor("temp_celsius_lm", (str(temp)+'_'+str(sensor),), temperatures[temp][sensor])
except:
pm.add_monitor("temp_celsius_lm", "C")
for sensor in temperatures[temp]:
if "coretemp" in temp:
pm.monitor("temp_celsius_lm", (str(sensor),), temperatures[temp][sensor])
else:
pm.monitor("temp_celsius_lm", (str(temp)+'_'+str(sensor),), temperatures[temp][sensor])
for fan in fans:
try:
for sensor in fans[fan]:
pm.monitor("fans_lm", (str(sensor),), fans[fan][sensor])
except:
pm.add_monitor("fans_lm", "RPM")
for sensor in fans[fan]:
pm.monitor("fans_lm", (str(sensor),), fans[fan][sensor])
for voltage in voltages:
try:
for sensor in voltages[voltage]:
pm.monitor("voltages_lm", (str(sensor),), voltages[voltage][sensor])
except:
pm.add_monitor("voltages_lm", "V")
for sensor in voltages[voltage]:
pm.monitor("voltages_lm", (str(sensor),), voltages[voltage][sensor])
# Intel power: `cores` and `package` are each written as one sample, under the
# labels "core" and "package".
if "intelPower" in sys.modules:
try:
pm.monitor("intel_cpu_power", ("core",), cores)
pm.monitor("intel_cpu_power", ("package",), package)
except:
pm.add_monitor("intel_cpu_power", "W")
pm.monitor("intel_cpu_power", ("core",), cores)
pm.monitor("intel_cpu_power", ("package",), package)
# RAS error counters. For every driver entry the "total_errors" value goes to
# "ras_total"; every other tag holds a dict whose keys become monitors named
# "ras_<key>", labelled "<driver>_<tag>" (spaces in the driver name replaced
# by underscores). If publishing a driver fails, each monitor is registered and
# written individually, ignoring any further exception.
if "RAS" in sys.modules:
for item in ras:
try:
for tag in ras[item]:
if tag == "total_errors":
pm.monitor("ras_total", (item.replace(' ','_'),), ras[item][tag])
else:
for error in ras[item][tag]:
pm.monitor("ras_"+error, (item.replace(' ','_')+"_"+tag,), ras[item][tag][error])
except:
for tag in ras[item]:
if tag == "total_errors":
try:
pm.add_monitor("ras_total", "")
pm.monitor("ras_total", (item.replace(' ','_'),), ras[item][tag])
except:
pass
else:
for error in ras[item][tag]:
try:
pm.add_monitor("ras_"+error, "")
pm.monitor("ras_"+error, (item.replace(' ','_')+"_"+tag,), ras[item][tag][error])
except:
pass
# SMART data. "smart_bytes_written" is registered on every pass; the exception
# raised by the repeated registration is ignored. Devices of type "ATA" carry
# raw/value/thr/worst per attribute; for all other device types the attribute
# entry itself is published as the raw value. Each attribute also gets a row in
# "smart_attributes" with its details as labels.
if "SMART" in sys.modules:
try:
pm.add_monitor("smart_bytes_written", "B", tags=("serial","instance"))
except:
pass
pm.delete_monitor("smart_attributes")
for device in smart:
serial = smart[device]["serial_number"]
pm.monitor("smart_bytes_written", (serial,device), smart[device]["bytes_written"])
for attr in smart[device]["data"]:
if smart[device]["type"] == "ATA":
try:
pm.monitor("smart_raw_"+attr, (serial,device), smart[device]["data"][attr]["raw"])
pm.monitor("smart_value_"+attr, (serial,device), smart[device]["data"][attr]["value"])
pm.monitor("smart_thr_"+attr, (serial,device), smart[device]["data"][attr]["thr"])
pm.monitor("smart_worst_"+attr, (serial,device), smart[device]["data"][attr]["worst"])
pm.monitor("smart_attributes", (serial,device,attr,smart[device]["data"][attr]["id"],smart[device]["data"][attr]["value"],smart[device]["data"][attr]["thr"],smart[device]["data"][attr]["worst"],smart[device]["data"][attr]["raw"]), smart[device]["data"][attr]["raw"])
except:
# NOTE(review): this fallback registers and rewrites the four per-attribute
# monitors but does not write the "smart_attributes" row again.
try:
pm.add_monitor("smart_raw_"+attr, "", tags=("serial","instance"))
pm.add_monitor("smart_value_"+attr, "", tags=("serial","instance"))
pm.add_monitor("smart_thr_"+attr, "", tags=("serial","instance"))
pm.add_monitor("smart_worst_"+attr, "", tags=("serial","instance"))
pm.monitor("smart_raw_"+attr, (serial,device), smart[device]["data"][attr]["raw"])
pm.monitor("smart_value_"+attr, (serial,device), smart[device]["data"][attr]["value"])
pm.monitor("smart_thr_"+attr, (serial,device), smart[device]["data"][attr]["thr"])
pm.monitor("smart_worst_"+attr, (serial,device), smart[device]["data"][attr]["worst"])
except:
pass
else:
try:
pm.monitor("smart_raw_"+attr, (serial,device), smart[device]["data"][attr])
pm.monitor("smart_attributes", (serial,device,attr,"","","","",smart[device]["data"][attr]), smart[device]["data"][attr])
except:
try:
pm.add_monitor("smart_raw_"+attr, "", tags=("serial","instance"))
pm.monitor("smart_raw_"+attr, (serial,device), smart[device]["data"][attr])
except:
pass
# Per-process metrics. The proc_* monitors are registered on every pass (the
# exception from repeated registration is ignored), delete_monitor() is called
# on each of them, and one set of samples per process is written. Processes
# whose entry lacks a needed key are skipped silently by the try/except.
if "procMon" in sys.modules:
try:
pm.add_monitor("proc_summary", "", tags=("PID","CPU","VIRT","RAM","% RAM","THR","STARTTIME","RUNTIME","PARENT","STATE","COMM"))
pm.add_monitor("proc_cpu", "%")
pm.add_monitor("proc_memory_used", "B")
pm.add_monitor("proc_memory_virt", "B")
pm.add_monitor("proc_memory_percent", "%")
pm.add_monitor("proc_page_fault_minor", "")
pm.add_monitor("proc_page_fault_major", "")
except:
pass
pm.delete_monitor("proc_summary")
pm.delete_monitor("proc_cpu")
pm.delete_monitor("proc_memory_used")
pm.delete_monitor("proc_memory_virt")
pm.delete_monitor("proc_memory_percent")
pm.delete_monitor("proc_page_fault_minor")
pm.delete_monitor("proc_page_fault_major")
for pid in proc:
try:
# "proc_summary" carries everything as labels (value 1); the numeric monitors
# below are labelled "<pid>_<comm>".
pm.monitor("proc_summary", (proc[pid]["id"],proc[pid]["cpu"],str(int(proc[pid]["virt"]) / 1000)+" kB",str(int(proc[pid]["memory"]) / 1000)+" kB",round(float(proc[pid]["memory_percent"]),2),proc[pid]["threadcnt"],proc[pid]["starttime"],proc[pid]["runtime_seconds"],proc[pid]["parent_pid"],proc[pid]["state"],proc[pid]["comm"]), 1)
pm.monitor("proc_cpu", (str(proc[pid]["id"])+"_"+proc[pid]["comm"],), proc[pid]["cpu"])
pm.monitor("proc_memory_used", (str(proc[pid]["id"])+"_"+proc[pid]["comm"],), proc[pid]["memory"])
pm.monitor("proc_memory_virt", (str(proc[pid]["id"])+"_"+proc[pid]["comm"],), proc[pid]["virt"])
pm.monitor("proc_memory_percent", (str(proc[pid]["id"])+"_"+proc[pid]["comm"],), proc[pid]["memory_percent"])
pm.monitor("proc_page_fault_minor", (str(proc[pid]["id"])+"_"+proc[pid]["comm"],), proc[pid]["page_fault_minor"])
pm.monitor("proc_page_fault_major", (str(proc[pid]["id"])+"_"+proc[pid]["comm"],), proc[pid]["page_fault_major"])
except:
pass
# CPU identity as labels (vendor, model, cpus) with the constant value 1.
if "cpuinfo" in sys.modules:
pm.monitor("cpu_info", (cpuinfo["vendor"],cpuinfo["model"],cpuinfo["cpus"]), 1)
# GPU metrics. delete_monitor() is called on the three monitors that carry
# variable label sets before they are re-published; every per-device sample is
# labelled with the product name and the device key of gpuinfo["gpu"].
if "gpu" in sys.modules:
pm.delete_monitor("gpu_info")
pm.delete_monitor("gpu_util")
pm.delete_monitor("gpu_throttle")
for device in gpuinfo["gpu"]:
pm.monitor("gpu_info", (gpuinfo["gpu"][device]["product_name"], gpuinfo["gpu"][device]["vbios_version"], gpuinfo["about"]["driver_version"], gpuinfo["gpu"][device]["performance_state"], device), 1)
for item in gpuinfo["gpu"][device]["util"]:
pm.monitor("gpu_util", (gpuinfo["gpu"][device]["product_name"], device, item), gpuinfo["gpu"][device]["util"][item])
for item in gpuinfo["gpu"][device]["throttle"]:
pm.monitor("gpu_throttle", (gpuinfo["gpu"][device]["product_name"], device, item), gpuinfo["gpu"][device]["throttle"][item])
pm.monitor("gpu_fan_speed", (gpuinfo["gpu"][device]["product_name"], device), gpuinfo["gpu"][device]["fan_speed"])
pm.monitor("gpu_memory_used", (gpuinfo["gpu"][device]["product_name"], device), gpuinfo["gpu"][device]["memory_used"])
pm.monitor("gpu_memory_total", (gpuinfo["gpu"][device]["product_name"], device), gpuinfo["gpu"][device]["memory_total"])
pm.monitor("gpu_temp", (gpuinfo["gpu"][device]["product_name"], device), gpuinfo["gpu"][device]["temp"])
pm.monitor("gpu_power", (gpuinfo["gpu"][device]["product_name"], device), gpuinfo["gpu"][device]["power"])
pm.monitor("gpu_power_limit", (gpuinfo["gpu"][device]["product_name"], device), gpuinfo["gpu"][device]["power_limit"])
pm.monitor("gpu_gpu_clock", (gpuinfo["gpu"][device]["product_name"], device), gpuinfo["gpu"][device]["gpu_clock"])
pm.monitor("gpu_mem_clock", (gpuinfo["gpu"][device]["product_name"], device), gpuinfo["gpu"][device]["mem_clock"])
pm.monitor("gpu_sm_clock", (gpuinfo["gpu"][device]["product_name"], device), gpuinfo["gpu"][device]["sm_clock"])
pm.monitor("gpu_video_clock", (gpuinfo["gpu"][device]["product_name"], device), gpuinfo["gpu"][device]["video_clock"])
pm.monitor("gpu_gpu_clock_max", (gpuinfo["gpu"][device]["product_name"], device), gpuinfo["gpu"][device]["gpu_max_clock"])
pm.monitor("gpu_mem_clock_max", (gpuinfo["gpu"][device]["product_name"], device), gpuinfo["gpu"][device]["mem_max_clock"])
# Package inventory: delete_monitor() is called on all five monitors, then the
# counts and total size are published per package manager, followed by one row
# per installed package (value: its size) and one row per updatable package
# (value: 1).
if "packages" in sys.modules:
pm.delete_monitor("update_pkg_updatable")
pm.delete_monitor("update_pkg_count")
pm.delete_monitor("installed_pkg_count")
pm.delete_monitor("installed_pkgs")
pm.delete_monitor("installed_pkg_size")
pm.monitor("update_pkg_count", (up["package_mgr"],), len(up["updatable"]))
pm.monitor("installed_pkg_count", (up["package_mgr"],), len(up["installed"]))
pm.monitor("installed_pkg_size", (up["package_mgr"],), up["total_size"])
# Installed packages are best effort: any exception ends the listing silently.
try:
for package in up["installed"]:
pm.monitor("installed_pkgs", (package, up["installed"][package]["version"], up["installed"][package]["repository"]), up["installed"][package]["size"])
except:
pass
# NOTE(review): unlike the block above, this loop is not guarded, so an
# updatable entry without "version"/"repository" raises out of the main loop.
if len(up["updatable"]) > 0:
for package in up["updatable"]:
pm.monitor("update_pkg_updatable", (package, up["updatable"][package]["version"], up["updatable"][package]["repository"]), 1)
# End of one pass: record the export time and the total pass time, then pad
# the pass to one second.
mon_time = self_monitoring("prom_export", loop_start if mon_time == 0 else mon_time)
loop_time_spent = self_monitoring("hw-mon-loop", loop_start)
# Elapsed time must be "now - loop_start". The reversed subtraction used before
# was always negative, so the check always passed and the loop slept for
# 1 s *plus* the time already spent instead of only the remainder.
loop_elapsed_seconds = (datetime.datetime.now() - loop_start).total_seconds()
if loop_elapsed_seconds < 1:
    sleep(1 - loop_elapsed_seconds)

46
mon_pkg_update.py Normal file
View File

@@ -0,0 +1,46 @@
import subprocess
def pullTags():
    """Fetch all tags from the remote by running ``git fetch --tags``.

    The command's stdout is captured and discarded; its exit status is not
    checked.
    """
    subprocess.run(['git', 'fetch', '--tags'], stdout=subprocess.PIPE)
def getCurrentTag():
    """Return the tag name git reports for the currently checked-out commit.

    Runs ``git rev-parse HEAD`` to obtain the commit hash and feeds it to
    ``git name-rev --tags --name-only``. The last character of each command's
    output (the trailing newline) is cut off.
    """
    head = subprocess.run(['git', 'rev-parse', 'HEAD'], stdout=subprocess.PIPE)
    commit = head.stdout.decode('utf-8')[:-1]
    named = subprocess.run(
        ['git', 'name-rev', '--tags', '--name-only', commit],
        stdout=subprocess.PIPE,
    )
    return named.stdout.decode('utf-8')[:-1]
def checkTag():
    """Return the tag with the highest numeric version, after fetching tags.

    A tag's version is the text after its first character parsed as a float
    (``v1.2`` -> ``1.2``). The search starts from the currently checked-out
    tag and only moves to tags with a strictly greater version.

    Tags whose version cannot be parsed (empty tag list, non-numeric names)
    are ignored instead of raising ``ValueError``. If the current tag itself
    cannot be parsed it is returned unchanged, so callers see "no update".
    """

    def version_of(tag):
        # None marks "not a comparable version".
        try:
            return float(tag[1:])
        except ValueError:
            return None

    pullTags()
    listing = subprocess.run(['git', 'tag'], stdout=subprocess.PIPE)
    tags = listing.stdout.decode('utf-8').splitlines()

    latest_tag = getCurrentTag()
    latest_version = version_of(latest_tag)
    if latest_version is None:
        return latest_tag

    for tag in tags:
        version = version_of(tag)
        if version is not None and version > latest_version:
            latest_tag, latest_version = tag, version
    return latest_tag
# Compare the checked-out tag with the newest tag reported by checkTag() and,
# when they differ, check out the newest tag and call update() again with
# recursive=True.
# NOTE(review): indentation was lost in this view. It cannot be told from here
# whether the final `systemctl restart hardware-monitor.service` belongs to the
# "no update" branch or runs at the end of every call -- confirm against the
# real file. Also note that nothing bounds the recursion if the checkout does
# not change what getCurrentTag() reports.
def update(recursive=False):
current_tag = getCurrentTag()
latest_tag = checkTag()
if recursive:
print(current_tag, latest_tag)
if current_tag != latest_tag:
print("update")
result = subprocess.run(['git', 'checkout', latest_tag], stdout=subprocess.PIPE)
update(recursive=True)
else:
print("no update nessesary")
result = subprocess.run(['systemctl', 'restart', 'hardware-monitor.service'], stdout=subprocess.PIPE)
# Running the module as a script performs one update check.
if __name__ == "__main__":
update()

140
packages.py Normal file
View File

@@ -0,0 +1,140 @@
import re
import subprocess
from time import sleep
def _getDist(path="/etc/os-release"):
    """Return the distribution ID (for example ``debian`` or ``arch``).

    Args:
        path: os-release style file to read; defaults to ``/etc/os-release``.

    Returns:
        The value of the first ``ID=`` line with surrounding whitespace and
        quotes removed, or an empty string when the file has no ``ID=`` line
        (previously this case failed with an unbound local variable).

    Raises:
        OSError: if *path* cannot be opened.
    """
    with open(path, "r") as f:
        for line in f:
            key, sep, value = line.rstrip("\n").partition("=")
            # Only the exact key "ID" counts; "ID_LIKE" etc. are skipped.
            if sep and key == "ID":
                return value.strip().strip("\"'")
    return ""
def _isProxmox():
    """Report whether the ``pveversion`` command can be started.

    Returns a dict with ``"ispve"`` (bool). When the command exists, the dict
    also holds its decoded stdout under ``"string"``.
    """
    info = {"ispve": False}
    try:
        proc = subprocess.run(['pveversion'], stdout=subprocess.PIPE)
    except FileNotFoundError:
        # Executable not found: report "not Proxmox".
        return info
    version_text = proc.stdout.decode('utf-8')
    info["ispve"] = True
    info["string"] = version_text
    return info
def _sizeMultiplier(unit):
    """Return the factor that converts a size in *unit* to KiB.

    Known units are ``KiB``, ``MiB``, ``GiB`` and ``TiB``; anything else
    yields 0.
    """
    factors = (
        ("KiB", 1),
        ("MiB", 1024),
        ("GiB", 1024**2),
        ("TiB", 1024**3),
    )
    return next((factor for name, factor in factors if unit == name), 0)
def _runLines(cmd):
    """Run *cmd* (an argv list) and return its decoded stdout as lines."""
    output = subprocess.run(cmd, stdout=subprocess.PIPE).stdout.decode('utf-8')
    return output.splitlines()


def _parseAptList(lines):
    """Parse ``apt list`` lines (``name/repo version ...``) into a dict."""
    parsed = {}
    for line in lines:
        fields = line.split(" ")
        head = fields[0].split("/")
        if len(head) < 2 or len(fields) < 2:
            # "Listing..." banner, blank line or otherwise malformed entry.
            continue
        parsed[head[0]] = {"version": fields[1], "repository": head[1]}
    return parsed


def _collectApt(pkgs):
    """Fill *pkgs* from apt and dpkg-query."""
    pkgs["package_mgr"] = "apt"
    # Refresh the package index so the upgradable list is current.
    subprocess.run(["apt", "update"], stdout=subprocess.PIPE)
    pkgs["updatable"] = _parseAptList(_runLines(["apt", "list", "--upgradable"]))

    # "apt list" names every known package; only those dpkg-query reports
    # get a "size" and are kept as installed.
    known = _parseAptList(_runLines(["apt", "list"]))
    # The format string is passed without a shell, so the quotes are literal
    # and are stripped from the output again below.
    size_lines = _runLines(
        ["dpkg-query", "-W", "--showformat='${Package} ${Installed-Size}\n'"]
    )
    for line in size_lines:
        fields = line.replace("'", "").split(" ")
        if fields[0] not in known:
            continue
        try:
            size = float(fields[1])
        except (IndexError, ValueError):
            size = 0.0
        known[fields[0]]["size"] = size
    pkgs["installed"] = {
        name: info for name, info in known.items() if "size" in info
    }


def _collectPacman(pkgs):
    """Fill *pkgs* from pacman."""
    pkgs["package_mgr"] = "pacman"
    # One "repo,name,version" line per pending update; no output at all when
    # the system is up to date.
    for line in _runLines(["pacman", "-Sup", "--print-format", "%r,%n,%v"]):
        fields = line.split(",")
        if len(fields) < 3:
            continue
        pkgs["updatable"][fields[1]] = {
            "version": fields[2],
            "repository": fields[0],
        }

    installed = pkgs["installed"]
    # -Qn lists native packages, -Qm lists foreign ones.
    for flag, repository in (("-Qn", "pacman.conf"), ("-Qm", "user/AUR")):
        for line in _runLines(["pacman", flag]):
            fields = line.split(" ")
            if len(fields) < 2:
                continue
            installed[fields[0]] = {
                "version": fields[1],
                "repository": repository,
            }

    details = subprocess.run(["pacman", "-Qi"], stdout=subprocess.PIPE)
    for record in details.stdout.decode('utf-8').split("\n\n"):
        # Reset per record so values never leak from the previous package.
        name = None
        size = None
        for line in record.split("\n"):
            key, sep, value = line.partition(":")
            if not sep:
                continue
            key = key.strip()
            if key == "Name":
                name = value.strip()
            elif key == "Installed Size":
                size = value.strip()
        if name not in installed or size is None:
            continue
        try:
            number, unit = size.split(" ")[:2]
            installed[name]["size"] = float(number) * _sizeMultiplier(unit)
        except ValueError:
            # Unexpected size format: leave this package without a size.
            pass


def getPackages():
    """Collect installed and updatable packages for the local distribution.

    pacman is used when ``_getDist()`` reports ``arch``; every other
    distribution is handled with apt/dpkg.

    Returns:
        dict with the keys
        ``updatable``   - name -> {"version", "repository"}
        ``installed``   - name -> {"version", "repository", "size"}; on
                          pacman systems "size" may be missing for packages
                          whose size could not be read
        ``package_mgr`` - ``"apt"`` or ``"pacman"``
        ``total_size``  - sum of all known sizes (KiB)

    Malformed or empty command output is skipped line by line rather than
    raising, so an up-to-date system yields an empty ``updatable`` dict.
    """
    dist = _getDist()
    pkgs = {"updatable": {}, "installed": {}}
    if dist == "arch":
        _collectPacman(pkgs)
    else:
        _collectApt(pkgs)
    pkgs["total_size"] = sum(
        info.get("size", 0) for info in pkgs["installed"].values()
    )
    return pkgs
# Manual check: print the full inventory and the number of installed packages.
# Note that getPackages() is called twice, so the package manager is queried
# twice.
if __name__ == "__main__":
#print(_getDist())
#print(_isProxmox())
print(getPackages())
print(len(getPackages()['installed']))

128
procMon.py Normal file
View File

@@ -0,0 +1,128 @@
import promMon as pm
import os
import subprocess
from time import sleep
# Read the value of `getconf CLK_TCK` once at import time. It is used below to
# convert the tick counts from /proc/<pid>/stat into seconds.
result = subprocess.run(['getconf', 'CLK_TCK'], stdout=subprocess.PIPE)
result = result.stdout.decode('utf-8')
tikspersec = int(result)
def getPIDs():
    """Return every all-digit entry name of /proc as a list of strings."""
    return [entry for entry in os.listdir("/proc") if entry.isdigit()]
def parseStatus(pid):
    """Parse /proc/<pid>/status into a dict of two-token values.

    All tab characters are removed before parsing. For each line, the text
    between the first and second ":" is split on spaces. Only lines that
    yield at least two non-empty tokens are kept, stored as
    ``"<token0> <token1>"`` (for example ``"1234 kB"``). Single-token lines
    are dropped.
    """
    with open("/proc/"+str(pid)+"/status", "r") as f:
        lines = f.read().replace("\t", "").split("\n")
    del lines[-1]  # text after the final newline

    parsed = {}
    for line in lines:
        fields = line.split(":")
        if len(fields) < 2:
            continue
        tokens = [tok for tok in fields[1].split(" ") if tok != ""]
        if len(tokens) < 2:
            continue
        parsed[fields[0]] = tokens[0]+" "+tokens[1]
    return parsed
def parseMeminfo():
    """Parse /proc/meminfo into a dict mapping field name to bytes.

    Each line is split on spaces and colons are removed. The second token is
    multiplied by ``byteMult`` of the third. Lines that do not provide a name,
    an integer and a unit known to ``byteMult`` are skipped.
    """
    with open("/proc/meminfo", "r") as f:
        lines = f.read().split("\n")

    meminfo_dict = {}
    for line in lines:
        vals = [tok.replace(":", "") for tok in line.split(" ") if tok != ""]
        try:
            meminfo_dict[vals[0]] = int(vals[1]) * byteMult(vals[2])
        except Exception:
            # Missing unit, non-numeric value or unknown unit: skip the line.
            pass
    return meminfo_dict
def byteMult(value):
    """Return the decimal byte multiplier for a unit string.

    Known units are ``B``, ``kB``, ``MB``, ``GB`` and ``TB`` (powers of 1000).
    Any other value returns ``None``.
    """
    multipliers = (
        ("B", 1),
        ("kB", 1000),
        ("MB", 1000000),
        ("GB", 1000000000),
        ("TB", 1000000000000),
    )
    for unit, factor in multipliers:
        if value == unit:
            return factor
    return None
def exportProcesses(last_proc_stat=0):
    """Take a snapshot of all processes listed in /proc.

    Args:
        last_proc_stat: the dict returned by the previous call, or 0 on the
            first call. When given, ``cpu`` is the CPU usage in percent over
            the interval between the two snapshots; otherwise it stays 0.

    Returns:
        dict mapping pid (str) to a dict with the keys ``id``, ``comm``,
        ``virt``, ``cpu_tiks_user``, ``cpu_tiks_system``, ``cpu_secs``,
        ``page_fault_major``, ``page_fault_minor``, ``starttime``,
        ``threadcnt``, ``state``, ``parent_pid``, ``runtime_seconds``,
        ``cpu_total``, ``cpu``, ``memory`` and ``memory_percent``.

        Collection is best effort: if reading a process fails part-way (it
        exited, or its status has no ``VmData`` line), the entry keeps only
        the keys filled in so far.
    """
    processes = {}
    with open("/proc/uptime", "r") as f:
        uptime = float(f.read().split(" ")[0])

    # /proc/meminfo is parsed at most once per snapshot, on first use,
    # instead of once per process.
    mem_total = None

    for pid in getPIDs():
        try:
            with open("/proc/"+pid+"/stat", "r") as f:
                raw = f.read()
            # The command name is wrapped in parentheses and may itself
            # contain spaces or parentheses, so split at the LAST ") ".
            head, sep, tail = raw.rpartition(") ")
            if not sep:
                continue
            pid_field, _, comm = head.partition(" (")
            stat = [pid_field, comm] + tail.replace('\n', "").split(" ")

            entry = {}
            processes[pid] = entry
            entry["id"] = stat[0]
            entry["comm"] = stat[1]
            entry["virt"] = stat[22]
            entry["cpu_tiks_user"] = int(stat[13])
            entry["cpu_tiks_system"] = int(stat[14])
            entry["cpu_secs"] = (int(stat[13]) + int(stat[14])) / tikspersec
            entry["page_fault_major"] = stat[11]
            entry["page_fault_minor"] = stat[9]
            entry["starttime"] = int(stat[21]) / tikspersec
            entry["threadcnt"] = stat[19]
            entry["state"] = stat[2]
            entry["parent_pid"] = stat[3]
            entry["runtime_seconds"] = uptime - entry["starttime"]
            entry["cpu_total"] = 100 * entry["cpu_secs"] / entry["runtime_seconds"]
            entry["cpu"] = 0
            if last_proc_stat != 0:
                try:
                    previous = last_proc_stat[pid]
                    cpu_secs = entry["cpu_secs"] - previous["cpu_secs"]
                    sampletime = entry["runtime_seconds"] - previous["runtime_seconds"]
                    entry["cpu"] = 100 * cpu_secs / sampletime
                except Exception:
                    # New process, incomplete previous entry or zero-length
                    # interval: keep cpu at 0.
                    pass

            status = parseStatus(pid)
            memory = status["VmData"].split(" ")
            if memory[0] == "":
                memory.pop(0)
            memory_bytes = int(memory[0]) * byteMult(memory[1])
            entry["memory"] = memory_bytes
            if mem_total is None:
                mem_total = parseMeminfo()["MemTotal"]
            entry["memory_percent"] = 100 * memory_bytes / mem_total
        except Exception:
            # Processes can vanish between listing and reading; skip the rest
            # of this one and continue with the next.
            pass
    return processes
# Manual check: take an initial snapshot, then ten more, each computed against
# the previous one, printing the per-process entries.
# NOTE(review): indentation was lost in this view; whether sleep(2) runs once
# per snapshot or once per printed process cannot be told from here.
if __name__ == "__main__":
proc = exportProcesses()
for i in range(0,10):
proc = exportProcesses(proc)
for pid in proc:
print(proc[pid])
sleep(2)

58
promMon.py Normal file
View File

@@ -0,0 +1,58 @@
# Author: Antonin Kaplan
# Date: 2025-12-23
#
# Prometheus client library wrapper for easier usage
#
# On path through the deepest forest even the dimmest light shines on your path to enlightenment
from prometheus_client import start_http_server, Gauge, Counter
class prometheus:
    """Small wrapper around ``prometheus_client`` metrics.

    Creating an instance starts the HTTP endpoint that Prometheus scrapes.

    Args:
        name: name of the monitored application, for example ``hw-monitor``;
            it is prepended to every metric name.
        port: port of the exported endpoint.
    """

    def __init__(self, name="promMon", port=8000):
        self.name = name
        self.port = port
        # monitor name -> metric object
        self.monitors = {}
        start_http_server(self.port)

    def add_monitor(self, name, unit, tags=("instance",), type="Gauge"):
        """Create a metric and store it under *name*.

        Args:
            name: name of the monitored value; the metric is called
                ``<instance name>_<name>``.
            unit: unit of measurement; passed as the metric's second
                constructor argument.
            tags: label names of the metric.
            type: ``"Gauge"`` or ``"Counter"``; any other value is ignored
                and nothing is created.
        """
        if type == "Gauge":
            metric_cls = Gauge
        elif type == "Counter":
            metric_cls = Counter
        else:
            return
        self.monitors[name] = metric_cls(self.name+"_"+name, unit, tags)

    def delete_monitor(self, name):
        """Call ``clear()`` on the metric stored under *name*.

        The metric itself stays registered. Useful when data is carried in
        the labels, as for SMART attributes or processes.

        Raises:
            KeyError: if *name* was never added.
        """
        self.monitors[name].clear()

    def monitor(self, name, tags, value):
        """Set the sample of metric *name* for the label values *tags*.

        Args:
            name: name of the monitored value.
            tags: label values identifying the sample, e.g. an IP address.
            value: value to store (via the metric's ``set``).

        Raises:
            KeyError: if *name* was never added.
        """
        self.monitors[name].labels(*tags).set(value)
if __name__ == '__main__':
    # Demo: export a gauge named "test_time" on port 9339 that grows by one
    # every second.
    from time import sleep

    exporter = prometheus(name="test", port=9339)
    exporter.add_monitor("time", "s", tags=["ip"])
    elapsed = 0
    while True:
        exporter.monitor("time", ("localhost",), elapsed)
        elapsed += 1
        sleep(1)

209
ryzenPower.py Normal file
View File

@@ -0,0 +1,209 @@
#!/usr/bin/env python3
# ryzen-power: measure AMD Ryzen CPU power consumption.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
# This program is a Python port of rapl-read-ryzen
# https://github.com/djselbeck/rapl-read-ryzen
import logging
import os.path
import argparse
from itertools import count
from struct import unpack
from time import sleep
from warnings import warn
from datetime import datetime
logger = logging.getLogger("ryzen-master")


class RyzenPower:
    """Estimate AMD Ryzen CPU power from the energy-status MSRs.

    Energy counters are read through ``/dev/cpu/<n>/msr`` and turned into
    watts by differencing two readings. ``measure`` sleeps between its two
    readings; ``measure_nonblocking`` differences against the readings stored
    by the previous call.
    """

    AMD_MSR_PWR_UNIT_OFFSET = 0xC0010299
    AMD_MSR_CORE_ENERGY_OFFSET = 0xC001029A
    AMD_MSR_PACKAGE_ENERGY_OFFSET = 0xC001029B

    AMD_TIME_UNIT_MASK = 0xF0000
    AMD_ENERGY_UNIT_MASK = 0x1F00
    AMD_POWER_UNIT_MASK = 0xF

    # Counter differences are taken modulo 2**32 so that a wrapped counter
    # yields a small positive delta instead of a large negative one.
    # NOTE(review): assumes the energy counters are 32 bits wide, as the AMD
    # PPR describes for these MSRs - confirm for the target CPU family.
    _ENERGY_COUNTER_MASK = 0xFFFFFFFF

    def __init__(self, duration=1.0):
        """Read the energy unit and CPU topology.

        duration: seconds ``measure`` sleeps between its two readings.
        Raises PermissionError / FileNotFoundError if the MSR device cannot
        be read.
        """
        self._energy_unit = self._get_energy_units()
        self._is_smt = self._detect_smt()
        self._package_topology = self._detect_physical_package_topology()
        self._duration = duration

        cores = self._package_topology.keys()
        if self._is_smt:
            # With SMT only even-numbered logical CPUs are sampled.
            cores = (c for c in cores if c % 2 == 0)
        self._cores = sorted(cores)

        self._msr_fd_cache = {}
        # State carried between measure_nonblocking() calls; -1 / 0 mean
        # "no previous sample yet".
        self.timestamp = 0
        self.package_energy = -1
        self.core_energy = -1

    @staticmethod
    def _read(filename):
        """Return the full text content of ``filename``."""
        with open(filename, "r") as f:
            return f.read()

    def _detect_smt(self):
        """Return True when the kernel reports SMT as "on".

        If the sysfs control file is missing a warning is issued and SMT is
        assumed to be enabled.
        """
        try:
            smt_status = self._read("/sys/devices/system/cpu/smt/control").strip()
        except FileNotFoundError:
            warn("unable to detect CPU SMT status, assume SMT is on")
            return True
        logger.debug("CPU smt status is %s", smt_status)
        return smt_status == "on"

    @staticmethod
    def _detect_physical_package_topology():
        """Return ``{cpu_id: physical_package_id}`` read from sysfs.

        CPUs are probed from 0 upwards; probing stops at the first CPU id
        whose topology file does not exist.
        """
        cpu_package_mapping = {}
        for cpu_id in count():
            filename = "/sys/devices/system/cpu/cpu{}/topology/physical_package_id".format(cpu_id)
            if not os.path.isfile(filename):
                return cpu_package_mapping
            with open(filename, "r") as f:
                package_id = int(f.read())
            logger.debug("detected cpu %s in socket %s", cpu_id, package_id)
            cpu_package_mapping[cpu_id] = package_id

    def _read_msr(self, cpu_id, offset):
        """Return the 64-bit value of MSR ``offset`` on logical CPU ``cpu_id``.

        Raises PermissionError when the device is not readable and
        FileNotFoundError when the msr device node does not exist.
        """
        msr_file = "/dev/cpu/{}/msr".format(cpu_id)
        try:
            with open(msr_file, "rb", buffering=8192) as f:
                f.seek(offset)
                # MSR value is always 64 bits
                # https://manpages.debian.org/buster/manpages/msr.4.en.html
                return self._decode_int64(f.read(8))
        except PermissionError as err:
            raise PermissionError(
                "root privilege is required to read model-specific registers"
            ) from err
        except FileNotFoundError as err:
            raise FileNotFoundError(
                "msr driver is not loaded, try \"sudo modprobe msr\" to load msr module"
            ) from err

    @staticmethod
    def _decode_int64(buffer):
        """Decode 8 bytes (native byte order) as an unsigned 64-bit integer.

        MSRs are raw bit fields, so they are decoded unsigned; a register
        with its top bit set must not turn into a negative number.
        """
        return unpack("Q", buffer)[0]

    def _read_all_units(self):
        """Return the raw power-unit MSR as read from CPU 0."""
        return self._read_msr(0, self.AMD_MSR_PWR_UNIT_OFFSET)

    def _get_energy_units(self):
        """Return the energy unit as ``0.5 ** n``, n taken from the unit MSR."""
        exponent = (self._read_all_units() & self.AMD_ENERGY_UNIT_MASK) >> 8
        logger.debug("CPU energy unit is 1/2^%s", exponent)
        return 0.5 ** exponent

    def _read_package_energy(self, cpu_id):
        """Return the raw package energy counter as seen from ``cpu_id``."""
        energy = self._read_msr(cpu_id, self.AMD_MSR_PACKAGE_ENERGY_OFFSET)
        logger.debug("CPU %s current package energy %s * %s J", cpu_id, energy, self._energy_unit)
        return energy

    def _read_core_energy(self, cpu_id):
        """Return the raw core energy counter of ``cpu_id``."""
        energy = self._read_msr(cpu_id, self.AMD_MSR_CORE_ENERGY_OFFSET)
        logger.debug("CPU %s current core energy %s * %s J", cpu_id, energy, self._energy_unit)
        return energy

    @classmethod
    def _energy_delta(cls, before, after):
        """Return the counter ticks between two readings, tolerating wrap."""
        return (after - before) & cls._ENERGY_COUNTER_MASK

    def _calc_power(self, before, after):
        """Return power for two raw readings taken ``self._duration`` apart."""
        return self._energy_delta(before, after) * self._energy_unit / self._duration

    def _calc_power_wtime(self, before, after, duration):
        """Return power for two raw readings taken ``duration`` seconds apart."""
        return self._energy_delta(before, after) * self._energy_unit / duration

    def measure(self):
        """Sample, sleep ``self._duration`` seconds, sample again.

        Returns ``(package_power, core_power)``, two dicts keyed by the
        sampled CPU ids.
        """
        package_energy_before = {c: self._read_package_energy(c) for c in self._cores}
        core_energy_before = {c: self._read_core_energy(c) for c in self._cores}
        logger.debug("sleep for %s seconds", self._duration)
        sleep(self._duration)
        package_energy_after = {c: self._read_package_energy(c) for c in self._cores}
        core_energy_after = {c: self._read_core_energy(c) for c in self._cores}

        package_power = {
            c: self._calc_power(package_energy_before[c], package_energy_after[c])
            for c in self._cores
        }
        core_power = {
            c: self._calc_power(core_energy_before[c], core_energy_after[c])
            for c in self._cores
        }
        return package_power, core_power

    def measure_nonblocking(self):
        """Return power since the previous call without sleeping.

        Returns ``(package_power, core_power)`` where ``package_power`` is
        the mean of the per-CPU package readings (a single number) and
        ``core_power`` is a dict keyed by the sampled CPU ids. The first call
        (no previous sample) reports zero everywhere.
        """
        timestamp = datetime.now()
        package_energy = {c: self._read_package_energy(c) for c in self._cores}
        core_energy = {c: self._read_core_energy(c) for c in self._cores}

        elapsed = 0.0
        if self.package_energy != -1:
            elapsed = (timestamp - self.timestamp).total_seconds()

        if elapsed > 0:
            package_power = {
                c: self._calc_power_wtime(self.package_energy[c], package_energy[c], elapsed)
                for c in self._cores
            }
            core_power = {
                c: self._calc_power_wtime(self.core_energy[c], core_energy[c], elapsed)
                for c in self._cores
            }
        else:
            # No previous sample, or the wall clock did not advance: report
            # zero rather than dividing by a non-positive interval.
            package_power = {c: 0 for c in self._cores}
            core_power = {c: 0 for c in self._cores}

        self.package_energy = package_energy
        self.core_energy = core_energy
        self.timestamp = timestamp

        # Average over every sampled CPU (previously only entry 0 was summed).
        if package_power:
            average_package_power = sum(package_power.values()) / len(package_power)
        else:
            average_package_power = 0
        return average_package_power, core_power

    @staticmethod
    def _format_table(table, widths, units):
        """Render rows as left-justified fixed-width text columns.

        Float cells are printed with two decimals followed by the column's
        unit; every other cell is printed with ``str``.
        """
        rendered_rows = []
        for row in table:
            cells = []
            for col, width, unit in zip(row, widths, units):
                if isinstance(col, float):
                    text = "{:.2f}{}".format(col, unit)
                else:
                    text = str(col)
                cells.append(text.ljust(width))
            rendered_rows.append("".join(cells))
        return "\n".join(rendered_rows)

    def _format_result(self, package_power, core_power):
        """Return a per-socket / per-core text table for ``measure`` results."""
        sockets = sorted(set(self._package_topology.values()))
        table = [["", "Cores Power", "Package Power"]]
        for socket in sockets:
            socket_total_cores_power = 0
            socket_package_power = 0
            # The socket row is added first and completed after its cores
            # have been summed.
            socket_power_entry = ["SOCKET {: 2}:".format(socket)]
            table.append(socket_power_entry)
            for core in self._cores:
                if self._package_topology[core] != socket:
                    continue
                socket_total_cores_power += core_power[core]
                socket_package_power = package_power[core]
                table.append([
                    "    CORE {: 2}:".format(core // 2 if self._is_smt else core),
                    core_power[core],
                    ""
                ])
            socket_power_entry.append(socket_total_cores_power)
            socket_power_entry.append(socket_package_power)
        return self._format_table(table, (16, 16, 16), ("", "W", "W"))
if __name__ == "__main__":
    # Command-line entry point: take one blocking measurement and print it
    # as a table.
    parser = argparse.ArgumentParser(description='Measure power consumption for AMD Ryzen CPU')
    parser.add_argument("--debug", action='store_true', help="show debug messages")
    parser.add_argument("-d", "--duration", type=float, default=0.5,
                        help="the duration of measurement in seconds, default is 0.5 second")
    args = parser.parse_args()
    if args.debug:
        stream_handler = logging.StreamHandler()
        logger.addHandler(stream_handler)
        logger.setLevel(logging.DEBUG)

    # Use one instance for both measuring and formatting; constructing a
    # second RyzenPower only to format re-read the MSRs and CPU topology.
    meter = RyzenPower(args.duration)
    package, core = meter.measure()
    print(meter._format_result(package, core))

65
sensors.py Normal file
View File

@@ -0,0 +1,65 @@
import subprocess
import json
def getSensors():
    """Run ``sensors -j -A`` and return its JSON output parsed by ``json.loads``."""
    completed = subprocess.run(['sensors', '-j', '-A'], stdout=subprocess.PIPE)
    return json.loads(completed.stdout.decode('utf-8'))
def getTemps():
    """Return ``{device: {sensor: value}}`` for temperature inputs.

    Devices whose name contains "nct" are skipped. A value is taken from any
    entry whose key contains both "temp" and "input".
    """
    temps = {}
    for device, sensors in getSensors().items():
        if "nct" in device:
            continue
        readings = {}
        temps[device] = readings
        for sensor in sensors:
            fields = sensors[sensor]
            for key in fields:
                if "temp" in key and "input" in key:
                    readings[sensor] = fields[key]
    return temps
def getVoltages():
    """Return ``{device: {sensor: volts}}`` for devices whose name contains "nct".

    A sensor counts as a voltage when it carries an ``in<N>_input`` entry.
    The decision is made on that entry key rather than on the sensor label,
    because labels are user-configurable: a label such as "Vcore" has no
    "in" in it, while an unrelated label may happen to contain "in".
    """
    def is_voltage_input(key):
        # Matches "in0_input", "in12_input", ... and nothing else.
        return (
            key.startswith("in")
            and key.endswith("_input")
            and key[2:-len("_input")].isdigit()
        )

    voltages = {}
    for device, sensors in getSensors().items():
        if "nct" not in device:
            continue
        readings = {}
        voltages[device] = readings
        for sensor, fields in sensors.items():
            if not isinstance(fields, dict):
                # Plain values (non-sensor entries) carry no readings.
                continue
            for key, value in fields.items():
                if is_voltage_input(key):
                    readings[sensor] = value
    return voltages
def getFans():
    """Return ``{device: {sensor: rpm}}`` for devices whose name contains "nct".

    A sensor counts as a fan when it carries a ``fan<N>_input`` entry. The
    decision is made on that entry key rather than on the sensor label,
    because labels are user-configurable and need not contain "fan".
    """
    def is_fan_input(key):
        # Matches "fan1_input", "fan10_input", ... and nothing else.
        return (
            key.startswith("fan")
            and key.endswith("_input")
            and key[3:-len("_input")].isdigit()
        )

    fans = {}
    for device, sensors in getSensors().items():
        if "nct" not in device:
            continue
        readings = {}
        fans[device] = readings
        for sensor, fields in sensors.items():
            if not isinstance(fields, dict):
                # Plain values (non-sensor entries) carry no readings.
                continue
            for key, value in fields.items():
                if is_fan_input(key):
                    readings[sensor] = value
    return fans
if __name__ == "__main__":
    # Print each group of readings, with an empty line between groups.
    for position, collect in enumerate((getTemps, getVoltages, getFans)):
        if position:
            print("")
        print(collect())

244
sysinfo.py Normal file
View File

@@ -0,0 +1,244 @@
import psutil
import platform
from datetime import datetime
import subprocess
import json
import socket
def getBoard():
    """Return the "Product Name" value printed by ``dmidecode -t 1``.

    Returns an empty string when no output line contains "Product Name".
    """
    completed = subprocess.run(['dmidecode', '-t', '1'], stdout=subprocess.PIPE)
    text = completed.stdout.decode('utf-8').replace("\t", "")
    for line in text.split("\n"):
        if "Product Name" not in line:
            continue
        # Text between the first and second ":" with its first character
        # (the space after the colon) dropped.
        value = line.split(":")[1]
        return value[1:]
    return ""
def getSysInfo():
    """Return hostname, kernel release, kernel version and board name as a dict."""
    uname = platform.uname()
    return {
        "hostname": uname.node,
        "kernel": uname.release,
        "version": uname.version,
        "board": getBoard(),
    }
def getDisk():
    """Return per-disk I/O counters from psutil as ``{disk: {metric: value}}``.

    A device is reported when its name contains "nvme" but no "p", or when
    its name contains no digit at all; every other device is left out.
    """
    # (output key, psutil counter attribute), in output order.
    counter_fields = (
        ("read", "read_bytes"),
        ("write", "write_bytes"),
        ("io_read", "read_count"),
        ("io_write", "write_count"),
        ("io_read_time", "read_time"),
        ("io_write_time", "write_time"),
        ("io_read_merged", "read_merged_count"),
        ("io_write_merged", "write_merged_count"),
        ("busy", "busy_time"),
    )
    disk_io_dict = {}
    for disk, counters in psutil.disk_io_counters(perdisk=True).items():
        nvme_without_p = "nvme" in disk and "p" not in disk
        has_digit = any(char.isdigit() for char in disk)
        if not nvme_without_p and has_digit:
            continue
        disk_io_dict[disk] = {
            label: getattr(counters, attribute)
            for label, attribute in counter_fields
        }
    return disk_io_dict
def getCPU():
    """Return CPU statistics gathered from psutil.

    Keys: "time_percent" (per-category CPU time percentages), "frequency"
    (current frequency keyed by position in psutil's per-CPU list), "usage"
    and "cpu_count".
    """
    time_fields = (
        "user", "nice", "system", "idle", "iowait",
        "irq", "softirq", "steal", "guest", "guest_nice",
    )
    cpu_time = psutil.cpu_times_percent()
    freq = psutil.cpu_freq(percpu=True)
    cpu_dict = {
        "time_percent": {},
        "frequency": {index: core.current for index, core in enumerate(freq)},
    }
    cpu_dict["usage"] = psutil.cpu_percent()
    cpu_dict["cpu_count"] = psutil.cpu_count()
    for field in time_fields:
        cpu_dict["time_percent"][field] = getattr(cpu_time, field)
    return cpu_dict
def getMemory():
    """Return virtual-memory and swap statistics from psutil as one flat dict.

    Virtual-memory fields keep psutil's names; swap fields are prefixed with
    "swap_" ("swap_in" / "swap_out" hold psutil's ``sin`` / ``sout``).
    """
    memory_fields = (
        "total", "available", "percent", "used", "free", "active",
        "inactive", "buffers", "cached", "shared", "slab",
    )
    # (output key, psutil swap attribute)
    swap_fields = (
        ("swap_total", "total"),
        ("swap_used", "used"),
        ("swap_free", "free"),
        ("swap_percent", "percent"),
        ("swap_in", "sin"),
        ("swap_out", "sout"),
    )
    mem = psutil.virtual_memory()
    swap = psutil.swap_memory()
    mem_dict = {}
    for field in memory_fields:
        mem_dict[field] = getattr(mem, field)
    for label, attribute in swap_fields:
        mem_dict[label] = getattr(swap, attribute)
    return mem_dict
def getPartitions():
    """Return ``{device_name: {"size", "used", "free"}}`` for mounted partitions.

    The device name is the last path component of the partition's device;
    names containing "loop" are skipped.
    """
    part_dict = {}
    for part in psutil.disk_partitions():
        name = part.device.split('/')[-1]
        if "loop" in name:
            continue
        # Query the filesystem once so the three figures come from the same
        # snapshot (previously disk_usage was called once per figure).
        usage = psutil.disk_usage(part.mountpoint)
        part_dict[name] = {
            "size": usage.total,
            "used": usage.used,
            "free": usage.free,
        }
    return part_dict
def getZFS():
    """Return per-pool figures parsed from ``zpool list -jHp``.

    Each pool maps to "state" (2 for ONLINE, 1 for DEGRADED, 0 for anything
    else), "size", "used", "free", "fragmentation" (ints) and "dedup" (float).
    """
    state_codes = {"ONLINE": 2, "DEGRADED": 1}
    completed = subprocess.run(['zpool', 'list', '-jHp'], stdout=subprocess.PIPE)
    zfs = json.loads(completed.stdout.decode('utf-8'))
    zfs_dict = {}
    for pool, details in zfs["pools"].items():
        entry = {"state": state_codes.get(details["state"], 0)}
        properties = details["properties"]
        entry["size"] = int(properties["size"]["value"])
        entry["used"] = int(properties["allocated"]["value"])
        entry["free"] = int(properties["free"]["value"])
        entry["fragmentation"] = int(properties["fragmentation"]["value"])
        entry["dedup"] = float(properties["dedupratio"]["value"])
        zfs_dict[pool] = entry
    return zfs_dict
def getUptime():
    """Return the difference between the current timestamp and psutil's boot time."""
    booted_at = psutil.boot_time()
    return datetime.now().timestamp() - booted_at
def getNetwork():
    """Return per-interface traffic counters from psutil.

    Interfaces whose name contains "fw", "lo" or "br" anywhere are left out.
    The test is a substring match, so a name such as "wlo1" is skipped too.
    """
    skipped_fragments = ("fw", "lo", "br")
    # (output key, psutil counter attribute), in output order.
    counter_fields = (
        ("rx", "bytes_recv"),
        ("tx", "bytes_sent"),
        ("err_rx", "errin"),
        ("err_tx", "errout"),
        ("drop_rx", "dropin"),
        ("drop_tx", "dropout"),
        ("packet_tx", "packets_sent"),
        ("packet_rx", "packets_recv"),
    )
    net_dict = {}
    for nic, counters in psutil.net_io_counters(pernic=True).items():
        if any(fragment in nic for fragment in skipped_fragments):
            continue
        net_dict[nic] = {
            label: getattr(counters, attribute)
            for label, attribute in counter_fields
        }
    return net_dict
def getIP():
    """Return ``{interface: address}`` for interfaces that have an AF_INET address.

    When an interface lists several AF_INET entries, the last one is kept.
    """
    addr = {}
    for interface, entries in psutil.net_if_addrs().items():
        for entry in entries:
            if entry.family != socket.AF_INET:
                continue
            addr[interface] = entry.address
    return addr
# Summarise logged-in users by parsing the text table printed by `w`.
# Returns a dict keyed by user name; each value holds "sessions" (how many
# rows were seen for that user) and "session" (a dict keyed by that running
# session number, each entry holding a "from" value).
def users():
result = subprocess.run(['w'], stdout=subprocess.PIPE)
result = result.stdout.decode('utf-8').split('\n')
# The second output line is treated as the column header; when it has no
# FROM column the command is run again with -f.
if "FROM" not in result[1]:
result = subprocess.run(['w', '-f'], stdout=subprocess.PIPE)
result = result.stdout.decode('utf-8').split('\n')
# Drop the last element of the split (presumably the empty string left by
# the trailing newline - confirm).
result.pop(-1)
header = []
users = {}
# More than two lines means there is at least one row after the header.
if len(result) > 2:
# Start at index 1: line 0 is never parsed, line 1 fills `header`, later
# lines fill `user`.
for i in range(1,len(result)):
a = result[i].split(" ")
user = []
# Splitting on single spaces yields empty strings for runs of spaces;
# those are ignored so only the actual column tokens remain.
for item in a:
if item != "":
if i == 1:
header.append(item)
else:
user.append(item)
active_user = ""
# Walk the header columns by index and read the token at the same index
# in the current row.
for item in range(0,len(header)):
try:
if "USER" in header[item]:
# Count one more session for a known user, or create the record.
if user[item] in users:
users[user[item]]["sessions"] = users[user[item]]["sessions"] + 1
else:
users[user[item]] = {}
users[user[item]]["sessions"] = 1
users[user[item]]["session"] = {}
active_user = user[item]
if "FROM" in header[item]:
# The session entry is keyed by the user's current session count.
users[active_user]["session"][users[active_user]["sessions"]] = {}
# A token with exactly three dots is stored as-is (presumably an IPv4
# address - confirm), and so is "-".
if user[item].count(".") == 3:
users[active_user]["session"][users[active_user]["sessions"]]["from"] = user[item]
else:
if user[item] == "-":
users[active_user]["session"][users[active_user]["sessions"]]["from"] = user[item]
else:
# Any other token: the previous token of the row is stored instead.
# NOTE(review): presumably compensates for a shifted column - confirm.
users[active_user]["session"][users[active_user]["sessions"]]["from"] = user[item-1]
# Any error raised for a column (for example a row with fewer tokens than
# the header, or the header line itself where `user` is empty) is ignored.
except:
pass
return users
if __name__ == "__main__":
    # Manual check: only the disk and partition collectors are printed here.
    # getSysInfo, users, getCPU, getMemory, getZFS, getUptime and getNetwork
    # can be printed the same way when needed.
    for collect in (getDisk, getPartitions):
        print(collect())

27
temps.py Normal file
View File

@@ -0,0 +1,27 @@
import subprocess
import json
def getSensors():
    """Run ``sensors -j -A``, print its raw output and return it parsed as JSON."""
    completed = subprocess.run(['sensors', '-j', '-A'], stdout=subprocess.PIPE)
    raw_output = completed.stdout.decode('utf-8')
    print(raw_output)
    return json.loads(raw_output)
def getTemps():
    """Return ``{device: {sensor: value}}`` for temperature inputs of every device.

    A value is taken from any entry whose key contains both "temp" and
    "input".
    """
    temps = {}
    for device, sensors in getSensors().items():
        readings = {}
        temps[device] = readings
        for sensor in sensors:
            fields = sensors[sensor]
            for key in fields:
                if "temp" in key and "input" in key:
                    readings[sensor] = fields[key]
    return temps
# Script entry point: collect the temperature readings once; the returned
# dict is not used here.
if __name__ == "__main__":
getTemps()