From 7d2dbf3722abf82a3e63b311bcba3346f08bb7c9 Mon Sep 17 00:00:00 2001 From: Vignesh Date: Fri, 23 Aug 2024 16:22:29 +0530 Subject: [PATCH 01/16] Added changes to capture KVM node and vm stats --- .gitignore | 1 + config/modules_config.json | 3 +- connection.py | 29 +++++++ main.py | 35 ++------- modules/kvm_monitor.py | 155 +++++++++++++++++++++++++++++++++++++ 5 files changed, 195 insertions(+), 28 deletions(-) create mode 100644 connection.py create mode 100644 modules/kvm_monitor.py diff --git a/.gitignore b/.gitignore index f0606fe..2f466f1 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ env/ __pycache__/ *.py[cod] *$py.class +.idea/ \ No newline at end of file diff --git a/config/modules_config.json b/config/modules_config.json index b21ea72..b4c2c98 100644 --- a/config/modules_config.json +++ b/config/modules_config.json @@ -3,6 +3,7 @@ {"name": "uptime", "interval_seconds": 600}, {"name": "disk", "interval_seconds": 600}, {"name": "partition", "interval_seconds": 3600}, - {"name": "sensors", "interval_seconds": 60} + {"name": "sensors", "interval_seconds": 60}, + {"name": "kvm_monitor", "interval_seconds": 5} ] } diff --git a/connection.py b/connection.py new file mode 100644 index 0000000..fb7bf01 --- /dev/null +++ b/connection.py @@ -0,0 +1,29 @@ +import os +import influxdb_client + +from dotenv import load_dotenv +from influxdb_client.client.write_api import SYNCHRONOUS + +load_dotenv() + +INFLUX_URL = os.getenv("INFLUX_URL") +INFLUX_TOKEN = os.getenv("INFLUX_TOKEN") +INFLUX_ORG = os.getenv("INFLUX_ORG") +INFLUX_BUCKET = os.getenv("INFLUX_BUCKET") + +client = influxdb_client.InfluxDBClient(url=INFLUX_URL, token=INFLUX_TOKEN, org=INFLUX_ORG) +write_api = client.write_api(write_options=SYNCHRONOUS) + + +def create_influxdb_point(measurement, data): + point = influxdb_client.Point(measurement) + for key, value in data.items(): + if key == 'host': + # hotfix for dot in hostname + point = point.tag(key, value.split('.')[0]) + if key == 'vm_name': + point = point.tag(key, value) + else: + point = point.field(key, value) + + return point diff --git a/main.py b/main.py index 254daec..49725cc 100644 --- a/main.py +++ b/main.py @@ -3,30 +3,9 @@ import json import importlib import schedule -from dotenv import load_dotenv -import influxdb_client -from influxdb_client.client.write_api import SYNCHRONOUS -load_dotenv() +from connection import create_influxdb_point, write_api, INFLUX_BUCKET, INFLUX_ORG -influx_url = os.getenv("INFLUX_URL") -influx_token = os.getenv("INFLUX_TOKEN") -influx_org = os.getenv("INFLUX_ORG") -influx_bucket = os.getenv("INFLUX_BUCKET") - -client = influxdb_client.InfluxDBClient(url=influx_url, token=influx_token, org=influx_org) -write_api = client.write_api(write_options=SYNCHRONOUS) - -def create_influxdb_point(measurement, data): - point = influxdb_client.Point(measurement) - for key, value in data.items(): - if key == 'host': - # hotfix for dot in hostname - point = point.tag(key, value.split('.')[0]) - else: - point = point.field(key, value) - - return point def load_config(): with open('config/modules_config.json', 'r') as f: @@ -38,19 +17,23 @@ def run_module(module_name): module = importlib.import_module(f"modules.{module_name}") data = module.collect_data() + if module_name == 'kvm_monitor': + return + if isinstance(data, list): # If collect data return multiple records for record in data: point = create_influxdb_point(module_name, record) - write_api.write(bucket=influx_bucket, org=influx_org, record=point) + write_api.write(bucket=INFLUX_BUCKET, org=INFLUX_ORG, record=point) print(f"writing record for {module_name} finished.") else: point = create_influxdb_point(module_name, data) - write_api.write(bucket=influx_bucket, org=influx_org, record=point) + write_api.write(bucket=INFLUX_BUCKET, org=INFLUX_ORG, record=point) print(f"writing record for {module_name} finished.") -def main(): + +if __name__ == '__main__': modules_config = load_config() for module_config in modules_config: module_name = module_config["name"] @@ -62,5 +45,3 @@ def main(): while True: schedule.run_pending() time.sleep(1) - -main() diff --git a/modules/kvm_monitor.py b/modules/kvm_monitor.py new file mode 100644 index 0000000..cf6feba --- /dev/null +++ b/modules/kvm_monitor.py @@ -0,0 +1,155 @@ +import os +import subprocess +from threading import Thread +import inotify.adapters + +import ujson + +from connection import create_influxdb_point, write_api, INFLUX_BUCKET, INFLUX_ORG + + +def get_vms_with_state(): + try: + output_lines = subprocess.check_output(['virsh', 'list', '--all']).decode('utf-8').strip().split('\n') + vm_stats = [] + keys = output_lines[0].split(' ') + keys = [key.lower().strip() for key in keys] + for line in output_lines[2:]: + values = line.split(' ') + vm_stat = dict() + vm_stat[keys[0]] = values[0].strip() + vm_stat[keys[1]] = values[1].strip() + vm_stat[keys[2]] = values[2].strip() + vm_stats.append(vm_stat) + return vm_stats + except Exception as e: + return [] + + +def get_kvm_stats(): + try: + output = subprocess.check_output( + ['sudo', '-S', 'kvmtop', '--cpu', '--mem', '--disk', '--net', '--io', '--host', '--verbose', + '--printer=json', '--runs=1'], input=f'{os.getenv("ROOT_PASS")}\n', text=True) + if output and output.startswith('{ "'): + data = ujson.loads(output) + return data + except Exception as e: + print(str(e)) + return {} + + +def sync_data_to_influx_db(data): + try: + point = create_influxdb_point('kvm_stats', data) + write_api.write(bucket=INFLUX_BUCKET, org=INFLUX_ORG, record=point) + print(f"writing record for kvm_stats finished.") + except Exception as e: + return + + +def group_data_points(key_prefix, source_data): + return {key: source_data[key] for key in source_data if key.startswith(key_prefix)} + + +def filter_and_group_host_stats(hostname, host_uuid, data): + try: + to_return = {} + host_key_groups = {"cpu_": "cpustat", "ram_": "memory", "disk_": "disk", "net_": "nics", "psi_": "psistat"} + for key in host_key_groups: + data_group = group_data_points(key_prefix=key, source_data=data) + if data_group: + data_group.update({"host": hostname, "host_uuid": host_uuid}) + to_return[host_key_groups[key]] = data_group + return to_return + except Exception as e: + print(e) + return {} + + +def filter_and_group_vm_stats(hostname, host_uuid, data): + try: + to_return = {} + vm_name = data.get('name', "") + vm_id = data.get('UUID', "") + host_key_groups = {"cpu_": "cpustat", "ram_": "memory", "disk_": "disk", "net_": "nics", "io_": "iostat", } + for key in host_key_groups.keys(): + data_group = group_data_points(key_prefix=key, source_data=data) + if key == 'cpu_': + data_group.update({'state': data.get('state')}) + if data_group: + data_group.update({"host": hostname, "host_uuid": host_uuid, "vm_name": vm_name, "vm_id": vm_id}) + to_return[f"vm_{host_key_groups[key]}"] = data_group + return to_return + except Exception as e: + print(e) + return {} + + +def send_data_to_influxdb(data): + for key, value in data.items(): + if value: + point = create_influxdb_point(key, value) + write_api.write(bucket=INFLUX_BUCKET, org=INFLUX_ORG, record=point) + print(f"writing record for {key} finished.") + else: + print('No data found') + + +def send_data(log): + try: + vms_list = get_vms_with_state() + domains = log.get('domains', []) + if not domains: + for vm in vms_list: + vm_stat = dict() + vm_stat['name'] = vm.get('name') + vm_stat['state'] = vm.get('state') + domains.append(vm_stat) + else: + for domain in domains: + domain.update({'state': 'running'}) + hostname = log.get("host", {}).get("host_name") + host_uuid = log.get("host", {}).get("host_uuid") + host_data = filter_and_group_host_stats(hostname, host_uuid, data=log.get('host')) + send_data_to_influxdb(host_data) + for vm_stats in domains: + if vm_stats.get('state') == 'running': + send_data_to_influxdb(filter_and_group_vm_stats(hostname, host_uuid, data=vm_stats)) + return True + except Exception as e: + print(str(e)) + return + + +def collect_data_continuously(): + log_file = "/home/vignesh/dev/kvmtop.logs" + i = inotify.adapters.Inotify() + i.add_watch(log_file) + + try: + for event in i.event_gen(yield_nones=False): + (_, type_names, path, filename) = event + + if "IN_MODIFY" in type_names: + with open(log_file, 'r') as file: + lines = file.readlines() + last_line = lines[-1].strip() + t1 = Thread(target=send_data, args=(last_line,)) + t1.run() + + finally: + i.remove_watch(log_file) + + +def collect_data(): + try: + data = get_kvm_stats() + return send_data(data) + except Exception as e: + print(e) + return False + + +if __name__ == "__main__": + collect_data() From 0f658d06e0385f925ab5805acff72ae86199eabb Mon Sep 17 00:00:00 2001 From: Vignesh Date: Mon, 26 Aug 2024 23:55:19 +0530 Subject: [PATCH 02/16] - Handled the exceptions with all modules. - Integrated libvirt to capture VM metrics --- main.py | 20 ++--- modules/disk.py | 21 +++-- modules/kvm_monitor.py | 179 ++++++++++++++++++++++++++++++++++++++++- modules/partition.py | 8 +- modules/sensors.py | 7 +- modules/uptime.py | 17 ++-- 6 files changed, 223 insertions(+), 29 deletions(-) diff --git a/main.py b/main.py index 49725cc..0640876 100644 --- a/main.py +++ b/main.py @@ -19,19 +19,19 @@ def run_module(module_name): if module_name == 'kvm_monitor': return - - if isinstance(data, list): - # If collect data return multiple records - for record in data: - point = create_influxdb_point(module_name, record) + if data: + if isinstance(data, list): + # If collect data return multiple records + for record in data: + point = create_influxdb_point(module_name, record) + write_api.write(bucket=INFLUX_BUCKET, org=INFLUX_ORG, record=point) + print(f"writing record for {module_name} finished.") + + else: + point = create_influxdb_point(module_name, data) write_api.write(bucket=INFLUX_BUCKET, org=INFLUX_ORG, record=point) print(f"writing record for {module_name} finished.") - else: - point = create_influxdb_point(module_name, data) - write_api.write(bucket=INFLUX_BUCKET, org=INFLUX_ORG, record=point) - print(f"writing record for {module_name} finished.") - if __name__ == '__main__': modules_config = load_config() diff --git a/modules/disk.py b/modules/disk.py index 3ffec00..f55cce5 100644 --- a/modules/disk.py +++ b/modules/disk.py @@ -57,17 +57,22 @@ def get_smartctl_data(disk_path): "media_and_data_integrity_errors": int(media_and_data_integrity_errors) } + def collect_data(): - if os.getenv("STORAGE_SERVER"): - disks = get_nvme_disk_names() - disk_data = [] - for disk in disks: - disk_data.append(get_smartctl_data(f"/dev/{disk}")) + try: + if os.getenv("STORAGE_SERVER"): + disks = get_nvme_disk_names() + disk_data = [] + for disk in disks: + disk_data.append(get_smartctl_data(f"/dev/{disk}")) - return disk_data + return disk_data - disk_path = os.getenv("DISK_PATH") - return get_smartctl_data(disk_path) + disk_path = os.getenv("DISK_PATH") + return get_smartctl_data(disk_path) + except Exception as e: + print(str(e)) + return {} if __name__=="__main__": diff --git a/modules/kvm_monitor.py b/modules/kvm_monitor.py index cf6feba..14eb717 100644 --- a/modules/kvm_monitor.py +++ b/modules/kvm_monitor.py @@ -1,13 +1,28 @@ import os import subprocess from threading import Thread +from xml.etree import ElementTree + import inotify.adapters +import libvirt import ujson +import time from connection import create_influxdb_point, write_api, INFLUX_BUCKET, INFLUX_ORG +VM_STATE_DEFINITION = { + libvirt.VIR_DOMAIN_NOSTATE: "no_state", + libvirt.VIR_DOMAIN_RUNNING: "running", + libvirt.VIR_DOMAIN_BLOCKED: "blocked", + libvirt.VIR_DOMAIN_PAUSED: "paused", + libvirt.VIR_DOMAIN_SHUTDOWN: "shutdown", + libvirt.VIR_DOMAIN_SHUTOFF: "shutoff", + libvirt.VIR_DOMAIN_CRASHED: "crashed", + libvirt.VIR_DOMAIN_PMSUSPENDED: "suspended_by_power_manager" +} + def get_vms_with_state(): try: output_lines = subprocess.check_output(['virsh', 'list', '--all']).decode('utf-8').strip().split('\n') @@ -29,7 +44,7 @@ def get_vms_with_state(): def get_kvm_stats(): try: output = subprocess.check_output( - ['sudo', '-S', 'kvmtop', '--cpu', '--mem', '--disk', '--net', '--io', '--host', '--verbose', + ['sudo', '-S', 'kvmtop', '--cpu', '--mem', '--disk', '--net', '--io', '--host', '--printer=json', '--runs=1'], input=f'{os.getenv("ROOT_PASS")}\n', text=True) if output and output.startswith('{ "'): data = ujson.loads(output) @@ -77,6 +92,9 @@ def filter_and_group_vm_stats(hostname, host_uuid, data): data_group = group_data_points(key_prefix=key, source_data=data) if key == 'cpu_': data_group.update({'state': data.get('state')}) + if host_key_groups[key] == "memory": + for k, v in data_group.items(): + data_group[k] = round(v / (1024 * 1024), 2) if data_group: data_group.update({"host": hostname, "host_uuid": host_uuid, "vm_name": vm_name, "vm_id": vm_id}) to_return[f"vm_{host_key_groups[key]}"] = data_group @@ -96,6 +114,27 @@ def send_data_to_influxdb(data): print('No data found') +def merge_lists_of_dicts(list1, list2, key): + # Create a dictionary to hold merged results + merged_dict = {} + + # Add dictionaries from the first list to the merged_dict + for item in list1: + merged_dict[item[key]] = item + + # Update the merged_dict with dictionaries from the second list + for item in list2: + if item[key] in merged_dict: + # If the key exists, merge the dictionaries + merged_dict[item[key]].update(item) + else: + # If the key does not exist, add the new item + merged_dict[item[key]] = item + + # Convert merged_dict back to a list + return list(merged_dict.values()) + + def send_data(log): try: vms_list = get_vms_with_state() @@ -109,11 +148,15 @@ def send_data(log): else: for domain in domains: domain.update({'state': 'running'}) + hostname = log.get("host", {}).get("host_name") host_uuid = log.get("host", {}).get("host_uuid") + host, vms = get_vms_and_host_stats() + log['host'].update(host) host_data = filter_and_group_host_stats(hostname, host_uuid, data=log.get('host')) send_data_to_influxdb(host_data) - for vm_stats in domains: + combined_vms = merge_lists_of_dicts(domains, vms, 'name') + for vm_stats in combined_vms: if vm_stats.get('state') == 'running': send_data_to_influxdb(filter_and_group_vm_stats(hostname, host_uuid, data=vm_stats)) return True @@ -142,6 +185,138 @@ def collect_data_continuously(): i.remove_watch(log_file) +def get_vm_last_known_cpu_time(vm_name): + try: + with open(f'./{vm_name}.dat', 'r') as f: + time_and_cpu_time = f.read() + if time_and_cpu_time: + timestamp, cpu_time = [float(data) for data in time_and_cpu_time.split(',')] + return timestamp, cpu_time + return time.time(), 0.0 + except FileNotFoundError: + return time.time(), 0.0 + + +def set_vm_last_known_cpu_time(vm_name, cpu_time, timestamp): + try: + with open(f'./{vm_name}.dat', 'w') as f: + return f.write(f"{timestamp}, {cpu_time}") + except FileNotFoundError: + return 0 + + +def get_cpu_usage_percentage(vm): + last_timestamp, prev_cpu_time = get_vm_last_known_cpu_time(vm.name()) + cpu_stats = vm.getCPUStats(True)[0] + user_time = cpu_stats['user_time'] / 1000000000 # Convert from nanoseconds to seconds + system_time = cpu_stats['system_time'] / 1000000000 + total_cpu_time = user_time + system_time + + # Calculate the CPU time used since the last measurement + cpu_time_used = total_cpu_time - prev_cpu_time + + # Get the number of virtual CPUs + v_cpus = vm.vcpus()[0] + no_v_cpus = len(v_cpus) + + # Calculate CPU usage percentage + current_time = time.time() + duration = round(current_time - last_timestamp) + cpu_usage_percentage = (cpu_time_used / (duration * no_v_cpus)) * 100 # 1 second interval + + set_vm_last_known_cpu_time(vm.name(), total_cpu_time, current_time) + + return cpu_usage_percentage + + +def get_vms_and_host_stats(): + conn = libvirt.open("qemu:///system") + try: + stats = conn.getInfo() + host_information = { + "cpu_model": stats[0], + "ram_total": stats[1]/1024, + "cpu_cores": stats[2], + "cpu_max_freq": stats[3], + "cpu_numa_nodes": stats[4], + "cpu_sockets_per_node": stats[5], + "cpu_cores_per_socket": stats[6], + "cpu_max_threads_per_core": stats[7] + } + vms = conn.listAllDomains() + vm_stats = [] + if vms: + for vm in vms: + state, max_mem, mem, no_of_cpu, cpu_time = vm.info() + stats = { + "cpu_cores": no_of_cpu, + "cpu_time": cpu_time / 1000000000, + "state": VM_STATE_DEFINITION.get(state), + "name": vm.name(), + "ram_max": max_mem, + "ram_actual": mem, + } + if state == libvirt.VIR_DOMAIN_RUNNING: + cpu_usage_percentage = get_cpu_usage_percentage(vm) + stats["cpu_usage"] = cpu_usage_percentage + mem_stat = vm.memoryStats() + for key, value in mem_stat.items(): + stats.update({ + f"ram_{key}": value + }) + + tree = ElementTree.fromstring(vm.XMLDesc()) + disks = [path.get('file', '') for path in tree.findall("devices/disk/source")] + total_read_bytes, total_write_bytes = 0, 0 + total_read_req, total_write_req, total_no_errors = 0, 0, 0 + for disk in disks: + (rd_req, rd_bytes, wr_req, wr_bytes, err) = vm.blockStats(disk) + # stats.update({ + # "disk_id": disk, + # 'disk_no_read_req': rd_req, + # 'disk_read_bytes': rd_bytes, + # 'disk_no_write_req': wr_req, + # 'disk_write_bytes': wr_bytes, + # 'disk_no_errors': err, + # }) + total_read_bytes += rd_bytes + total_write_bytes += wr_bytes + total_read_req += rd_req + total_write_req += wr_req + total_no_errors += total_no_errors + stats.update({ + 'io_read_bytes': total_read_bytes, + 'io_write_bytes': total_write_bytes, + 'io_read_req': total_read_req, + 'io_write_req': total_write_req, + 'io_no_errors': total_no_errors, + }) + + interface = tree.find("devices/interface/target").get("dev") + net_stats = vm.interfaceStats(interface) + stats.update({ + 'net_read_bytes': net_stats[0], + 'net_read_packets': net_stats[1], + 'net_read_errors': net_stats[2], + 'net_read_drops': net_stats[3], + 'net_write_bytes': net_stats[4], + 'net_write_packets': net_stats[5], + 'net_write_errors': net_stats[6], + 'net_write_drops': net_stats[7], + }) + + vm_stats.append(stats) + + return host_information, vm_stats + + except Exception as e: + print(str(e)) + finally: + conn.close() + + + + def collect_data(): try: data = get_kvm_stats() diff --git a/modules/partition.py b/modules/partition.py index 16a972e..96aceeb 100644 --- a/modules/partition.py +++ b/modules/partition.py @@ -44,8 +44,12 @@ def get_disk_usage_for_mount_points(mnt_directory='/mnt'): def collect_data(): - mnt_directory = '/mnt' - return get_disk_usage_for_mount_points(mnt_directory) + try: + mnt_directory = '/mnt' + return get_disk_usage_for_mount_points(mnt_directory) + except Exception as e: + print(str(e)) + return {} if __name__ == "__main__": diff --git a/modules/sensors.py b/modules/sensors.py index f8b769d..5d508c0 100644 --- a/modules/sensors.py +++ b/modules/sensors.py @@ -23,8 +23,13 @@ def get_sensor_data(): "cpu tccd2": sensor_data.get("k10temp-pci-00c3", {}).get("Tccd2", {}).get("temp4_input"), } + def collect_data(): - return get_sensor_data() + try: + return get_sensor_data() + except Exception as e: + print(str(e)) + return {} if __name__=="__main__": diff --git a/modules/uptime.py b/modules/uptime.py index 3629da0..b129b91 100644 --- a/modules/uptime.py +++ b/modules/uptime.py @@ -23,12 +23,17 @@ def get_system_uptime_seconds(): return None def collect_data(): - hostname = socket.gethostname() - uptime_seconds = get_system_uptime_seconds() - return { - "host": hostname, - "uptime": uptime_seconds - } + try: + hostname = socket.gethostname() + uptime_seconds = get_system_uptime_seconds() + return { + "host": hostname, + "uptime": uptime_seconds + } + except Exception as e: + print(str(e)) + return {} + if __name__=="__main__": print(collect_data()) From cf67b15bf89e862cb96ea4428a67152c023ac9a7 Mon Sep 17 00:00:00 2001 From: Vignesh Date: Mon, 26 Aug 2024 23:55:46 +0530 Subject: [PATCH 03/16] - Handled the exceptions with all modules. - Integrated libvirt to capture VM metrics --- modules/kvm_monitor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/kvm_monitor.py b/modules/kvm_monitor.py index 14eb717..55059fd 100644 --- a/modules/kvm_monitor.py +++ b/modules/kvm_monitor.py @@ -74,6 +74,9 @@ def filter_and_group_host_stats(hostname, host_uuid, data): for key in host_key_groups: data_group = group_data_points(key_prefix=key, source_data=data) if data_group: + if host_key_groups[key] == "memory": + for k, v in data_group.items(): + data_group[k] = round(v / (1024*1024), 2) data_group.update({"host": hostname, "host_uuid": host_uuid}) to_return[host_key_groups[key]] = data_group return to_return From 0ed85072d9115b63d2baf777e94520515bb01b28 Mon Sep 17 00:00:00 2001 From: Vignesh Date: Tue, 24 Sep 2024 18:15:10 +0530 Subject: [PATCH 04/16] - Handled the exceptions with disk status capturing. - Updated requirements.txt --- modules/disk.py | 29 ++++++++++++++++++++++++----- requirements.txt | 1 + 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/modules/disk.py b/modules/disk.py index f55cce5..7a143f7 100644 --- a/modules/disk.py +++ b/modules/disk.py @@ -1,7 +1,9 @@ import os import socket import subprocess -import json + +import ujson + def get_nvme_disk_names(): try: @@ -16,16 +18,33 @@ def get_nvme_disk_names(): return [] +def get_disk_stats(disk_path): + try: + output = subprocess.check_output( + args=['sudo', '-S', 'smartctl', '-j', '-a', disk_path], + input=f'{os.getenv("ROOT_PASS")}\n', + universal_newlines=True + ) + if output and output.startswith('{ "'): + data = ujson.loads(output) + return data + except subprocess.CalledProcessError as e: + print(f"Command failed with exit status {e.returncode}") + if e.output and e.output.startswith('{\n'): + data = ujson.loads(e.output) + return data + except Exception as e: + print(str(e)) + return {} + + def get_smartctl_data(disk_path): if disk_path is None: print("DISK_PATH environment variable is not set.") return None - command_output = subprocess.check_output(['smartctl', '-j', '-a', disk_path]).decode('utf-8') - - # Parse JSON output - smart_data = json.loads(command_output) + smart_data = get_disk_stats(disk_path) # Extract relevant values from the JSON data hostname = socket.gethostname() diff --git a/requirements.txt b/requirements.txt index 546dc0e..42031a8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,4 @@ schedule==1.2.1 six==1.16.0 typing_extensions==4.8.0 urllib3==2.1.0 +libvirt-python==10.7.0 From 3efb5f26db0dcf431638b5d47174f96ca0732583 Mon Sep 17 00:00:00 2001 From: Vignesh Date: Tue, 24 Sep 2024 20:16:07 +0530 Subject: [PATCH 05/16] - Enabled Host Network I/O monitoring --- config/modules_config.json | 3 +- modules/kvm_monitor.py | 2 +- modules/network.py | 89 ++++++++++++++++++++++++++++---------- 3 files changed, 68 insertions(+), 26 deletions(-) diff --git a/config/modules_config.json b/config/modules_config.json index b4c2c98..a4a684d 100644 --- a/config/modules_config.json +++ b/config/modules_config.json @@ -4,6 +4,7 @@ {"name": "disk", "interval_seconds": 600}, {"name": "partition", "interval_seconds": 3600}, {"name": "sensors", "interval_seconds": 60}, - {"name": "kvm_monitor", "interval_seconds": 5} + {"name": "kvm_monitor", "interval_seconds": 5}, + {"name": "network", "interval_seconds": 5} ] } diff --git a/modules/kvm_monitor.py b/modules/kvm_monitor.py index 55059fd..ba0e58a 100644 --- a/modules/kvm_monitor.py +++ b/modules/kvm_monitor.py @@ -114,7 +114,7 @@ def send_data_to_influxdb(data): write_api.write(bucket=INFLUX_BUCKET, org=INFLUX_ORG, record=point) print(f"writing record for {key} finished.") else: - print('No data found') + print('Uploaded all data points from kvm_monitor') def merge_lists_of_dicts(list1, list2, key): diff --git a/modules/network.py b/modules/network.py index d3cc65c..9119c67 100644 --- a/modules/network.py +++ b/modules/network.py @@ -1,33 +1,74 @@ -import psutil -from time import sleep +# import psutil +# from time import sleep +# +# def get_network_data(interface): +# stats_before = psutil.net_io_counters(pernic=True)[interface] +# sleep(5) # Adjust the sleep duration as needed +# stats_after = psutil.net_io_counters(pernic=True)[interface] +# +# receive_bytes = stats_after.bytes_recv - stats_before.bytes_recv +# transmit_bytes = stats_after.bytes_sent - stats_before.bytes_sent +# +# return receive_bytes, transmit_bytes +# +# def main(): +# network_interface = "enp6s0" +# interval_seconds = 5 +# +# try: +# while True: +# receive_bytes, transmit_bytes = get_network_data(network_interface) +# +# print(f"Network Interface: {network_interface}") +# print(f"Received Data: {receive_bytes} bytes") +# print(f"Transmitted Data: {transmit_bytes} bytes") +# print("=" * 30) +# +# sleep(interval_seconds) +# +# except KeyboardInterrupt: +# print("Script terminated by user.") +import socket +import subprocess -def get_network_data(interface): - stats_before = psutil.net_io_counters(pernic=True)[interface] - sleep(5) # Adjust the sleep duration as needed - stats_after = psutil.net_io_counters(pernic=True)[interface] - receive_bytes = stats_after.bytes_recv - stats_before.bytes_recv - transmit_bytes = stats_after.bytes_sent - stats_before.bytes_sent +def get_network_io() -> dict: + try: + # Read the contents of /proc/net/dev + result = subprocess.run(['cat', '/proc/net/dev'], capture_output=True, text=True, check=True) - return receive_bytes, transmit_bytes + # Split the output into lines + lines = result.stdout.splitlines() -def main(): - network_interface = "enp6s0" - interval_seconds = 5 + # Initialize a dictionary to hold network I/O stats + network_io = {} - try: - while True: - receive_bytes, transmit_bytes = get_network_data(network_interface) + # Process each line + for line in lines[2:]: # Skip the first two header lines + parts = line.split(':') + if len(parts) > 1: + interface = parts[0].strip() + stats = parts[1].strip().split() + # Assuming stats[0] is bytes received and stats[8] is bytes transmitted + bytes_received = int(stats[0]) + bytes_transmitted = int(stats[8]) + network_io.update({ + f'{interface}_received': bytes_received, + f'{interface}_transmitted': bytes_transmitted + }) - print(f"Network Interface: {network_interface}") - print(f"Received Data: {receive_bytes} bytes") - print(f"Transmitted Data: {transmit_bytes} bytes") - print("=" * 30) + return network_io - sleep(interval_seconds) + except subprocess.CalledProcessError as e: + print(f"Error occurred: {e}") + return {} - except KeyboardInterrupt: - print("Script terminated by user.") -if __name__ == "__main__": - main() +def collect_data(): + try: + net_stats = get_network_io() + net_stats.update({"host": socket.gethostname()}) + return net_stats + except Exception as e: + print(str(e)) + return {} From 88e7c94edac26088d67b30812e09ce427eafb2a6 Mon Sep 17 00:00:00 2001 From: Vignesh Date: Wed, 25 Sep 2024 12:45:21 +0530 Subject: [PATCH 06/16] - Fix issue with the cpu usage capturing and Network IO usage of host machine. - Modify the logic to capture all data points in same interval and with global MONITORING_INTERVAL --- config/modules_config.json | 12 +++--- main.py | 5 ++- modules/__init__.py | 1 + modules/kvm_monitor.py | 5 ++- modules/network.py | 87 ++++++++------------------------------ 5 files changed, 32 insertions(+), 78 deletions(-) diff --git a/config/modules_config.json b/config/modules_config.json index a4a684d..2bc1a06 100644 --- a/config/modules_config.json +++ b/config/modules_config.json @@ -1,10 +1,10 @@ { "modules": [ - {"name": "uptime", "interval_seconds": 600}, - {"name": "disk", "interval_seconds": 600}, - {"name": "partition", "interval_seconds": 3600}, - {"name": "sensors", "interval_seconds": 60}, - {"name": "kvm_monitor", "interval_seconds": 5}, - {"name": "network", "interval_seconds": 5} + {"name": "uptime", "interval_seconds": 30}, + {"name": "disk", "interval_seconds": 30}, + {"name": "partition", "interval_seconds": 30}, + {"name": "sensors", "interval_seconds": 30}, + {"name": "kvm_monitor", "interval_seconds": 30}, + {"name": "network", "interval_seconds": 30} ] } diff --git a/main.py b/main.py index 0640876..2ecbcb5 100644 --- a/main.py +++ b/main.py @@ -5,6 +5,7 @@ import schedule from connection import create_influxdb_point, write_api, INFLUX_BUCKET, INFLUX_ORG +from modules import MONITORING_INTERVAL def load_config(): @@ -37,9 +38,9 @@ def run_module(module_name): modules_config = load_config() for module_config in modules_config: module_name = module_config["name"] - interval_seconds = module_config.get("interval_seconds", False) + # interval_seconds = module_config.get("interval_seconds", False) - schedule.every(interval_seconds).seconds.do( + schedule.every(MONITORING_INTERVAL).seconds.do( run_module, module_name=module_name) while True: diff --git a/modules/__init__.py b/modules/__init__.py index e69de29..901ce47 100644 --- a/modules/__init__.py +++ b/modules/__init__.py @@ -0,0 +1 @@ +MONITORING_INTERVAL = 30 \ No newline at end of file diff --git a/modules/kvm_monitor.py b/modules/kvm_monitor.py index ba0e58a..5b28ada 100644 --- a/modules/kvm_monitor.py +++ b/modules/kvm_monitor.py @@ -5,12 +5,13 @@ import inotify.adapters import libvirt +import psutil import ujson import time from connection import create_influxdb_point, write_api, INFLUX_BUCKET, INFLUX_ORG - +from modules import MONITORING_INTERVAL VM_STATE_DEFINITION = { libvirt.VIR_DOMAIN_NOSTATE: "no_state", @@ -77,6 +78,8 @@ def filter_and_group_host_stats(hostname, host_uuid, data): if host_key_groups[key] == "memory": for k, v in data_group.items(): data_group[k] = round(v / (1024*1024), 2) + if host_key_groups[key] == "cpustat": + data_group['cpu_usage'] = psutil.cpu_percent(interval=MONITORING_INTERVAL) data_group.update({"host": hostname, "host_uuid": host_uuid}) to_return[host_key_groups[key]] = data_group return to_return diff --git a/modules/network.py b/modules/network.py index 9119c67..f7840fd 100644 --- a/modules/network.py +++ b/modules/network.py @@ -1,74 +1,23 @@ -# import psutil -# from time import sleep -# -# def get_network_data(interface): -# stats_before = psutil.net_io_counters(pernic=True)[interface] -# sleep(5) # Adjust the sleep duration as needed -# stats_after = psutil.net_io_counters(pernic=True)[interface] -# -# receive_bytes = stats_after.bytes_recv - stats_before.bytes_recv -# transmit_bytes = stats_after.bytes_sent - stats_before.bytes_sent -# -# return receive_bytes, transmit_bytes -# -# def main(): -# network_interface = "enp6s0" -# interval_seconds = 5 -# -# try: -# while True: -# receive_bytes, transmit_bytes = get_network_data(network_interface) -# -# print(f"Network Interface: {network_interface}") -# print(f"Received Data: {receive_bytes} bytes") -# print(f"Transmitted Data: {transmit_bytes} bytes") -# print("=" * 30) -# -# sleep(interval_seconds) -# -# except KeyboardInterrupt: -# print("Script terminated by user.") import socket -import subprocess +import time +import psutil -def get_network_io() -> dict: - try: - # Read the contents of /proc/net/dev - result = subprocess.run(['cat', '/proc/net/dev'], capture_output=True, text=True, check=True) - - # Split the output into lines - lines = result.stdout.splitlines() - - # Initialize a dictionary to hold network I/O stats - network_io = {} - - # Process each line - for line in lines[2:]: # Skip the first two header lines - parts = line.split(':') - if len(parts) > 1: - interface = parts[0].strip() - stats = parts[1].strip().split() - # Assuming stats[0] is bytes received and stats[8] is bytes transmitted - bytes_received = int(stats[0]) - bytes_transmitted = int(stats[8]) - network_io.update({ - f'{interface}_received': bytes_received, - f'{interface}_transmitted': bytes_transmitted - }) - - return network_io - - except subprocess.CalledProcessError as e: - print(f"Error occurred: {e}") - return {} - +io, last_captured_time = psutil.net_io_counters(pernic=True), time.time() def collect_data(): - try: - net_stats = get_network_io() - net_stats.update({"host": socket.gethostname()}) - return net_stats - except Exception as e: - print(str(e)) - return {} + global io, last_captured_time + io_2, recent_captured_time = psutil.net_io_counters(pernic=True), time.time() + delay = recent_captured_time - last_captured_time + data = {} + for iface, iface_io in io.items(): + upload_speed, download_speed = io_2[iface].bytes_sent - iface_io.bytes_sent, io_2[iface].bytes_recv - iface_io.bytes_recv + data.update({ + f"{iface}_download": io_2[iface].bytes_recv, + f"{iface}_upload": io_2[iface].bytes_sent, + f"{iface}_upload_speed": round(upload_speed / delay) , + f"{iface}_download_speed": round(download_speed / delay), + }) + io, last_captured_time = io_2, recent_captured_time + data.update({"host": socket.gethostname()}) + return data From 47f6eb38073a8a6c670c9e12ff7232675c71dda5 Mon Sep 17 00:00:00 2001 From: Vignesh Date: Wed, 25 Sep 2024 17:49:26 +0530 Subject: [PATCH 07/16] - Fix issue with uptime parsing --- modules/uptime.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/modules/uptime.py b/modules/uptime.py index b129b91..55025c9 100644 --- a/modules/uptime.py +++ b/modules/uptime.py @@ -4,23 +4,14 @@ def get_system_uptime_seconds(): try: - uptime_output = subprocess.check_output(['uptime']).decode('utf-8').strip() - uptime_match = re.search(r"up(?:\s+)?((\d+) days?,)?(?:\s+)?(\d+):(\d+)", uptime_output) - - if uptime_match: - # Corrected line: Convert numeric day value to integer - days = int(uptime_match.group(2)) if uptime_match.group(2) else 0 - hours, minutes = map(int, uptime_match.group(3, 4)) - - uptime_seconds = days * 24 * 60 * 60 + hours * 60 * 60 + minutes * 60 - - return uptime_seconds - else: - raise ValueError("Unable to parse uptime information") + with open('/proc/uptime', 'r') as f: + # Read the first line from /proc/uptime + uptime_seconds = float(f.readline().split()[0]) + return int(uptime_seconds % 60) except Exception as e: - print(f"Error: {e}") - return None + print(f"Error reading uptime: {e}") + return 0 def collect_data(): try: From 30d30f44fc28ca7a3679f54362d6bb220343aa47 Mon Sep 17 00:00:00 2001 From: Vignesh Date: Thu, 26 Sep 2024 18:36:09 +0530 Subject: [PATCH 08/16] - Fix issue with VM cpu usage calculation on the kvm_monitor.py and added proper traceback to all exceptions --- modules/kvm_monitor.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/modules/kvm_monitor.py b/modules/kvm_monitor.py index 5b28ada..5b94e92 100644 --- a/modules/kvm_monitor.py +++ b/modules/kvm_monitor.py @@ -9,6 +9,7 @@ import ujson import time +import traceback from connection import create_influxdb_point, write_api, INFLUX_BUCKET, INFLUX_ORG from modules import MONITORING_INTERVAL @@ -39,6 +40,7 @@ def get_vms_with_state(): vm_stats.append(vm_stat) return vm_stats except Exception as e: + print(traceback.format_exc()) return [] @@ -51,7 +53,7 @@ def get_kvm_stats(): data = ujson.loads(output) return data except Exception as e: - print(str(e)) + print(traceback.format_exc()) return {} @@ -61,6 +63,7 @@ def sync_data_to_influx_db(data): write_api.write(bucket=INFLUX_BUCKET, org=INFLUX_ORG, record=point) print(f"writing record for kvm_stats finished.") except Exception as e: + print(traceback.format_exc()) return @@ -84,7 +87,7 @@ def filter_and_group_host_stats(hostname, host_uuid, data): to_return[host_key_groups[key]] = data_group return to_return except Exception as e: - print(e) + print(traceback.format_exc()) return {} @@ -106,7 +109,7 @@ def filter_and_group_vm_stats(hostname, host_uuid, data): to_return[f"vm_{host_key_groups[key]}"] = data_group return to_return except Exception as e: - print(e) + print(traceback.format_exc()) return {} @@ -167,7 +170,7 @@ def send_data(log): send_data_to_influxdb(filter_and_group_vm_stats(hostname, host_uuid, data=vm_stats)) return True except Exception as e: - print(str(e)) + print(traceback.format_exc()) return @@ -228,7 +231,10 @@ def get_cpu_usage_percentage(vm): # Calculate CPU usage percentage current_time = time.time() duration = round(current_time - last_timestamp) - cpu_usage_percentage = (cpu_time_used / (duration * no_v_cpus)) * 100 # 1 second interval + if duration and no_v_cpus: + cpu_usage_percentage = (cpu_time_used / (duration * no_v_cpus)) * 100 # 1 second interval + else: + cpu_usage_percentage = 0.0 set_vm_last_known_cpu_time(vm.name(), total_cpu_time, current_time) @@ -316,7 +322,7 @@ def get_vms_and_host_stats(): return host_information, vm_stats except Exception as e: - print(str(e)) + print(traceback.format_exc()) finally: conn.close() @@ -328,7 +334,7 @@ def collect_data(): data = get_kvm_stats() return send_data(data) except Exception as e: - print(e) + print(traceback.format_exc()) return False From 7a10fe7e2ab86edc781a0be37bf6a92a5bc9f887 Mon Sep 17 00:00:00 2001 From: Vignesh Date: Fri, 27 Sep 2024 14:28:49 +0530 Subject: [PATCH 09/16] - Added log file and launch.sh script --- .gitignore | 5 ++++- README.md | 2 +- launch.sh | 32 +++++++++++++++++++++++++++++++ modules/__init__.py | 21 +++++++++++++++++++- modules/disk.py | 14 ++++++++------ modules/kvm_monitor.py | 24 +++++++++++------------ modules/network.py | 37 +++++++++++++++++++++--------------- modules/partition.py | 5 +++-- modules/sensors.py | 4 +++- modules/uptime.py | 8 ++++---- modules/vmstatus.py | 10 ++++++---- requirements.txt | 1 + main.py => system_monitor.py | 6 +++--- 13 files changed, 119 insertions(+), 50 deletions(-) create mode 100644 launch.sh rename main.py => system_monitor.py (87%) diff --git a/.gitignore b/.gitignore index 2f466f1..061bbc1 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,7 @@ env/ __pycache__/ *.py[cod] *$py.class -.idea/ \ No newline at end of file +.idea/ +venv/ + +logs/ \ No newline at end of file diff --git a/README.md b/README.md index e29edda..bbd1b13 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,6 @@ Make sure you have the following installed on your system: 2. Run the script: ```bash - python main.py + python system_monitor.py ``` diff --git a/launch.sh b/launch.sh new file mode 100644 index 0000000..170ad88 --- /dev/null +++ b/launch.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# Name of the process to check (without path) +PROCESS_NAME="system_monitor.py" + +# Directory of your Python virtual environment +VENV_DIR="/path/to/your/venv" + +# Directory where your Python script is located +SCRIPT_DIR="/path/to/your/script" + +# Name of your Python script +SCRIPT_NAME="your_script.py" + +# Check if the process is running +if ! pgrep -f "$PROCESS_NAME" > /dev/null +then + echo "Process $PROCESS_NAME is not running. Starting it..." + + # Activate virtual environment + source "$VENV_DIR/bin/activate" + + # Change directory to the script location + cd "$SCRIPT_DIR" + + # Launch the Python script + nohup python "$SCRIPT_NAME" & + + echo "Process $PROCESS_NAME started." +else + echo "Process $PROCESS_NAME is already running." +fi diff --git a/modules/__init__.py b/modules/__init__.py index 901ce47..061028e 100644 --- a/modules/__init__.py +++ b/modules/__init__.py @@ -1 +1,20 @@ -MONITORING_INTERVAL = 30 \ No newline at end of file +import logging +import os + +MONITORING_INTERVAL = 30 + +if not os.path.exists('./logs'): + os.makedirs('logs') + +# Configure logging +logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[ + logging.FileHandler("./logs/system_monitor.log"), + logging.StreamHandler() + ] +) + +# Create a logger instance +logger = logging.getLogger(__name__) \ No newline at end of file diff --git a/modules/disk.py b/modules/disk.py index 7a143f7..31fc2b7 100644 --- a/modules/disk.py +++ b/modules/disk.py @@ -4,6 +4,8 @@ import ujson +from modules import logger + def get_nvme_disk_names(): try: @@ -14,7 +16,7 @@ def get_nvme_disk_names(): return nvme_disks except subprocess.CalledProcessError as e: - print(f"Error: {e}") + logger.debug(f"Error: {e}") return [] @@ -29,19 +31,19 @@ def get_disk_stats(disk_path): data = ujson.loads(output) return data except subprocess.CalledProcessError as e: - print(f"Command failed with exit status {e.returncode}") + logger.debug(f"Command failed with exit status {e.returncode}") if e.output and e.output.startswith('{\n'): data = ujson.loads(e.output) return data except Exception as e: - print(str(e)) + logger.debug(str(e)) return {} def get_smartctl_data(disk_path): if disk_path is None: - print("DISK_PATH environment variable is not set.") + logger.debug("DISK_PATH environment variable is not set.") return None smart_data = get_disk_stats(disk_path) @@ -90,9 +92,9 @@ def collect_data(): disk_path = os.getenv("DISK_PATH") return get_smartctl_data(disk_path) except Exception as e: - print(str(e)) + logger.debug(str(e)) return {} if __name__=="__main__": - print(collect_data()) + logger.debug(collect_data()) diff --git a/modules/kvm_monitor.py b/modules/kvm_monitor.py index 5b94e92..7b6ea38 100644 --- a/modules/kvm_monitor.py +++ b/modules/kvm_monitor.py @@ -12,7 +12,7 @@ import traceback from connection import create_influxdb_point, write_api, INFLUX_BUCKET, INFLUX_ORG -from modules import MONITORING_INTERVAL +from modules import MONITORING_INTERVAL, logger VM_STATE_DEFINITION = { libvirt.VIR_DOMAIN_NOSTATE: "no_state", @@ -40,7 +40,7 @@ def get_vms_with_state(): vm_stats.append(vm_stat) return vm_stats except Exception as e: - print(traceback.format_exc()) + logger.debug(traceback.format_exc()) return [] @@ -53,7 +53,7 @@ def get_kvm_stats(): data = ujson.loads(output) return data except Exception as e: - print(traceback.format_exc()) + logger.debug(traceback.format_exc()) return {} @@ -61,9 +61,9 @@ def sync_data_to_influx_db(data): try: point = create_influxdb_point('kvm_stats', data) write_api.write(bucket=INFLUX_BUCKET, org=INFLUX_ORG, record=point) - print(f"writing record for kvm_stats finished.") + logger.debug(f"writing record for kvm_stats finished.") except Exception as e: - print(traceback.format_exc()) + logger.debug(traceback.format_exc()) return @@ -87,7 +87,7 @@ def filter_and_group_host_stats(hostname, host_uuid, data): to_return[host_key_groups[key]] = data_group return to_return except Exception as e: - print(traceback.format_exc()) + logger.debug(traceback.format_exc()) return {} @@ -109,7 +109,7 @@ def filter_and_group_vm_stats(hostname, host_uuid, data): to_return[f"vm_{host_key_groups[key]}"] = data_group return to_return except Exception as e: - print(traceback.format_exc()) + logger.debug(traceback.format_exc()) return {} @@ -118,9 +118,9 @@ def send_data_to_influxdb(data): if value: point = create_influxdb_point(key, value) write_api.write(bucket=INFLUX_BUCKET, org=INFLUX_ORG, record=point) - print(f"writing record for {key} finished.") + logger.debug(f"writing record for {key} finished.") else: - print('Uploaded all data points from kvm_monitor') + logger.debug('Uploaded all data points from kvm_monitor') def merge_lists_of_dicts(list1, list2, key): @@ -170,7 +170,7 @@ def send_data(log): send_data_to_influxdb(filter_and_group_vm_stats(hostname, host_uuid, data=vm_stats)) return True except Exception as e: - print(traceback.format_exc()) + logger.debug(traceback.format_exc()) return @@ -322,7 +322,7 @@ def get_vms_and_host_stats(): return host_information, vm_stats except Exception as e: - print(traceback.format_exc()) + logger.debug(traceback.format_exc()) finally: conn.close() @@ -334,7 +334,7 @@ def collect_data(): data = get_kvm_stats() return send_data(data) except Exception as e: - print(traceback.format_exc()) + logger.debug(traceback.format_exc()) return False diff --git a/modules/network.py b/modules/network.py index f7840fd..aaaa95b 100644 --- a/modules/network.py +++ b/modules/network.py @@ -1,23 +1,30 @@ import socket import time +import traceback import psutil +from modules import logger + io, last_captured_time = psutil.net_io_counters(pernic=True), time.time() def collect_data(): - global io, last_captured_time - io_2, recent_captured_time = psutil.net_io_counters(pernic=True), time.time() - delay = recent_captured_time - last_captured_time - data = {} - for iface, iface_io in io.items(): - upload_speed, download_speed = io_2[iface].bytes_sent - iface_io.bytes_sent, io_2[iface].bytes_recv - iface_io.bytes_recv - data.update({ - f"{iface}_download": io_2[iface].bytes_recv, - f"{iface}_upload": io_2[iface].bytes_sent, - f"{iface}_upload_speed": round(upload_speed / delay) , - f"{iface}_download_speed": round(download_speed / delay), - }) - io, last_captured_time = io_2, recent_captured_time - data.update({"host": socket.gethostname()}) - return data + try: + global io, last_captured_time + io_2, recent_captured_time = psutil.net_io_counters(pernic=True), time.time() + delay = recent_captured_time - last_captured_time + data = {} + for iface, iface_io in io.items(): + upload_speed, download_speed = io_2[iface].bytes_sent - iface_io.bytes_sent, io_2[iface].bytes_recv - iface_io.bytes_recv + data.update({ + f"{iface}_download": io_2[iface].bytes_recv, + f"{iface}_upload": io_2[iface].bytes_sent, + f"{iface}_upload_speed": round(upload_speed / delay) , + f"{iface}_download_speed": round(download_speed / delay), + }) + io, last_captured_time = io_2, recent_captured_time + data.update({"host": socket.gethostname()}) + return data + except Exception: + logger.debug(f"Failed to capture network stats {traceback.format_exc()}") + return {} \ No newline at end of file diff --git a/modules/partition.py b/modules/partition.py index 96aceeb..3d855cc 100644 --- a/modules/partition.py +++ b/modules/partition.py @@ -1,6 +1,7 @@ import socket import os import psutil +from modules import logger def get_disk_space_usage(device): try: @@ -40,7 +41,7 @@ def get_disk_usage_for_mount_points(mnt_directory='/mnt'): return data except Exception as e: - print(f"Error: {e}") + logger.debug(f"Error: {e}") def collect_data(): @@ -48,7 +49,7 @@ def collect_data(): mnt_directory = '/mnt' return get_disk_usage_for_mount_points(mnt_directory) except Exception as e: - print(str(e)) + logger.debug(str(e)) return {} diff --git a/modules/sensors.py b/modules/sensors.py index 5d508c0..ae57afc 100644 --- a/modules/sensors.py +++ b/modules/sensors.py @@ -2,6 +2,8 @@ import json import subprocess +from modules import logger + def get_sensor_data(): # Run the 'sensors -j' command @@ -28,7 +30,7 @@ def collect_data(): try: return get_sensor_data() except Exception as e: - print(str(e)) + logger.debug(str(e)) return {} diff --git a/modules/uptime.py b/modules/uptime.py index 55025c9..ccd66e2 100644 --- a/modules/uptime.py +++ b/modules/uptime.py @@ -1,6 +1,6 @@ -import subprocess -import re import socket +from modules import logger + def get_system_uptime_seconds(): try: @@ -10,7 +10,7 @@ def get_system_uptime_seconds(): return int(uptime_seconds % 60) except Exception as e: - print(f"Error reading uptime: {e}") + logger.debug(f"Error reading uptime: {e}") return 0 def collect_data(): @@ -22,7 +22,7 @@ def collect_data(): "uptime": uptime_seconds } except Exception as e: - print(str(e)) + logger.debug(str(e)) return {} diff --git a/modules/vmstatus.py b/modules/vmstatus.py index 8eba586..dc3998a 100644 --- a/modules/vmstatus.py +++ b/modules/vmstatus.py @@ -1,8 +1,8 @@ -import os import socket +import traceback from threading import Thread import inotify.adapters -import time +from modules import logger def parse_data(log): log_parts = log.split(':') @@ -32,9 +32,11 @@ def collect_data_continuously(): last_line = lines[-1].strip() t1 = Thread(target=parse_data, args=(last_line,)) t1.run() + except Exception: + logger.debug(f"Failed to capture vm_status {traceback.format_exc()}") finally: i.remove_watch(log_file) -if __name__=="__main__": - collect_data() +# if __name__=="__main__": +# collect_data() diff --git a/requirements.txt b/requirements.txt index 42031a8..f83c191 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ six==1.16.0 typing_extensions==4.8.0 urllib3==2.1.0 libvirt-python==10.7.0 +ujson==5.10.0 diff --git a/main.py b/system_monitor.py similarity index 87% rename from main.py rename to system_monitor.py index 2ecbcb5..1c72c5e 100644 --- a/main.py +++ b/system_monitor.py @@ -5,7 +5,7 @@ import schedule from connection import create_influxdb_point, write_api, INFLUX_BUCKET, INFLUX_ORG -from modules import MONITORING_INTERVAL +from modules import MONITORING_INTERVAL, logger def load_config(): @@ -26,12 +26,12 @@ def run_module(module_name): for record in data: point = create_influxdb_point(module_name, record) write_api.write(bucket=INFLUX_BUCKET, org=INFLUX_ORG, record=point) - print(f"writing record for {module_name} finished.") + logger.debug(f"writing record for {module_name} finished.") else: point = create_influxdb_point(module_name, data) write_api.write(bucket=INFLUX_BUCKET, org=INFLUX_ORG, record=point) - print(f"writing record for {module_name} finished.") + logger.debug(f"writing record for {module_name} finished.") if __name__ == '__main__': From 11c45ede71a67fdf1cfdb2def38abd0445d3c6ef Mon Sep 17 00:00:00 2001 From: Vignesh Date: Fri, 27 Sep 2024 14:50:06 +0530 Subject: [PATCH 10/16] - Updated the README.md file for the instructions to setup in a fresh ubuntu server --- README.md | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/README.md b/README.md index bbd1b13..cc180b4 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,30 @@ Make sure you have the following installed on your system: - InfluxDB - [InfluxDB Python Client](https://github.com/influxdata/influxdb-client-python) +## VM Preparation + +Install `kvmtop` +```commandline +wget https://github.com/cha87de/kvmtop/releases/download/2.1.3/kvmtop_2.1.3_linux_amd64.deb +sudo dpkg -i kvmtop_2.1.3_linux_amd64.deb +``` +Try kvmtop it may fail with error saying missing package libncurses5.so +To fix it add below lines to `/etc/apt/sources.list` +```commandline +deb http://security.ubuntu.com/ubuntu focal-security main +deb http://archive.ubuntu.com/ubuntu/ focal main restricted universe multiverse +deb http://archive.ubuntu.com/ubuntu/ focal-updates main restricted universe multiverse +deb http://security.ubuntu.com/ubuntu/ focal-security main restricted universe multiverse +``` +And execute below commands: +```commandline +sudo apt update +sudo apt upgrade +sudo apt-get install libncurses5 libncurses5:i386 +``` +It will resolve this issue, try `kvmtop` command in terminal and verify everything working as expected + + ## Usage 1. Set up your InfluxDB configuration by creating a `.env` file with the following variables: @@ -19,6 +43,8 @@ Make sure you have the following installed on your system: INFLUX_TOKEN="your-influxdb-token" INFLUX_ORG="your-influxdb-organization" INFLUX_BUCKET="your-influxdb-bucket" + ROOT_PASS="user-passof-ubuntu-terminal" + DISK_PATH="disk-path-to-be-monitored" ``` 2. Run the script: @@ -26,3 +52,4 @@ Make sure you have the following installed on your system: python system_monitor.py ``` + From 412db7a4b07eca60d9be1ac73fd26ae70ea39b1f Mon Sep 17 00:00:00 2001 From: Vignesh Date: Fri, 27 Sep 2024 18:10:42 +0530 Subject: [PATCH 11/16] - Fix issue with uptime calculation --- modules/uptime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/uptime.py b/modules/uptime.py index ccd66e2..e406f81 100644 --- a/modules/uptime.py +++ b/modules/uptime.py @@ -7,7 +7,7 @@ def get_system_uptime_seconds(): with open('/proc/uptime', 'r') as f: # Read the first line from /proc/uptime uptime_seconds = float(f.readline().split()[0]) - return int(uptime_seconds % 60) + return int(uptime_seconds) except Exception as e: logger.debug(f"Error reading uptime: {e}") From d7b60bd598f04f6f7ccc547d9fa69bcfec4755c9 Mon Sep 17 00:00:00 2001 From: Vignesh Date: Sun, 20 Oct 2024 20:59:40 +0530 Subject: [PATCH 12/16] Handle influx DB connection errors --- system_monitor.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/system_monitor.py b/system_monitor.py index 1c72c5e..492dc5f 100644 --- a/system_monitor.py +++ b/system_monitor.py @@ -2,6 +2,8 @@ import time import json import importlib +import traceback + import schedule from connection import create_influxdb_point, write_api, INFLUX_BUCKET, INFLUX_ORG @@ -15,23 +17,27 @@ def load_config(): def run_module(module_name): - module = importlib.import_module(f"modules.{module_name}") - data = module.collect_data() - - if module_name == 'kvm_monitor': - return - if data: - if isinstance(data, list): - # If collect data return multiple records - for record in data: - point = create_influxdb_point(module_name, record) + try: + module = importlib.import_module(f"modules.{module_name}") + data = module.collect_data() + + if module_name == 'kvm_monitor': + return + if data: + if isinstance(data, list): + # If collect data return multiple records + for record in data: + point = create_influxdb_point(module_name, record) + write_api.write(bucket=INFLUX_BUCKET, org=INFLUX_ORG, record=point) + logger.debug(f"writing record for {module_name} finished.") + + else: + point = create_influxdb_point(module_name, data) write_api.write(bucket=INFLUX_BUCKET, org=INFLUX_ORG, record=point) logger.debug(f"writing record for {module_name} finished.") + except Exception as e: + logger.debug(traceback.format_exc()) - else: - point = create_influxdb_point(module_name, data) - write_api.write(bucket=INFLUX_BUCKET, org=INFLUX_ORG, record=point) - logger.debug(f"writing record for {module_name} finished.") if __name__ == '__main__': From 5288983f588a0bc64417546d64602f17ecb35bab Mon Sep 17 00:00:00 2001 From: Vignesh Date: Fri, 6 Dec 2024 17:43:27 +0530 Subject: [PATCH 13/16] Handle issue with missing interfaces in network io --- modules/network.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/modules/network.py b/modules/network.py index aaaa95b..00089e7 100644 --- a/modules/network.py +++ b/modules/network.py @@ -15,10 +15,12 @@ def collect_data(): delay = recent_captured_time - last_captured_time data = {} for iface, iface_io in io.items(): - upload_speed, download_speed = io_2[iface].bytes_sent - iface_io.bytes_sent, io_2[iface].bytes_recv - iface_io.bytes_recv + bytes_sent_iface = io_2[iface].bytes_sent if io_2.get(iface) else 0 + bytes_recv_iface = io_2[iface].bytes_recv if io_2.get(iface) else 0 + upload_speed, download_speed = bytes_sent_iface - iface_io.bytes_sent, bytes_recv_iface - iface_io.bytes_recv data.update({ - f"{iface}_download": io_2[iface].bytes_recv, - f"{iface}_upload": io_2[iface].bytes_sent, + f"{iface}_download": bytes_recv_iface, + f"{iface}_upload": bytes_sent_iface, f"{iface}_upload_speed": round(upload_speed / delay) , f"{iface}_download_speed": round(download_speed / delay), }) From 742fb44ac3cc50250f350cc7a8989de61a0e23fc Mon Sep 17 00:00:00 2001 From: vigneshwaran ganesan Date: Fri, 3 Jan 2025 15:53:45 +0530 Subject: [PATCH 14/16] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index cc180b4..1ec6a84 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,8 @@ sudo apt-get install libncurses5 libncurses5:i386 ``` It will resolve this issue, try `kvmtop` command in terminal and verify everything working as expected +Note: If you got stuck with the above error then, remove `libncurses5:i386` and install only `libncurses5` (Sample Error message: E: Unable to locate package libncurses5:i386) + ## Usage From 585f54c65e9bcced712c9a2cf16e1f4b468da6a9 Mon Sep 17 00:00:00 2001 From: Vignesh Date: Fri, 13 Jun 2025 17:07:30 +0530 Subject: [PATCH 15/16] Skil SSL verification for influx DB --- connection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/connection.py b/connection.py index fb7bf01..dc70f94 100644 --- a/connection.py +++ b/connection.py @@ -11,7 +11,7 @@ INFLUX_ORG = os.getenv("INFLUX_ORG") INFLUX_BUCKET = os.getenv("INFLUX_BUCKET") -client = influxdb_client.InfluxDBClient(url=INFLUX_URL, token=INFLUX_TOKEN, org=INFLUX_ORG) +client = influxdb_client.InfluxDBClient(url=INFLUX_URL, token=INFLUX_TOKEN, org=INFLUX_ORG, verify_ssl=False) write_api = client.write_api(write_options=SYNCHRONOUS) From 2a939f7ed7c409e5def5a553d343eb2f930528ba Mon Sep 17 00:00:00 2001 From: Vignesh Date: Tue, 16 Sep 2025 19:31:52 +0530 Subject: [PATCH 16/16] Added steps to configure the system_monitor as systemd service --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 1ec6a84..901a1ed 100644 --- a/README.md +++ b/README.md @@ -54,4 +54,13 @@ Note: If you got stuck with the above error then, remove `libncurses5:i386` and python system_monitor.py ``` +## Configure systemd service +1. Pull latest changes to the machine. +2. Update and validate `libmon.service` file in the Libmon directory +3. Copy the directory to `sudo cp libmon.service /etc/systemd/system/libmon.service` +4. Enable the service by `sudo systemctl enable libmon.service` +5. Start the service by `sudo systemctl start libmon.service` +6. Validate the process status by `sudo systemctl status libmon.service and tail -f /home/dev/Libmon/logs/system_monitor.log`. + +