diff --git a/disk_monitor.py b/disk_monitor.py
new file mode 100644
index 0000000..f75486c
--- /dev/null
+++ b/disk_monitor.py
@@ -0,0 +1,283 @@
+#!/usr/bin/env python3
+"""Disk usage monitor that posts reports and growth alerts to a Matrix room.
+
+Usage: disk_monitor.py [report|sample]
+
+* report - record a usage sample and post a full storage report.
+* sample - record a usage sample and post an alert when used space grew by
+  at least 1 GiB over about 10 minutes or 10 GiB over about 60 minutes.
+
+Settings come from environment variables.  The MATRIX_* variables are
+required: a missing one raises KeyError as soon as the module is loaded.
+"""
+
+import os
+import sys
+import time
+import uuid
+import json
+import shutil
+import sqlite3
+import urllib.parse
+import urllib.request
+
+DB_PATH = os.environ.get("DISKMON_DB", "/var/lib/diskmon/diskmon.sqlite3")
+MOUNT_PATH = os.environ.get("DISKMON_MOUNT", "/")
+MATRIX_HOMESERVER = os.environ["MATRIX_HOMESERVER"].rstrip("/")
+MATRIX_ROOM_ID = os.environ["MATRIX_ROOM_ID"]
+MATRIX_ACCESS_TOKEN = os.environ["MATRIX_ACCESS_TOKEN"]
+
+# thresholds
+TEN_MIN_SECONDS = 10 * 60
+ONE_HOUR_SECONDS = 60 * 60
+ONE_GIB = 1024 ** 3
+TEN_GIB = 10 * ONE_GIB
+
+# A baseline sample may be at most this much older than its window.  Without
+# a bound, a gap in sampling (downtime, stopped timer) lets growth that built
+# up over hours be reported as growth "in 10 minutes".
+BASELINE_SLACK_SECONDS = 10 * 60
+
+# cooldowns to avoid spam
+WARNING_COOLDOWN = 30 * 60
+CRITICAL_COOLDOWN = 60 * 60
+
+
+def ensure_db(conn: sqlite3.Connection) -> None:
+    """Create the samples and alerts tables if they do not exist yet."""
+    conn.execute("""
+        CREATE TABLE IF NOT EXISTS samples (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            ts INTEGER NOT NULL,
+            mount TEXT NOT NULL,
+            used_bytes INTEGER NOT NULL,
+            avail_bytes INTEGER NOT NULL,
+            total_bytes INTEGER NOT NULL
+        )
+    """)
+    conn.execute("""
+        CREATE TABLE IF NOT EXISTS alerts (
+            key TEXT PRIMARY KEY,
+            last_sent_ts INTEGER NOT NULL
+        )
+    """)
+    conn.commit()
+
+
+def get_disk_usage(path: str):
+    """Return (total, used, free) in bytes as reported for *path*."""
+    usage = shutil.disk_usage(path)
+    return usage.total, usage.used, usage.free
+
+
+def bytes_human(n: int) -> str:
+    """Format a byte count with binary units, e.g. 1536 -> "1.5 KiB".
+
+    Plain bytes are printed as an integer, larger units with one decimal.
+    The unit is chosen by magnitude, so negative counts are scaled too.
+    """
+    units = ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]
+    value = float(n)
+    unit = units[-1]
+    for candidate in units[:-1]:
+        if abs(value) < 1024:
+            unit = candidate
+            break
+        value /= 1024
+    if unit == "B":
+        return f"{int(value)} {unit}"
+    return f"{value:.1f} {unit}"
+
+
+def percent_used(used: int, total: int) -> float:
+    """Return used/total as a percentage; 0.0 when *total* is zero."""
+    if total == 0:
+        return 0.0
+    return (used / total) * 100.0
+
+
+def insert_sample(conn: sqlite3.Connection, ts: int, mount: str, used: int, avail: int, total: int) -> None:
+    """Store one usage sample for *mount* taken at Unix time *ts* and commit."""
+    conn.execute(
+        "INSERT INTO samples (ts, mount, used_bytes, avail_bytes, total_bytes) VALUES (?, ?, ?, ?, ?)",
+        (ts, mount, used, avail, total),
+    )
+    conn.commit()
+
+
+def prune_old_samples(conn: sqlite3.Connection, now_ts: int) -> None:
+    """Delete samples older than two days relative to *now_ts* and commit."""
+    # keep 2 days of history, more than enough
+    cutoff = now_ts - (2 * 24 * 60 * 60)
+    conn.execute("DELETE FROM samples WHERE ts < ?", (cutoff,))
+    conn.commit()
+
+
+def get_oldest_sample_at_least(conn: sqlite3.Connection, mount: str, min_age_seconds: int, now_ts: int,
+                               max_age_seconds=None):
+    """Return the baseline sample for a look-back window, or None.
+
+    Despite the name, this returns the NEWEST sample for *mount* that is at
+    least *min_age_seconds* old, i.e. the one closest to the window start.
+    If *max_age_seconds* is given, samples older than that are ignored; the
+    default (None) applies no upper bound on age.
+
+    The row is (ts, used_bytes, avail_bytes, total_bytes).
+    """
+    newest_ts = now_ts - min_age_seconds
+    oldest_ts = None if max_age_seconds is None else now_ts - max_age_seconds
+    row = conn.execute(
+        """
+        SELECT ts, used_bytes, avail_bytes, total_bytes
+        FROM samples
+        WHERE mount = ? AND ts <= ?
+          AND (? IS NULL OR ts >= ?)
+        ORDER BY ts DESC
+        LIMIT 1
+        """,
+        (mount, newest_ts, oldest_ts, oldest_ts),
+    ).fetchone()
+    return row
+
+
+def should_send_alert(conn: sqlite3.Connection, key: str, now_ts: int, cooldown_seconds: int) -> bool:
+    """Return True if alert *key* was never sent or its cooldown has elapsed."""
+    row = conn.execute("SELECT last_sent_ts FROM alerts WHERE key = ?", (key,)).fetchone()
+    if row is None:
+        return True
+    return (now_ts - row[0]) >= cooldown_seconds
+
+
+def mark_alert_sent(conn: sqlite3.Connection, key: str, now_ts: int) -> None:
+    """Record *now_ts* as the last send time of alert *key* (upsert) and commit."""
+    conn.execute(
+        """
+        INSERT INTO alerts (key, last_sent_ts)
+        VALUES (?, ?)
+        ON CONFLICT(key) DO UPDATE SET last_sent_ts = excluded.last_sent_ts
+        """,
+        (key, now_ts),
+    )
+    conn.commit()
+
+
+def send_matrix_message(body: str) -> None:
+    """Send *body* as an m.text message to the configured Matrix room.
+
+    urllib errors propagate to the caller; RuntimeError is raised if a
+    response arrives with a status outside the 2xx range.
+    """
+    # A fresh transaction ID for every request.
+    txn_id = str(uuid.uuid4())
+    # Room IDs contain "!" and ":", so percent-encode every reserved character.
+    room_id = urllib.parse.quote(MATRIX_ROOM_ID, safe="")
+    event_type = "m.room.message"
+    url = f"{MATRIX_HOMESERVER}/_matrix/client/v3/rooms/{room_id}/send/{event_type}/{txn_id}"
+
+    payload = {
+        "msgtype": "m.text",
+        "body": body,
+    }
+
+    req = urllib.request.Request(
+        url,
+        data=json.dumps(payload).encode("utf-8"),
+        method="PUT",
+        headers={
+            "Authorization": f"Bearer {MATRIX_ACCESS_TOKEN}",
+            "Content-Type": "application/json",
+        },
+    )
+
+    with urllib.request.urlopen(req, timeout=20) as resp:
+        if resp.status < 200 or resp.status >= 300:
+            raise RuntimeError(f"Matrix send failed with HTTP {resp.status}")
+
+
+def format_report(ts: int, mount: str, used: int, avail: int, total: int) -> str:
+    """Build the multi-line storage report text for the sample taken at *ts*."""
+    pct = percent_used(used, total)
+    local_time = time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime(ts))
+    return (
+        f"[VPS Storage Report]\n"
+        f"Mount: {mount}\n"
+        f"Used: {bytes_human(used)}\n"
+        f"Available: {bytes_human(avail)}\n"
+        f"Total: {bytes_human(total)}\n"
+        f"Usage: {pct:.1f}%\n"
+        f"Timestamp: {local_time}"
+    )
+
+
+def format_change_alert(window_name: str, delta: int, prev_used: int, current_used: int, ts: int, mount: str) -> str:
+    """Build the alert text for a *delta*-byte growth over *window_name*."""
+    local_time = time.strftime("%Y-%m-%d %H:%M:%S %Z", time.localtime(ts))
+    return (
+        f"[Storage Alert]\n"
+        f"Mount: {mount}\n"
+        f"Used space increased by {bytes_human(delta)} in {window_name}\n"
+        f"Previous used: {bytes_human(prev_used)}\n"
+        f"Current used: {bytes_human(current_used)}\n"
+        f"Timestamp: {local_time}"
+    )
+
+
+def do_report(conn: sqlite3.Connection) -> None:
+    """Record a sample, prune old history and post the storage report."""
+    now_ts = int(time.time())
+    total, used, avail = get_disk_usage(MOUNT_PATH)
+    insert_sample(conn, now_ts, MOUNT_PATH, used, avail, total)
+    prune_old_samples(conn, now_ts)
+    send_matrix_message(format_report(now_ts, MOUNT_PATH, used, avail, total))
+
+
+def do_sample(conn: sqlite3.Connection) -> None:
+    """Record a sample, prune old history and alert on rapid growth."""
+    now_ts = int(time.time())
+    total, used, avail = get_disk_usage(MOUNT_PATH)
+    insert_sample(conn, now_ts, MOUNT_PATH, used, avail, total)
+    prune_old_samples(conn, now_ts)
+
+    # (alert key, window label, window seconds, growth threshold, cooldown)
+    checks = (
+        ("warn_10m", "10 minutes", TEN_MIN_SECONDS, ONE_GIB, WARNING_COOLDOWN),
+        ("crit_60m", "60 minutes", ONE_HOUR_SECONDS, TEN_GIB, CRITICAL_COOLDOWN),
+    )
+    for key, window_name, window_seconds, threshold, cooldown in checks:
+        baseline = get_oldest_sample_at_least(
+            conn, MOUNT_PATH, window_seconds, now_ts,
+            max_age_seconds=window_seconds + BASELINE_SLACK_SECONDS,
+        )
+        if baseline is None:
+            continue
+        _prev_ts, prev_used, _prev_avail, _prev_total = baseline
+        delta = used - prev_used
+        if delta >= threshold and should_send_alert(conn, key, now_ts, cooldown):
+            send_matrix_message(format_change_alert(window_name, delta, prev_used, used, now_ts, MOUNT_PATH))
+            # Marked only after a successful send, so a failed send can be retried.
+            mark_alert_sent(conn, key, now_ts)
+
+
+def main():
+    """Validate the mode argument, open the database and run the job."""
+    if len(sys.argv) != 2 or sys.argv[1] not in {"report", "sample"}:
+        print("Usage: disk_monitor.py [report|sample]", file=sys.stderr)
+        sys.exit(2)
+
+    # A bare filename has no directory part, and os.makedirs("") raises.
+    db_dir = os.path.dirname(DB_PATH)
+    if db_dir:
+        os.makedirs(db_dir, exist_ok=True)
+    conn = sqlite3.connect(DB_PATH)
+    try:
+        ensure_db(conn)
+        if sys.argv[1] == "report":
+            do_report(conn)
+        else:
+            do_sample(conn)
+    finally:
+        conn.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/diskmon-report.service b/diskmon-report.service
new file mode 100644
index 0000000..171e250
--- /dev/null
+++ b/diskmon-report.service
@@ -0,0 +1,7 @@
+[Unit]
+Description=Disk monitor morning report
+
+[Service]
+Type=oneshot
+EnvironmentFile=/etc/diskmon.env
+ExecStart=/usr/bin/python3 /opt/diskmon/disk_monitor.py report
diff --git a/diskmon-report.timer b/diskmon-report.timer
new file mode 100644
index 0000000..538a552
--- /dev/null
+++ b/diskmon-report.timer
@@ -0,0 +1,10 @@
+[Unit]
+Description=Run disk monitor morning report daily
+
+[Timer]
+OnCalendar=*-*-* 09:00:00
+Persistent=true
+Unit=diskmon-report.service
+
+[Install]
+WantedBy=timers.target
diff --git a/diskmon-sample.service b/diskmon-sample.service
new file mode 100644
index 0000000..6ae4706
--- /dev/null
+++ b/diskmon-sample.service
@@ -0,0 +1,7 @@
+[Unit]
+Description=Disk monitor sample job
+
+[Service]
+Type=oneshot
+EnvironmentFile=/etc/diskmon.env
+ExecStart=/usr/bin/python3 /opt/diskmon/disk_monitor.py sample
diff --git a/diskmon-sample.timer b/diskmon-sample.timer
new file mode 100644
index 0000000..7632569
--- /dev/null
+++ b/diskmon-sample.timer
@@ -0,0 +1,10 @@
+[Unit]
+Description=Run disk monitor sample every 5 minutes
+
+[Timer]
+OnBootSec=2min
+OnUnitActiveSec=5min
+Unit=diskmon-sample.service
+
+[Install]
+WantedBy=timers.target
diff --git a/diskmon.env b/diskmon.env
new file mode 100644
index 0000000..1216da0
--- /dev/null
+++ b/diskmon.env
@@ -0,0 +1,5 @@
+MATRIX_HOMESERVER=https://matrix.example.com
+MATRIX_ROOM_ID=!yourRoomId:example.com
+MATRIX_ACCESS_TOKEN=your_access_token_here
+DISKMON_DB=/var/lib/diskmon/diskmon.sqlite3
+DISKMON_MOUNT=/