From 873ae2c1357d9e54deb3604896f65b844ef4580b Mon Sep 17 00:00:00 2001
From: Jens Nolte <git@queezle.net>
Date: Sun, 29 Jan 2023 18:52:43 +0100
Subject: [PATCH] WIP monitoring

---
 modules/monitoring/grafana-agent.nix     | 105 +++++++++++++++++++++
 modules/monitoring/idle-load.patch       |  17 ++++
 modules/monitoring/monitoring-server.nix | 115 +++++++++++++++++++++++
 3 files changed, 237 insertions(+)
 create mode 100644 modules/monitoring/grafana-agent.nix
 create mode 100644 modules/monitoring/idle-load.patch
 create mode 100644 modules/monitoring/monitoring-server.nix

diff --git a/modules/monitoring/grafana-agent.nix b/modules/monitoring/grafana-agent.nix
new file mode 100644
index 0000000..2f6e6ab
--- /dev/null
+++ b/modules/monitoring/grafana-agent.nix
@@ -0,0 +1,105 @@
+{ config, lib, pkgs, ... }:
+with lib;
+
+let
+  cfg = config.queezle.monitoring.grafana-agent;
+  settingsFormat = pkgs.formats.yaml { };
+  configFile = settingsFormat.generate "grafana-agent.yaml" cfg.settings; # YAML rendering of the `settings` option
+  patched-grafana-agent = pkgs.grafana-agent.overrideAttrs (old: { # makes the vendored WAL watcher writable, then applies ./idle-load.patch (longer polling periods)
+    preBuild = (old.preBuild or "") + "\n" + ''
+      chmod +w vendor/github.com/prometheus/prometheus/tsdb/wlog/watcher.go
+      chmod +w vendor/github.com/prometheus/prometheus/tsdb/wlog/
+      patch -p1 -i ${./idle-load.patch}
+    ''; # appended to the package's own preBuild (if any) instead of silently replacing it
+  });
+in {
+  options.queezle.monitoring.grafana-agent = {
+    enable = lib.mkEnableOption "prometheus agent config";
+
+    remoteWriteUrl = mkOption { # used as the URL of the default metrics remote_write target
+      type = types.str;
+      default = "http://prometheus:99/api/v1/write"; # host name, port and path match the nginx proxy vhost in monitoring-server.nix
+    };
+
+    settings = mkOption { # rendered to grafana-agent.yaml and passed to the agent via -config.file
+      description = lib.mdDoc ''
+        Configuration for `grafana-agent`.
+        See https://grafana.com/docs/agent/latest/configuration/
+      '';
+
+      type = types.submodule {
+        freeformType = settingsFormat.type; # free-form: accepts whatever the YAML format type allows
+      };
+    };
+  };
+
+  config = mkIf cfg.enable {
+
+    queezle.monitoring.grafana-agent.settings = { # baseline agent configuration contributed by this module
+      metrics = {
+        wal_directory = "\${STATE_DIRECTORY}"; # escaped so the YAML holds a literal ${STATE_DIRECTORY}; the agent expands it (-config.expand-env)
+
+        global.remote_write = [{
+          url = cfg.remoteWriteUrl;
+        }];
+
+        #configs = [{
+        #  scrape_configs = ...
+        #}];
+      };
+      integrations = {
+        # Scrape metrics about the agent itself
+        agent = {
+          enabled = true;
+          scrape_integration = true;
+          instance = config.networking.hostName; # instance name = this machine's configured hostName
+        };
+
+        # Set up the integrated node exporter, with the systemd collector switched on
+        node_exporter = {
+          enabled = true;
+          scrape_integration = true;
+          instance = config.networking.hostName; # same instance name as the agent integration
+          enable_collectors = [
+            "systemd" # needs D-Bus access; see the DynamicUser note on the systemd service
+          ];
+        };
+      };
+    };
+
+    systemd.services.grafana-agent = { # runs the patched agent with the generated YAML config
+      wantedBy = [ "multi-user.target" ];
+      serviceConfig = {
+        ExecStart = "${patched-grafana-agent}/bin/agent -disable-reporting -config.expand-env -config.file ${configFile}";
+        RestartSec = 10;
+        Restart = "always";
+        User = "grafana-agent"; # static user declared below (see the DynamicUser note)
+        Group = "grafana-agent";
+        SupplementaryGroups = [
+          # Allows reading the systemd journal (for loki log forwarding)
+          "systemd-journal"
+        ];
+        StateDirectory = "grafana-agent"; # systemd exports its path as $STATE_DIRECTORY, which the config uses as WAL directory
+        Type = "exec";
+
+        # NOTE: No DynamicUser since that prevents the node_exporter systemd
+        # integration from connecting to the socket (because the dynamic user
+        # has an unknown UID in the root UID namespace, so dbus access control
+        # fails).
+        ProtectSystem = "strict"; # sandboxing options that DynamicUser would otherwise have implied
+        ProtectHome = "tmpfs";
+        RemoveIPC = true;
+        PrivateTmp = true;
+        NoNewPrivileges = true;
+        RestrictSUIDSGID = true;
+      };
+    };
+
+    users.users.grafana-agent = { # static system account matching User=/Group= of the service
+      isSystemUser = true;
+      group = "grafana-agent";
+    };
+    users.groups.grafana-agent = {};
+  };
+}
+
diff --git a/modules/monitoring/idle-load.patch b/modules/monitoring/idle-load.patch
new file mode 100644
index 0000000..435a093
--- /dev/null
+++ b/modules/monitoring/idle-load.patch
@@ -0,0 +1,17 @@
+diff --git a/vendor/github.com/prometheus/prometheus/tsdb/wlog/watcher.go b/vendor/github.com/prometheus/prometheus/tsdb/wlog/watcher.go
+index 5d7c84d3..ef79f016 100644
+--- a/vendor/github.com/prometheus/prometheus/tsdb/wlog/watcher.go
++++ b/vendor/github.com/prometheus/prometheus/tsdb/wlog/watcher.go
+@@ -34,9 +34,9 @@ import (
+ )
+ 
+ const (
+-	readPeriod         = 10 * time.Millisecond
+-	checkpointPeriod   = 5 * time.Second
+-	segmentCheckPeriod = 100 * time.Millisecond
++	readPeriod         = 1 * time.Second
++	checkpointPeriod   = 10 * time.Second
++	segmentCheckPeriod = 5 * time.Second
+ 	consumer           = "consumer"
+ )
+ 
diff --git a/modules/monitoring/monitoring-server.nix b/modules/monitoring/monitoring-server.nix
new file mode 100644
index 0000000..03f28f3
--- /dev/null
+++ b/modules/monitoring/monitoring-server.nix
@@ -0,0 +1,115 @@
+{ config, lib, pkgs, ... }:
+with lib;
+
+let
+  cfg = config.queezle.monitoring.server;
+in {
+  options.queezle.monitoring.server.enable = lib.mkEnableOption "prometheus and grafana server";
+
+  config = mkIf cfg.enable {
+
+    # Local agent simplifies and unifies node scraping (the server host pushes its own metrics like any other node)
+    queezle.monitoring.grafana-agent.enable = true;
+
+    services.grafana = {
+      enable = true;
+      settings = {
+        analytics = { # no usage reporting, no update checks
+          reporting_enabled = false;
+          check_for_updates = false;
+        };
+        server = {
+          domain = "grafana.queezle.xyz";
+          root_url = "https://grafana.queezle.xyz/";
+
+          #protocol = "http";
+          #http_addr = "127.0.0.1";
+          #http_port = 3000;
+          protocol = "socket"; # serve on a unix socket; the TCP variant above stays disabled
+          socket = "/run/nginx-grafana/grafana.sock"; # parent directory is created by the tmpfiles rule below
+          socket_mode = "0777"; # NOTE(review): world-writable socket; access is presumably gated by the 0750 directory — confirm
+        };
+      };
+      # Currently using sqlite database (the postgres variant below is disabled)
+      #database = {
+      #  type = "postgres";
+      #  user = "grafana";
+      #  host = "/var/run/postgresql/";
+      #};
+      provision.enable = true;
+      provision.datasources.settings.datasources = [
+        {
+          name = "prometheus";
+          type = "prometheus";
+          url = "http://localhost:9090"; # the services.prometheus instance configured in this module
+        }
+      ];
+      #provision.dashboards = [
+      #  {
+      #    name = "yaner dashboards";
+      #    options.path = ./dashboards;
+      #    options.foldersFromFilesStructure = true;
+      #    updateIntervalSeconds = 999999999;
+      #  }
+      #];
+    };
+    systemd.tmpfiles.rules = [ "d /run/nginx-grafana 0750 grafana nginx" ]; # socket directory: mode 0750, owner grafana, group nginx
+
+    services.prometheus = { # local Prometheus: scrapes itself and receives remote-write pushes from the agents
+      enable = true;
+      stateDir = "prometheus";
+      #listenAddress = "127.0.0.1"; # port 9090
+      extraFlags = [
+        "--storage.tsdb.retention.size=32GB"
+        "--web.enable-remote-write-receiver" # accept pushed samples on /api/v1/write (proxied by nginx below)
+      ];
+      scrapeConfigs = [
+        {
+          job_name = "prometheus";
+          static_configs = [
+            {
+              targets = [ "127.0.0.1:9090" ];
+              labels.instance = config.networking.hostName;
+            }
+          ];
+        }
+        # Disabled: Grafana uses protocol = "socket" (see services.grafana), so
+        # nothing listens on 127.0.0.1:3000 and this target would always be down.
+        #{
+        #  job_name = "grafana";
+        #  static_configs = [{
+        #    targets = [ "127.0.0.1:3000" ];
+        #    labels.instance = config.networking.hostName;
+        #  }];
+        #}
+      ];
+    };
+
+    # Reverse proxy for remote write endpoint
+    # Encrypted/authenticated by using wireguard and firewall rules
+    # (i.e. no HTTPS to isolate metrics from potential letsencrypt problems)
+    services.nginx = { # NOTE(review): services.nginx.enable is not set in this module — presumably enabled elsewhere; confirm
+      virtualHosts = {
+        "prometheus" = { # same host name as in the agent's default remoteWriteUrl
+          listen = [
+            {
+              # TODO limit to vpn-only ip?
+              addr = "[::]"; # NOTE(review): nginx may bind this IPv6-only (ipv6only defaults to on) — confirm IPv4 clients are not expected
+              port = 99; # same port as in the agent's default remoteWriteUrl
+            }
+          ];
+          forceSSL = false;
+          locations = {
+            "/" = { # everything but the write endpoint is answered with 404
+              return = "404";
+            };
+            "= /api/v1/write" = { # exact-match location: only the remote-write path is forwarded
+              proxyPass = "http://127.0.0.1:9090";
+              proxyWebsockets = true; # NOTE(review): remote write is a plain HTTP POST; websocket headers are probably unnecessary — confirm
+            };
+          };
+        };
+      };
+    };
+  };
+}
-- 
GitLab