Add metrics monitoring stack with VictoriaMetrics, Grafana, and node_exporter

Implement complete monitoring infrastructure following rick-infra principles:

Components:
- VictoriaMetrics: Prometheus-compatible TSDB (7x less RAM usage)
- Grafana: Visualization dashboard with Authentik OAuth/OIDC integration
- node_exporter: System metrics collection (CPU, memory, disk, network)

Architecture:
- All services run as native systemd binaries (no containers)
- localhost-only binding for security
- Grafana uses native OAuth integration with Authentik (not forward_auth)
- Full systemd security hardening enabled
- Proxied via Caddy at metrics.jnss.me with HTTPS

Role Features:
- Unified metrics role (single role for complete stack)
- Automatic role mapping via Authentik groups:
  - authentik Admins OR grafana-admins -> Admin access
  - grafana-editors -> Editor access
  - All others -> Viewer access
- VictoriaMetrics auto-provisioned as default Grafana datasource
- 12-month metrics retention by default
- Comprehensive documentation included

Security:
- OAuth/OIDC SSO via Authentik
- All metrics services bind to 127.0.0.1 only
- systemd hardening (NoNewPrivileges, ProtectSystem, etc.)
- Grafana accessible only via Caddy HTTPS proxy

Documentation:
- roles/metrics/README.md: Complete role documentation
- docs/metrics-deployment-guide.md: Step-by-step deployment guide

Configuration:
- Updated rick-infra.yml to include metrics deployment
- Grafana port set to 3001 (Gitea uses 3000)
- Ready for multi-host expansion (designed for future node_exporter deployment to production hosts)
This commit is contained in:
2025-12-28 19:18:30 +01:00
parent eca3b4a726
commit 1f3f111d88
19 changed files with 1345 additions and 4 deletions

View File

@@ -0,0 +1,12 @@
apiVersion: 1
providers:
- name: 'default'
orgId: 1
folder: ''
type: file
disableDeletion: false
updateIntervalSeconds: 10
allowUiUpdates: true
options:
path: {{ grafana_data_dir }}/dashboards

View File

@@ -0,0 +1,12 @@
apiVersion: 1
datasources:
- name: {{ grafana_datasource_vm_name }}
type: prometheus
access: proxy
url: {{ grafana_datasource_vm_url }}
isDefault: true
editable: true
jsonData:
httpMethod: POST
timeInterval: 15s

View File

@@ -0,0 +1,26 @@
# Grafana Metrics Dashboard
{{ grafana_domain }} {
reverse_proxy http://{{ grafana_listen_address }}:{{ grafana_listen_port }} {
header_up Host {host}
header_up X-Real-IP {remote_host}
header_up X-Forwarded-Proto https
header_up X-Forwarded-For {remote_host}
header_up X-Forwarded-Host {host}
}
# Security headers
header {
X-Frame-Options SAMEORIGIN
X-Content-Type-Options nosniff
X-XSS-Protection "1; mode=block"
Referrer-Policy strict-origin-when-cross-origin
Strict-Transport-Security "max-age=31536000; includeSubDomains; preload"
}
# Logging
log {
output file {{ caddy_log_dir }}/grafana.log
level INFO
format json
}
}

View File

@@ -0,0 +1,68 @@
# Grafana Configuration
# Managed by Ansible - DO NOT EDIT MANUALLY
[paths]
data = {{ grafana_data_dir }}
logs = {{ grafana_logs_dir }}
plugins = {{ grafana_plugins_dir }}
provisioning = {{ grafana_provisioning_dir }}
[server]
http_addr = {{ grafana_listen_address }}
http_port = {{ grafana_listen_port }}
domain = {{ grafana_domain }}
root_url = {{ grafana_root_url }}
enforce_domain = true
enable_gzip = true
[database]
type = {{ grafana_database_type }}
{% if grafana_database_type == 'sqlite3' %}
path = {{ grafana_database_path }}
{% endif %}
[security]
admin_user = {{ grafana_admin_user }}
admin_password = {{ grafana_admin_password }}
secret_key = {{ vault_grafana_secret_key }}
cookie_secure = {{ grafana_cookie_secure | lower }}
cookie_samesite = {{ grafana_cookie_samesite }}
disable_gravatar = true
disable_initial_admin_creation = false
[users]
allow_sign_up = {{ grafana_allow_signup | lower }}
allow_org_create = false
auto_assign_org = true
auto_assign_org_role = Viewer
[auth]
disable_login_form = {{ grafana_disable_login_form | lower }}
oauth_auto_login = false
{% if grafana_oauth_enabled %}
[auth.generic_oauth]
enabled = true
name = {{ grafana_oauth_name }}
client_id = {{ grafana_oauth_client_id }}
client_secret = {{ grafana_oauth_client_secret }}
scopes = {{ grafana_oauth_scopes }}
auth_url = {{ grafana_oauth_auth_url }}
token_url = {{ grafana_oauth_token_url }}
api_url = {{ grafana_oauth_api_url }}
allow_sign_up = {{ grafana_oauth_allow_sign_up | lower }}
role_attribute_path = {{ grafana_oauth_role_attribute_path }}
use_pkce = true
{% endif %}
[log]
mode = console
level = info
[analytics]
reporting_enabled = false
check_for_updates = false
check_for_plugin_updates = false
[snapshots]
external_enabled = false

View File

@@ -0,0 +1,36 @@
[Unit]
Description=Grafana visualization platform
Documentation=https://grafana.com/docs/
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
User={{ grafana_user }}
Group={{ grafana_group }}
WorkingDirectory=/opt/grafana
ExecStart=/opt/grafana/bin/grafana-server \
--config=/etc/grafana/grafana.ini \
--homepath=/opt/grafana
Restart=on-failure
RestartSec=5s
# Security hardening
{% if grafana_systemd_security %}
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths={{ grafana_data_dir }} {{ grafana_logs_dir }}
ProtectKernelTunables=true
ProtectKernelModules=true
ProtectControlGroups=true
RestrictRealtime=true
RestrictNamespaces=true
LockPersonality=true
{% endif %}
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,42 @@
[Unit]
Description=Prometheus Node Exporter
Documentation=https://github.com/prometheus/node_exporter
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
User={{ node_exporter_user }}
Group={{ node_exporter_group }}
ExecStart=/usr/local/bin/node_exporter \
--web.listen-address={{ node_exporter_listen_address }} \
{% for collector in node_exporter_enabled_collectors %}
--collector.{{ collector }} \
{% endfor %}
{% for collector in node_exporter_disabled_collectors %}
--no-collector.{{ collector }} \
{% endfor %}
--collector.filesystem.fs-types-exclude="{{ node_exporter_filesystem_ignored_fs_types | join('|') }}" \
--collector.filesystem.mount-points-exclude="{{ node_exporter_filesystem_ignored_mount_points | join('|') }}"
Restart=on-failure
RestartSec=5s
# Security hardening
{% if node_exporter_systemd_security %}
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
ProtectKernelTunables=true
ProtectKernelModules=true
ProtectControlGroups=true
RestrictRealtime=true
RestrictNamespaces=true
LockPersonality=true
ReadOnlyPaths=/
{% endif %}
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,22 @@
global:
scrape_interval: {{ victoriametrics_scrape_interval }}
scrape_timeout: {{ victoriametrics_scrape_timeout }}
external_labels:
environment: '{{ "homelab" if inventory_hostname in groups["homelab"] else "production" }}'
host: '{{ inventory_hostname }}'
scrape_configs:
# VictoriaMetrics self-monitoring
- job_name: 'victoriametrics'
static_configs:
- targets: ['{{ victoriametrics_listen_address }}']
labels:
service: 'victoriametrics'
# Node exporter for system metrics
- job_name: 'node'
static_configs:
- targets: ['{{ node_exporter_listen_address }}']
labels:
service: 'node_exporter'
instance: '{{ inventory_hostname }}'

View File

@@ -0,0 +1,41 @@
[Unit]
Description=VictoriaMetrics time-series database
Documentation=https://docs.victoriametrics.com/
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
User={{ victoriametrics_user }}
Group={{ victoriametrics_group }}
ExecStart=/usr/local/bin/victoria-metrics-prod \
-storageDataPath={{ victoriametrics_data_dir }} \
-retentionPeriod={{ victoriametrics_retention_period }} \
-httpListenAddr={{ victoriametrics_listen_address }} \
-promscrape.config={{ victoriametrics_scrape_config_file }} \
-memory.allowedPercent={{ victoriametrics_memory_allowed_percent }} \
-storage.minFreeDiskSpaceBytes={{ victoriametrics_storage_min_free_disk_space_bytes }}
ExecReload=/bin/kill -HUP $MAINPID
Restart=on-failure
RestartSec=5s
# Security hardening
{% if victoriametrics_systemd_security %}
NoNewPrivileges=true
PrivateTmp=true
ProtectSystem=strict
ProtectHome=true
ReadWritePaths={{ victoriametrics_data_dir }}
ProtectKernelTunables=true
ProtectKernelModules=true
ProtectControlGroups=true
RestrictRealtime=true
RestrictNamespaces=true
LockPersonality=true
{% endif %}
[Install]
WantedBy=multi-user.target