feat(sterni/ingeborg): set up monitoring via netdata
Main objective was to get SMART/md monitoring working; alerts go (via some awful glue code) to #sterni.lv on hackint. Bot nick should also be registered in the future. Change-Id: Ia73c5a64ee9f6df62f5fbe21fc1606477e3d6e73 Reviewed-on: https://cl.tvl.fyi/c/depot/+/10174 Reviewed-by: sterni <sternenseemann@systemli.org> Tested-by: BuildkiteCI
This commit is contained in:
parent
c6c1c9f8fc
commit
60ca9ba437
4 changed files with 156 additions and 0 deletions
|
@ -9,6 +9,7 @@
|
||||||
./network.nix
|
./network.nix
|
||||||
# (More or less) pluggable service configuration
|
# (More or less) pluggable service configuration
|
||||||
(depot.path.origSrc + "/ops/modules/btrfs-auto-scrub.nix")
|
(depot.path.origSrc + "/ops/modules/btrfs-auto-scrub.nix")
|
||||||
|
./monitoring.nix
|
||||||
];
|
];
|
||||||
|
|
||||||
config = {
|
config = {
|
||||||
|
|
23
users/sterni/machines/ingeborg/irccat.nix
Normal file
23
users/sterni/machines/ingeborg/irccat.nix
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
# NixOS module: enables the depot irccat service on this machine and points
# it at hackint. Channels are not configured here.
{ depot, config, pkgs, lib, ... }:

let
  # Stand-in secrets file containing an empty JSON object.
  emptySecrets = builtins.toFile "empty.json" "{}"; # TODO(sterni): register
in

{
  imports = [ (depot.path.origSrc + "/ops/modules/irccat.nix") ];

  config.services.depot.irccat = {
    enable = true;
    secretsFile = emptySecrets;

    config = {
      # Listen address for irccat's TCP input.
      tcp.listen = ":4722"; # ircc

      irc = {
        server = "irc.hackint.org:6697";
        tls = true;
        # The bot uses the machine's host name as its nick.
        nick = config.networking.hostName;
        realname = "irccat";
      };
    };
  };
}
|
131
users/sterni/machines/ingeborg/monitoring.nix
Normal file
131
users/sterni/machines/ingeborg/monitoring.nix
Normal file
|
@ -0,0 +1,131 @@
|
||||||
|
{ pkgs, lib, config, ... }:
|
||||||
|
|
||||||
|
let
  # IRC channel that receives the alerts generated by this module.
  ircChannel = "#sterni.lv";

  # Port irccat's TCP listener is configured on. The listen address may be
  # given as ":PORT" or "HOST:PORT"; only the segment after the last ":" is
  # the port, so take that instead of deleting every ":" (which would turn
  # "localhost:4722" into the invalid port "localhost4722").
  irccatPort =
    lib.last
      (lib.splitString ":" config.services.depot.irccat.config.tcp.listen);

  # Build a shell script called `name` that sends one line to irccat over
  # TCP on localhost, addressed to `ircChannel`.
  #
  # name:    name of the generated script.
  # msgExpr: shell source spliced in verbatim as the message argument of
  #          printf. The caller is responsible for quoting; it may refer to
  #          the script's positional parameters ($1, $2, ...).
  mkIrcMessager =
    { name
    , msgExpr
    }:
    pkgs.writeShellScript name ''
      set -euo pipefail
      printf '%s %s\n' ${lib.escapeShellArg ircChannel} ${msgExpr} | \
        ${lib.getBin pkgs.netcat-openbsd}/bin/nc -N localhost ${irccatPort}
    '';

  # Port used for netdata's web interface (see the `web` section below).
  netdataPort = 19999;
in
|
||||||
|
|
||||||
|
{
  imports = [
    ./irccat.nix
  ];

  config = {
    # Add the alert channel to irccat's configured channel list.
    services.depot.irccat.config.irc.channels = [
      ircChannel
    ];

    # Since we have irccat we can wire up mdadm --monitor
    boot.swraid.mdadmConf = ''
      PROGRAM ${
        mkIrcMessager {
          name = "mdmonitor-to-irc";
          # prog EVENT MD_DEVICE COMPONENT_DEVICE
          # The third argument is only appended to the message when it is
          # set and non-empty.
          msgExpr = ''"mdmonitor: $1($2''${3:+, $3})"'';
        }
      }
    '';

    # TODO(sterni): irc notifications (?)
    services = {
      smartd = {
        enable = true;
        autodetect = true;
        # Short self test every day 03:00
        # Long self test every tuesday 05:00
        defaults.autodetected = "-a -o on -s (S/../.././03|L/../../2/05)";
        extraOptions = [
          "-A"
          "/var/log/smartd/"
        ];
      };

      netdata = {
        enable = true;
        config = {
          # Send every netdata log stream to syslog.
          logs = {
            access = "syslog";
            error = "syslog";
            debug = "syslog";
            health = "syslog";
            collector = "syslog";
          };
          # Web interface is bound to localhost only.
          web = {
            "default port" = toString netdataPort;
            "bind to" = "localhost:${toString netdataPort}";
          };
          health = {
            # Replacement alarm script: formats a one-line summary of the
            # alarm and forwards it to IRC via mkIrcMessager.
            "script to execute on alarm" = pkgs.writeShellScript "simple-alarm-notify" ''
              set -euo pipefail

              # This humongous list is copied over from netdata's alarm-notify.sh
              # Every positional parameter is read with an empty default so that
              # a shorter argument list does not abort the script under `set -u`
              # before the alert has been sent.
              roles="''${1:-}" # the roles that should be notified for this event
              args_host="''${2:-}" # the host generated this event
              unique_id="''${3:-}" # the unique id of this event
              alarm_id="''${4:-}" # the unique id of the alarm that generated this event
              event_id="''${5:-}" # the incremental id of the event, for this alarm id
              when="''${6:-}" # the timestamp this event occurred
              name="''${7:-}" # the name of the alarm, as given in netdata health.d entries
              chart="''${8:-}" # the name of the chart (type.id)
              status="''${9:-}" # the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
              old_status="''${10:-}" # the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
              value="''${11:-}" # the current value of the alarm
              old_value="''${12:-}" # the previous value of the alarm
              src="''${13:-}" # the line number and file the alarm has been configured
              duration="''${14:-}" # the duration in seconds of the previous alarm state
              non_clear_duration="''${15:-}" # the total duration in seconds this is/was non-clear
              units="''${16:-}" # the units of the value
              info="''${17:-}" # a short description of the alarm
              value_string="''${18:-}" # friendly value (with units)
              # shellcheck disable=SC2034
              # variable is unused, but https://github.com/netdata/netdata/pull/5164#discussion_r255572947
              old_value_string="''${19:-}" # friendly old value (with units), previously named "old_value_string"
              calc_expression="''${20:-}" # contains the expression that was evaluated to trigger the alarm
              calc_param_values="''${21:-}" # the values of the parameters in the expression, at the time of the evaluation
              total_warnings="''${22:-}" # Total number of alarms in WARNING state
              total_critical="''${23:-}" # Total number of alarms in CRITICAL state
              total_warn_alarms="''${24:-}" # List of alarms in warning state
              total_crit_alarms="''${25:-}" # List of alarms in critical state
              classification="''${26:-}" # The class field from .conf files
              edit_command_line="''${27:-}" # The command to edit the alarm, with the line number
              child_machine_guid="''${28:-}" # the machine_guid of the child
              transition_id="''${29:-}" # the transition_id of the alert
              summary="''${30:-}" # the summary text field of the alert

              # Verify that they haven't extended the arg list
              ARG_COUNT_EXPECTED=30

              if [[ "$#" != "$ARG_COUNT_EXPECTED" ]]; then
                echo "$0: WARNING: unexpected number of arguments: $#. Did netdata add more?" >&2
              fi

              MSG="netdata: $status ''${name//_/ } ($chart): ''${summary//_/ } = $value_string"

              echo "$0: INFO: sending message: $MSG" >&2
              ${
                mkIrcMessager {
                  name = "trivial-send-to-irc";
                  msgExpr = "\"$1\"";
                }
              } "$MSG"
            '';
          };
        };
      };
    };
  };
}
|
|
@ -65,6 +65,7 @@ in
|
||||||
pkgs.htop
|
pkgs.htop
|
||||||
pkgs.foot.terminfo
|
pkgs.foot.terminfo
|
||||||
pkgs.vim
|
pkgs.vim
|
||||||
|
pkgs.smartmontools
|
||||||
];
|
];
|
||||||
|
|
||||||
security.acme = {
|
security.acme = {
|
||||||
|
|
Loading…
Reference in a new issue