# infrastructure/scripts/check-deployment.sh

# SPDX-FileCopyrightText: 2024 Maurice Debray <maurice.debray@dgnum.eu>
#
# SPDX-License-Identifier: EUPL-1.2

# Strict mode: abort on a failing command (-e), on expansion of an unset
# variable (-u), and treat a pipeline as failed if any stage fails (pipefail).
set -euo pipefail
# Run the last stage of a pipeline in the current shell, so that variables
# assigned inside a trailing `while read` loop are still set after the loop.
shopt -s lastpipe

# Help text printed for -h/--help and after every usage error.
usage="$(basename "$0") [-h] [--diff] [NODE]
Check if deployed config is actually the one on master
By default check all nodes

where:
    -h, --help  Show this help text
    --diff      Show diff with nvd

Example:
    check-deployment web01"

#######################################
# Parse the command line.
# Globals:   usage (read)
#            diff  (written) - set to "y" when --diff is given
#            node  (written) - the single optional NODE argument
# Arguments: the script's command-line arguments
# Outputs:   help text on stdout for -h/--help;
#            an error message followed by the help text on stderr otherwise
# Returns:   0 when parsing succeeds; exits 0 on -h/--help and exits 1 on an
#            unknown option or on more than one NODE
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
    --help | -h)
      echo "$usage"
      exit 0
      ;;

    --diff)
      diff=y
      ;;

    -*)
      # Reject unknown flags instead of taking them as the node name: a
      # misspelled option would otherwise make the check loop skip every
      # machine and the script would still exit successfully.
      >&2 echo "Unknown option: $1. Help:"
      >&2 echo "$usage"
      exit 1
      ;;

    *)
      if [[ -z ${node-} ]]; then
        node="$1"
      else
        >&2 echo "Too many arguments. Help:"
        >&2 echo "$usage"
        exit 1
      fi
      ;;
    esac
    shift
  done
}

parse_args "$@"

#############
# go to tmp #
#############

# Resolve the repository root first: outside a git work tree this fails and
# errexit aborts the script before any temporary directory has been created.
GIT_TOP_LEVEL=$(git rev-parse --show-toplevel)
TMP=$(mktemp -d)

# Delete the temporary clone on every exit path (errexit aborts and explicit
# `exit N` calls included), not only when the script runs to completion.
# `rm -f` makes this a no-op when the directory has already been removed.
cleanup_tmp() {
  # Leave the directory before deleting it; failure to cd is not fatal here.
  cd / || true
  rm -rf -- "${TMP:?}"
}
trap cleanup_tmp EXIT

echo "Cloning local main..."
# Clone only the local `main` branch so the check runs against committed
# state rather than the current working tree.
git clone -q --branch main --single-branch "$GIT_TOP_LEVEL" "$TMP"
pushd "$TMP" >/dev/null || exit 2

####################
# Evaluate configs #
####################

#######################################
# Report a failed colmena evaluation and abort.
# Globals:   COLMENA_LOGS (read) - file holding colmena's captured stderr
# Arguments: none
# Outputs:   an error line followed by the captured log, both on stderr
# Returns:   never; exits with status 3
#######################################
colmena_failed() {
  >&2 echo "Colmena failed. Check your config. Logs:"
  >&2 cat "$COLMENA_LOGS"
  # The log has been displayed; remove it so the failure path does not leave
  # a stray temporary file behind.
  rm -f -- "$COLMENA_LOGS"
  exit 3
}

# Scratch file receiving colmena's stderr; colmena_failed prints it on error.
COLMENA_LOGS=$(mktemp)

# Nix expression evaluated by colmena: for every node it yields a record with
# the node name (machine), the system toplevel (path), its derivation (drv)
# and the node's "<hostName>.<domain>" name (domain).
# The '${...}' is Nix interpolation and must not be expanded by the shell.
# shellcheck disable=SC2016
nix_expr='{ nodes, lib, ...}: lib.mapAttrsToList (k: v: { machine = k; path = v.config.system.build.toplevel; drv = v.config.system.build.toplevel.drvPath; domain = "${v.config.networking.hostName}.${v.config.networking.domain}"; }) nodes'

echo "Evaluating configs..."
# stdout is captured into RESULTS, stderr is kept aside in the log file.
if ! RESULTS=$(colmena eval -E "$nix_expr" 2>"$COLMENA_LOGS"); then
  colmena_failed
fi

rm "$COLMENA_LOGS"
echo "Evaluation finished"

#####################################
# retrieve and check current-system #
#####################################

#######################################
# Print the fully resolved target of /run/current-system on a remote host.
# Arguments: $1 - host to connect to over ssh, as root
# Outputs:   whatever `readlink -f /run/current-system` prints on that host
# Returns:   the exit status of ssh
#######################################
retrieve_current_system() {
  local remote="root@${1}"
  # TODO implement a less invasive method
  # -n detaches ssh from stdin so it cannot consume the caller's input.
  ssh -n "$remote" 'readlink -f /run/current-system'
}

#######################################
# Fetch both system closures and print their nvd diff.
# Arguments: $1 - machine name (used in the header line)
#            $2 - host name to copy the deployed closure from
#            $3 - derivation of the expected system, realised locally
#            $4 - expected system path
#            $5 - system path currently active on the host
# Outputs:   tool output and the nvd diff on stdout
# Returns:   non-zero as soon as one step fails, so the caller can fall back
#            to plain output instead of aborting the whole run
#######################################
show_nvd_diff() {
  local machine=$1 domain=$2 drv_path=$3 expected_path=$4 current_path=$5
  nix-copy-closure --from "root@$domain" "$current_path" || return
  nix-store -r "$drv_path" || return
  echo "$machine -> error. nvd output:"
  nvd diff "$expected_path" "$current_path"
}

# Overall exit status: stays 0 only if every checked node could be reached
# and runs the expected system.
return_status=0
# Number of machines that were actually checked (i.e. not skipped).
checked=0

# jq emits one compact JSON object per line, one per node. The loop is the
# last pipeline stage and runs in the current shell (lastpipe), so
# return_status and checked keep their values after `done`.
printf '%s\n' "$RESULTS" | jq -c '.[]' |
  while IFS= read -r c; do

    machine=$(jq -r '.machine' <<<"$c")
    if [[ -n ${node-} && "$machine" != "$node" ]]; then
      echo "Skipping ${machine}"
      continue
    fi
    checked=$((checked + 1))

    expected_path=$(jq -r '.path' <<<"$c")
    domain=$(jq -r '.domain' <<<"$c")
    drv_path=$(jq -r '.drv' <<<"$c")

    # A host that cannot be queried has not been verified: report it and make
    # the script exit non-zero, but keep checking the remaining nodes.
    if ! current_path=$(retrieve_current_system "$domain"); then
      echo "❌ failed to contact $domain !"
      return_status=1
      continue
    fi

    if [[ "$expected_path" == "$current_path" ]]; then
      echo "✅ $machine -> OK"
      continue
    fi

    # From here on the node is known to run something else than expected.
    return_status=1

    if [[ -n ${diff-} ]]; then
      # Calling the helper in a condition keeps errexit from killing the run
      # when a closure cannot be fetched or nvd fails.
      if show_nvd_diff "$machine" "$domain" "$drv_path" "$expected_path" "$current_path"; then
        continue
      fi
      >&2 echo "Could not produce the nvd diff for $machine, showing paths instead"
    fi

    echo "☠️ $machine -> error:"
    echo "   - Expected system: $expected_path"
    echo "   - Current system:  $current_path"
  done

# A NODE argument that matches no machine means nothing was checked at all;
# that must not be reported as success.
if [[ -n ${node-} ]] && ((checked == 0)); then
  >&2 echo "❌ no node named '$node' in the evaluated configuration"
  return_status=1
fi

popd >/dev/null || exit 2
rm -r "$TMP"

exit "$return_status"