tvl-depot/ops/pipelines/static-pipeline.yaml
Florian Klink bb5d7c9678 feat(ops/pipelines): support buildkite retries
cl/12228 enabled automatic retries for some flaky tests, which
generally worked, as can be seen in
https://buildkite.com/tvl/depot/builds/35893

However, "🦆" still reports as failing, because it fails whenever the
number of failed steps is nonzero, which is still the case after a
failed step has been retried successfully.

We cannot check for the overall status of the build, as it's still
"RUNNING", but instead of counting all failed steps so far, we can query
all failed jobs and then filter out the ones that were already retried.

Change-Id: Ib9d27587c8a8ba7970850812c4302fecdc4482e7
Reviewed-on: https://cl.tvl.fyi/c/depot/+/12233
Tested-by: BuildkiteCI
Reviewed-by: tazjin <tazjin@tvl.su>
2024-08-19 10:07:08 +00:00

136 lines
4.9 KiB
YAML

# This file defines the static Buildkite pipeline which attempts to
# create the dynamic pipeline of all depot targets.
#
# If something fails during the creation of the pipeline, the fallback
# is executed instead which will simply report an error to Gerrit.
---
env:
  # Path of the file holding the Buildkite GraphQL API token. The
  # ":duck:" step reads this file to authenticate its API query.
  BUILDKITE_TOKEN_PATH: '/run/agenix/buildkite-graphql-token'
steps:
# Trigger the "tvl-kit" pipeline for every new commit on canon.
# tvl-kit is not part of the depot build tree, so running it is a
# useful check that we don't break external things (too much).
# The trigger is asynchronous: this build does not wait for it.
- label: ":fork:"
  trigger: "tvl-kit"
  branches: "refs/heads/canon"
  async: true
  build:
    message: "Verification triggered by ${BUILDKITE_COMMIT}"
# Trigger the "tvix" pipeline for every new commit on canon.
# tvix is not part of the depot build tree, so running it is a
# useful check that we don't break external things (too much).
# The trigger is asynchronous: this build does not wait for it.
- label: ":fork:"
  trigger: "tvix"
  branches: "refs/heads/canon"
  async: true
  build:
    message: "Verification triggered by ${BUILDKITE_COMMIT}"
# On canon builds, publish a revision number for the current commit
# as the ref `refs/r/<number>`.
#
# The revision number is the count of commits in the lineage of
# HEAD, following only the first parent of merges.
#
# The push writes back to Gerrit using the Buildkite agent
# credentials, which are injected through a git credentials helper.
#
# git does not fetch these refs by default. To get them, extend
# your git config with
# `git config --add remote.origin.fetch '+refs/r/*:refs/r/*'`;
# the refs become available after the next `git fetch`.
- label: ":git:"
  branches: "refs/heads/canon"
  command: |
    git -c 'credential.helper=gerrit-creds' \
      push origin "HEAD:refs/r/$(git rev-list --count --first-parent HEAD)"
# Generate & upload dynamic build steps.
#
# The step builds `ops.pipelines.depot` with nix-build, uploads each
# resulting `build-chunk-*.json` file as pipeline steps, and stores
# the whole `pipeline/` output as build artifacts.
- label: ":llama:"
  key: "pipeline-gen"
  concurrency_group: 'depot-nix-eval'
  concurrency: 5  # much more than this and whitby will OOM
  command: |
    set -ue

    # On Gerrit-triggered builds, annotate the build with a link to
    # the change and patchset being built.
    if test -n "$${GERRIT_CHANGE_URL-}"; then
      echo "This is a build of [cl/$$GERRIT_CHANGE_ID]($$GERRIT_CHANGE_URL) (at patchset #$$GERRIT_PATCHSET)" | \
        buildkite-agent annotate --context cl-annotation
    fi

    # Attempt to fetch a target map from a parent commit on canon,
    # except on builds of canon itself.
    #
    # A false test does not abort the script: errexit ignores a
    # failing non-final command of an && list.
    [ "${BUILDKITE_BRANCH}" != "refs/heads/canon" ] && \
      nix/buildkite/fetch-parent-targets.sh

    # PIPELINE_ARGS is expanded unquoted below on purpose, so that it
    # splits into separate nix-build arguments (or into nothing).
    PIPELINE_ARGS=""
    if [[ -f tmp/parent-target-map.json ]]; then
      PIPELINE_ARGS="--arg parentTargetMap tmp/parent-target-map.json"
    fi

    nix-build --option restrict-eval true --include "depot=$${PWD}" \
      --include "store=/nix/store" \
      --allowed-uris 'https://' \
      -A ops.pipelines.depot \
      -o pipeline --show-trace $$PIPELINE_ARGS

    # Steps need to be uploaded in reverse order because pipeline
    # upload prepends instead of appending.
    #
    # The file name is read verbatim (-r) and passed quoted, so it is
    # never subject to backslash processing or word splitting.
    ls pipeline/build-chunk-*.json | tac | while read -r chunk; do
      buildkite-agent pipeline upload "$$chunk"
    done

    buildkite-agent artifact upload "pipeline/*"
# Barrier: wait for all previous steps to complete. The build
# continues past this point even if some of those steps failed.
- wait: null
  continue_on_failure: true
# Exit with success or failure depending on whether any other job
# failed without being retried.
#
# This information is checked by querying the Buildkite GraphQL API
# for all jobs of this build that did not pass, then filtering out
# the ones that were retried (a retried job creates a new job, which
# would also show up in the query if it failed again).
#
# This step must be :duck: (yes, really!) because the post-command
# hook will inspect this name.
#
# Note that this step has requirements for the agent environment, which
# are enforced in our NixOS configuration:
#
# * curl and jq must be on the $PATH of build agents
# * besadii configuration must be readable to the build agents
- label: ":duck:"
  key: ":duck:"
  command: |
    set -ueo pipefail

    # Assign first and mark the variable readonly afterwards. When
    # both happen in one statement, the statement returns the status
    # of readonly, which hides a failing curl or jq from errexit and
    # lets this step pass with an empty count.
    FAILED_JOBS=$(curl 'https://graphql.buildkite.com/v1' \
      --silent \
      -H "Authorization: Bearer $(cat ${BUILDKITE_TOKEN_PATH})" \
      -d "{\"query\": \"query BuildStatusQuery { build(uuid: \\\"$BUILDKITE_BUILD_ID\\\") { jobs(passed: false, first: 500 ) { edges { node { ... on JobTypeCommand { retried } } } } } }\"}" | \
      jq -r '.data.build.jobs.edges | map(select(.node.retried == false)) | length')
    readonly FAILED_JOBS

    echo "$$FAILED_JOBS build jobs failed."

    if (( $$FAILED_JOBS > 0 )); then
      exit 1
    fi
# After duck, on success, upload and run any release steps that were
# output by the dynamic pipeline.
#
# The release chunks are taken from the `pipeline/*` artifacts that
# were uploaded earlier in this build.
- label: ":arrow_heading_down:"
  depends_on:
  - step: ":duck:"
    allow_failure: false
  command: |
    set -ueo pipefail

    buildkite-agent artifact download "pipeline/*" .

    # find prints entries in unspecified directory order, so the list
    # is sorted explicitly. Reverse order matches how the build chunks
    # are uploaded (last chunk first).
    find ./pipeline -name 'release-chunk-*.json' | sort -r | while read -r chunk; do
      buildkite-agent pipeline upload "$$chunk"
    done