feat(nix/dependency-analyzer): find deps among a list of known drvs

This was written with the same intention (and reuses a little of its
code) as cl/5060 and cl/5063: We want to be able to emit dependencies
between //nix/buildkite pipeline steps, so that no agent is occupied
with waiting on locks for derivations built by a different agent.

This dependency information is already available to the Nix store
implementation (e.g. via `nix-store --query --references`) and can also
be obtained in the Nix language which is important, since the pipeline
is generated at evaluation time. (Note: For Nix 2.3, you either need a
strong convention about how derivations expose their dependencies (which
we don't) or rely on store implementation internals (drv files).
For Nix 2.6 there is a better trick, but it also relies on the existence
of drv files.)

The actual task can be formulated as follows: Given a set of
derivations, calculate the the closest derivations also in the input
each derivation depends on. (We call these (next) known dependencies.)
This is crucial because pipeline step often depend on each other only
indirectly with any number of intermediate derivations. For cl/5064 I
determined that 6 intermediate layers is quite common for dependencies
that are perceived to be “direct”.

This problem is solved as follows:

1. Calculate the dependency graph of the combined dependency closure of
   all input derivations. This is quite easy and fairly quick thanks to
   the C++ implementation of builtins.genericClosure. One weak point of
   the current implementation is that the function to determine the
   direct derivation dependencies for Nix < 2.6 is quite hacky.

2. Take the graph from 1. and calculate a dependency graph that only
   connects the known derivations of the input, but retains all
   connections between them (minus intermediate nodes).

In practice the dependency graph is represented as an attribute set
mapping derivation paths to a list of derivation paths it depends on.
The second step is performed by adding a second list of known derivation
paths it depends on.

The main improvements over the previous concept (cl/5060 and cl/5063):

* We only try to find the closest known dependencies in the dependency
  graph whereas we would traverse emit dependencies for the entire
  dependency closure.

* We immediately store the calculation of the closest known dependency
  in the dependency graph, even for intermediate nodes. This avoids
  recalculating the connection (which was a big drawback of the previous
  approach) and makes the calculation itself cheaper.

You can run `mg build //nix/dependency-analyzer:example` to build a
visualization of the internal dependencies between `depot.ci.targets` as
discovered by dependency-analyzer.

Change-Id: If8c0cdfc8470d4b337336257d9818aaa0d51110f
Reviewed-on: https://cl.tvl.fyi/c/depot/+/6832
Tested-by: BuildkiteCI
Reviewed-by: tazjin <tazjin@tvl.su>
This commit is contained in:
sterni 2022-09-26 23:46:52 +02:00
parent ca3bd5c7ca
commit 57d5988b34
4 changed files with 307 additions and 0 deletions

View file

@ -0,0 +1,254 @@
{ lib, depot, pkgs, ... }:
let
inherit (builtins) unsafeDiscardStringContext appendContext;
#
# Utilities
#
# Manipulate string context of the given string so that it only carries a
# `path` reference to itself (so it needs to be a string representation of
# a store path).
#
# This is intended for use on the `drvPath` attribute of derivations which by
# default carries a reference to the corresponding outputs. If we only want to
# read from the `drvPath`, having only a `path` reference makes sure we don't
# need to realise the derivation first.
#
# Type: str -> str
pathContextDrvPath = drvPath:
let
drvPath' = unsafeDiscardStringContext drvPath;
in
appendContext drvPath' { ${drvPath'} = { path = true; }; };
# Find all quoted references to a derivation path in the specified drv file.
# Should correspond to the list of input derivations, but is obviously a big
# HACK as we just grep for store paths that look right. This should eventually
# be solved properly by parsing the drv file.
#
# Type: str -> [str]
directDrvDeps = drvPath: builtins.concatLists (
builtins.filter builtins.isList (
builtins.split
"\"(${lib.escapeRegex builtins.storeDir}/[[:alnum:]+._?=-]+.drv)\""
(builtins.readFile drvPath)
)
);
# Maps a list of derivation to the list of corresponding `drvPath`s.
#
# Type: [drv] -> [str]
drvsToPaths = drvs:
builtins.map (drv: pathContextDrvPath drv.drvPath) drvs;
#
# Calculate map of direct derivation dependencies
#
# Create the dependency map entry for a given `drvPath` which mainly includes
# a list of other `drvPath`s it depends on. Additionally we store whether the
# derivation is `known`, i.e. part of the initial list of derivations we start
# generating the map from
#
# Type: bool -> string -> set
drvEntry = known: drvPath:
let
# key may not refer to a store path, …
key = unsafeDiscardStringContext drvPath;
# but we must read from the .drv file.
path = pathContextDrvPath drvPath;
in
{
inherit key;
# trick so we can call listToAttrs directly on the result of genericClosure
name = key;
value = {
deps = directDrvDeps path;
inherit known;
};
};
# Create an attribute set that maps every derivation in the combined
# dependency closure of the list of input derivation paths to every of their
# direct dependencies. Additionally every entry will have set their `known`
# attribute to `true` if it is in the list of input derivation paths.
#
# Type: [str] -> set
plainDrvDepMap = drvPaths:
builtins.listToAttrs (
builtins.genericClosure {
startSet = builtins.map (drvEntry true) drvPaths;
operator = { value, ... }: builtins.map (drvEntry false) value.deps;
}
);
#
# Calculate closest known dependencies in the dependency map
#
inherit (depot.nix.stateMonad)
after
bind
for_
get
getAttr
run
setAttr
pure
;
# This is an action in stateMonad which expects the (initial) state to have
# been produced by `plainDrvDepMap`. Given a `drvPath`, it calculates a
# `knownDeps` list which holds the `drvPath`s of the closest derivation marked
# as `known` along every edge. This list is inserted into the dependency map
# for `drvPath` and every other derivation in its dependecy closure (unless
# the information was already present). This means that the known dependency
# information for a derivation never has to be recalculated, as long as they
# are part of the same stateful computation.
#
# The upshot is that after calling `insertKnownDeps drvPath`,
# `fmap (builtins.getAttr "knownDeps") (getAttr drvPath)` will always succeed.
#
# Type: str -> stateMonad drvDepMap null
insertKnownDeps = drvPathWithContext:
let
# We no longer need to read from the store, so context is irrelevant, but
# we need to check for attr names which requires the absence of context.
drvPath = unsafeDiscardStringContext drvPathWithContext;
in
bind get (initDepMap:
# Get the dependency map's state before we've done anything to obtain the
# entry we'll be manipulating later as well as its dependencies.
let
entryPoint = initDepMap.${drvPath};
# We don't need to recurse if our direct dependencies either have their
# knownDeps list already populated or are known dependencies themselves.
depsPrecalculated =
builtins.partition
(dep:
initDepMap.${dep}.known
|| initDepMap.${dep} ? knownDeps
)
entryPoint.deps;
# If a direct dependency is known, it goes right to our known dependency
# list. If it is unknown, we can copy its knownDeps list into our own.
initiallyKnownDeps =
builtins.concatLists (
builtins.map
(dep:
if initDepMap.${dep}.known
then [ dep ]
else initDepMap.${dep}.knownDeps
)
depsPrecalculated.right
);
in
# If the information was already calculated before, we can exit right away
if entryPoint ? knownDeps
then pure null
else
after
# For all unknown direct dependencies which don't have a `knownDeps`
# list, we call ourselves recursively to populate it. Since this is
# done sequentially in the state monad, we avoid recalculating the
# list for the same derivation multiple times.
(for_
depsPrecalculated.wrong
insertKnownDeps)
# After this we can obtain the updated dependency map which will have
# a `knownDeps` list for all our direct dependencies and update the
# entry for the input `drvPath`.
(bind
get
(populatedDepMap:
(setAttr drvPath (entryPoint // {
knownDeps =
lib.unique (
initiallyKnownDeps
++ builtins.concatLists (
builtins.map
(dep: populatedDepMap.${dep}.knownDeps)
depsPrecalculated.wrong
)
);
}))))
);
# This function puts it all together and is exposed via `__functor`.
#
# For a list of `drvPath`s, calculate an attribute set which maps every
# `drvPath` to a set of the following form:
#
# {
# known = true /* if it is in the list of input derivation paths */;
# deps = [
# /* list of derivation paths it depends on directly */
# ];
# knownDeps = [
# /* list of the closest derivation paths marked as known this
# derivation depends on.
# */
# ];
# }
knownDrvDepMap = knownDrvPaths:
run
(plainDrvDepMap knownDrvPaths)
(after
(for_
knownDrvPaths
insertKnownDeps)
get);
#
# Other things based on knownDrvDepMap
#
# Create a SVG visualizing `knownDrvDepMap`. Nodes are identified by derivation
# name, so multiple entries can be collapsed if they have the same name.
#
# Type: [drv] -> drv
knownDependencyGraph = name: drvs:
let
justName = drvPath:
builtins.substring
(builtins.stringLength builtins.storeDir + 1 + 32 + 1)
(builtins.stringLength drvPath)
(unsafeDiscardStringContext drvPath);
gv = pkgs.writeText "${name}-dependency-analysis.gv" ''
digraph depot {
${
(lib.concatStringsSep "\n"
(lib.mapAttrsToList (name: value:
if !value.known then ""
else lib.concatMapStringsSep "\n"
(knownDep: " \"${justName name}\" -> \"${justName knownDep}\"")
value.knownDeps
)
(depot.nix.dependency-analyzer (
drvsToPaths drvs
))))
}
}
'';
in
pkgs.runCommand "${name}-dependency-analysis.svg"
{
nativeBuildInputs = [
pkgs.buildPackages.graphviz
];
}
"dot -Tsvg < ${gv} > $out";
in
{
__functor = _: knownDrvDepMap;
inherit knownDependencyGraph plainDrvDepMap drvsToPaths;
}

View file

@ -0,0 +1,12 @@
{ depot, lib, ... }:
(
depot.nix.dependency-analyzer.knownDependencyGraph
"depot"
depot.ci.targets
).overrideAttrs (old: {
# Causes an infinite recursion via ci.targets otherwise
meta = lib.recursiveUpdate (old.meta or { }) {
ci.skip = true;
};
})

View file

@ -0,0 +1,5 @@
{ depot, lib, ... }:
depot.nix.dependency-analyzer.knownDependencyGraph "3p-lisp" (
builtins.filter lib.isDerivation (builtins.attrValues depot.third_party.lisp)
)

View file

@ -0,0 +1,36 @@
{ depot, lib, ... }:
let
inherit (depot.nix.runTestsuite)
runTestsuite
assertEq
it
;
inherit (depot.nix.dependency-analyzer)
plainDrvDepMap
drvsToPaths
;
knownDrvs = drvsToPaths (
builtins.filter lib.isDerivation (builtins.attrValues depot.third_party.lisp)
);
exampleMap = plainDrvDepMap knownDrvs;
# These will be needed to index into the attribute set which can't have context
# in the attribute names.
knownDrvsNoContext = builtins.map builtins.unsafeDiscardStringContext knownDrvs;
in
runTestsuite "dependency-analyzer" [
(it "checks plainDrvDepMap properties" [
(assertEq "all known drvs are marked known"
(builtins.all (drv: exampleMap.${drv}.known) knownDrvsNoContext)
true)
(assertEq "no unknown drv is marked known"
(builtins.all (entry: !entry.known) (
builtins.attrValues (builtins.removeAttrs exampleMap knownDrvsNoContext)
))
true)
])
]