refactor(tvix/build): add stricter BuildRequest type

Change-Id: I2950c76bbc2227952e583426bfb3ed34e8da6d2d
Reviewed-on: https://cl.tvl.fyi/c/depot/+/12625
Reviewed-by: flokli <flokli@flokli.de>
Tested-by: BuildkiteCI
This commit is contained in:
Marijan Petričević 2024-10-14 15:04:31 -05:00
parent 1c1eb68678
commit a247b25097
3 changed files with 315 additions and 3 deletions

View file

@ -0,0 +1,131 @@
use std::collections::{BTreeMap, HashSet};
use std::path::PathBuf;
use bytes::Bytes;
use tvix_castore::{Node, PathComponent};
/// A BuildRequest describes the request of something to be run on the builder.
/// It is distinct from an actual \[Build\] that has already happened, or might be
/// currently ongoing.
///
/// A BuildRequest can be seen as a more normalized version of a Derivation
/// (parsed from A-Term), "writing out" some of the Nix-internal details about
/// how e.g. environment variables in the build are set.
///
/// Nix has some impurities when building a Derivation, for example the --cores option
/// ends up as an environment variable in the build, that's not part of the ATerm.
///
/// As of now, we serialize this into the BuildRequest, so builders can stay dumb.
/// This might change in the future.
///
/// There's also a big difference when it comes to how inputs are modelled:
///
/// * Nix only uses store path (strings) to describe the inputs.
/// As store paths can be input-addressed, a certain store path can contain
/// different contents (as not all store paths are binary reproducible).
/// This requires that for every input-addressed input, the builder has access
/// to either the input's deriver (and needs to build it) or else a trusted
/// source for the built input.
/// Uploading input-addressed paths is what requires the trusted users
/// concept in Nix.
/// * tvix-build records a list of tvix.castore.v1.Node as inputs.
/// These map from the store path base name to their contents, relieving the
/// builder from having to "trust" any input-addressed paths, contrary to Nix.
///
/// While this approach gives a better hermeticity, it has one downside:
/// A BuildRequest can only be sent once the contents of all its inputs are known.
///
/// As of now, we're okay to accept this, but it prevents uploading an
/// entirely-non-IFD subgraph of BuildRequests eagerly.
#[derive(Clone, PartialEq)]
pub struct BuildRequest {
/// All root nodes that should be visible in `inputs_dir` at the
/// time of the build, keyed by their name.
/// As all references are content-addressed, no additional signatures are
/// needed to substitute / make these available in the build environment.
pub inputs: BTreeMap<PathComponent, Node>,
/// The command (and its args) executed as the build script.
/// In the case of a Nix derivation, this is usually
/// \["/path/to/some-bash/bin/bash", "-e", "/path/to/some/builder.sh"\].
pub command_args: Vec<String>,
/// The working dir of the command, relative to the build root.
/// "build", in the case of Nix.
/// This MUST be a clean relative path, without any ".", "..", or superfluous
/// slashes.
pub working_dir: PathBuf,
/// A list of "scratch" paths, relative to the build root.
/// These will be write-able during the build.
/// \[build, nix/store\] in the case of Nix.
/// These MUST be clean relative paths, without any ".", "..", or superfluous
/// slashes, and sorted.
pub scratch_paths: Vec<PathBuf>,
/// The path where the castore input nodes will be located at,
/// "nix/store" in case of Nix.
/// Builds might also write into here (Nix builds do that).
/// This MUST be a clean relative path, without any ".", "..", or superfluous
/// slashes.
pub inputs_dir: PathBuf,
/// The list of output paths the build is expected to produce,
/// relative to the root.
/// If the path is not produced, the build is considered to have failed.
/// These MUST be clean relative paths, without any ".", "..", or superfluous
/// slashes, and sorted.
pub outputs: Vec<PathBuf>,
/// The list of environment variables and their values that should be set
/// inside the build environment.
/// This includes both environment vars set inside the derivation, as well as
/// more "ephemeral" ones like NIX_BUILD_CORES, controlled by the `--cores`
/// CLI option of `nix-build`.
/// For now, we consume this as an option when turning a Derivation into a BuildRequest,
/// similar to how Nix has a `--cores` option.
/// We don't want to bleed these very nix-specific sandbox impl details into
/// (dumber) builders if we don't have to.
/// Environment variables are sorted by their keys.
pub environment_vars: Vec<EnvVar>,
/// A set of constraints that need to be satisfied on a build host before a
/// Build can be started.
pub constraints: HashSet<BuildConstraints>,
/// Additional (small) files and their contents that should be placed into the
/// build environment, but outside inputs_dir.
/// Used for passAsFile and structuredAttrs in Nix.
pub additional_files: Vec<AdditionalFile>,
/// If this is a non-empty list, all paths in `outputs` are scanned for these.
/// For Nix, `refscan_needles` would be populated with the nixbase32 hash parts of
/// every input store path and output store path. The latter is necessary to scan
/// for references between multi-output derivations.
pub refscan_needles: Vec<String>,
}
/// A single environment variable to set inside the build environment.
#[derive(Debug, Clone, PartialEq)]
pub struct EnvVar {
    /// Name of the environment variable. Must not be empty and must not
    /// contain `=`.
    pub key: String,
    /// Value of the environment variable, as raw bytes.
    pub value: Bytes,
}
/// BuildConstraints represents certain conditions that must be fulfilled
/// inside the build environment to be able to build this.
/// Constraints can be things like required architecture and minimum amount of memory.
/// The required input paths are *not* represented in here, because it
/// wouldn't be hermetic enough - see the comment around inputs too.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum BuildConstraints {
    /// The system that's needed to execute the build.
    /// Must not be empty.
    System(String),
    /// The amount of memory required to be available for the build, in bytes.
    MinMemory(u64),
    /// An absolute path that needs to be available in the build
    /// environment, like `/dev/kvm`.
    /// This is distinct from the castore nodes in inputs.
    /// This MUST be a clean absolute path, without any ".", "..", or
    /// superfluous slashes.
    AvailableReadOnlyPath(PathBuf),
    /// Whether the build should be able to access the network.
    NetworkAccess,
    /// Whether to provide a /bin/sh inside the build environment, usually a static bash.
    ProvideBinSh,
}
/// A (small) file, given by path and contents, that should be placed into the
/// build environment.
#[derive(Debug, Clone, PartialEq)]
pub struct AdditionalFile {
    /// Where the file should be placed.
    /// The conversion from the protobuf message only accepts clean relative
    /// paths here.
    pub path: PathBuf,
    /// The raw contents of the file.
    pub contents: Bytes,
}

View file

@ -1,7 +1,9 @@
use tonic::async_trait;
use crate::proto::{Build, BuildRequest};
use crate::proto::{self, Build};
pub mod build_request;
pub use crate::buildservice::build_request::*;
mod dummy;
mod from_addr;
mod grpc;
@ -15,5 +17,5 @@ pub use from_addr::from_addr;
/// Interface of something that is able to run builds.
/// Implementors have to be both `Send` and `Sync`.
#[async_trait]
pub trait BuildService: Send + Sync {
/// Takes a build request and asynchronously yields either the resulting
/// `Build` or a `std::io::Error`.
/// TODO: document the exact semantics; they are not visible from the trait alone.
async fn do_build(&self, request: BuildRequest) -> std::io::Result<Build>;
async fn do_build(&self, request: proto::BuildRequest) -> std::io::Result<Build>;
}

View file

@ -1,7 +1,8 @@
use std::collections::{BTreeMap, HashSet};
use std::path::{Path, PathBuf};
use itertools::Itertools;
use tvix_castore::DirectoryError;
use tvix_castore::{DirectoryError, Node, PathComponent};
mod grpc_buildservice_wrapper;
@ -201,6 +202,101 @@ impl BuildRequest {
}
}
/// Validates a protobuf `BuildRequest` and turns it into the stricter
/// [crate::buildservice::BuildRequest].
///
/// Checks run in this order and the first failure is returned as a
/// [ValidateBuildRequestError]: inputs, working_dir, scratch_paths,
/// inputs_dir, outputs, environment_vars, constraints, additional_files.
/// For list fields, invalid elements are reported before ordering problems.
impl TryFrom<BuildRequest> for crate::buildservice::BuildRequest {
    type Error = ValidateBuildRequestError;

    fn try_from(value: BuildRequest) -> Result<Self, Self::Error> {
        // validate input names. Each name has to be strictly greater than its
        // predecessor, which also rejects duplicates.
        let mut last_name: bytes::Bytes = "".into();
        let mut inputs: BTreeMap<PathComponent, Node> = BTreeMap::new();
        // `value.inputs` is not read again below, so the nodes are consumed
        // here instead of being cloned one by one.
        for (i, node) in value.inputs.into_iter().enumerate() {
            let (name, node) = node
                .into_name_and_node()
                .map_err(|e| ValidateBuildRequestError::InvalidInputNode(i, e))?;

            if name.as_ref() <= last_name.as_ref() {
                return Err(ValidateBuildRequestError::InputNodesNotSorted);
            }
            inputs.insert(name.clone(), node);
            last_name = name.into();
        }

        // validate working_dir
        if !is_clean_relative_path(&value.working_dir) {
            return Err(ValidateBuildRequestError::InvalidWorkingDir);
        }

        // validate scratch paths
        if let Some(i) = value
            .scratch_paths
            .iter()
            .position(|p| !is_clean_relative_path(p))
        {
            return Err(ValidateBuildRequestError::InvalidScratchPath(i));
        }
        if !is_sorted(value.scratch_paths.iter().map(|e| e.as_bytes())) {
            return Err(ValidateBuildRequestError::ScratchPathsNotSorted);
        }

        // validate inputs_dir
        if !is_clean_relative_path(&value.inputs_dir) {
            return Err(ValidateBuildRequestError::InvalidInputsDir);
        }

        // validate outputs
        if let Some(i) = value
            .outputs
            .iter()
            .position(|p| !is_clean_relative_path(p))
        {
            return Err(ValidateBuildRequestError::InvalidOutputPath(i));
        }
        if !is_sorted(value.outputs.iter().map(|e| e.as_bytes())) {
            return Err(ValidateBuildRequestError::OutputsNotSorted);
        }

        // validate environment_vars: keys must be non-empty and free of '='.
        if let Some(i) = value
            .environment_vars
            .iter()
            .position(|e| e.key.is_empty() || e.key.contains('='))
        {
            return Err(ValidateBuildRequestError::InvalidEnvVar(i));
        }
        if !is_sorted(value.environment_vars.iter().map(|e| e.key.as_bytes())) {
            return Err(ValidateBuildRequestError::EnvVarNotSorted);
        }

        // validate build constraints. An absent constraints message results in
        // an empty set.
        let constraints = value
            .constraints
            .map_or(Ok(HashSet::new()), |constraints| {
                constraints
                    .try_into()
                    .map_err(ValidateBuildRequestError::InvalidBuildConstraints)
            })?;

        // validate additional_files
        if let Some(i) = value
            .additional_files
            .iter()
            .position(|f| !is_clean_relative_path(&f.path))
        {
            return Err(ValidateBuildRequestError::InvalidAdditionalFilePath(i));
        }
        if !is_sorted(value.additional_files.iter().map(|e| e.path.as_bytes())) {
            return Err(ValidateBuildRequestError::AdditionalFilesNotSorted);
        }

        // Everything checked out. The remaining fields are moved over; the
        // path strings are converted by value, so they are not copied.
        Ok(Self {
            inputs,
            command_args: value.command_args,
            working_dir: PathBuf::from(value.working_dir),
            scratch_paths: value
                .scratch_paths
                .into_iter()
                .map(PathBuf::from)
                .collect(),
            inputs_dir: PathBuf::from(value.inputs_dir),
            outputs: value.outputs.into_iter().map(PathBuf::from).collect(),
            environment_vars: value.environment_vars.into_iter().map(Into::into).collect(),
            constraints,
            additional_files: value.additional_files.into_iter().map(Into::into).collect(),
            refscan_needles: value.refscan_needles,
        })
    }
}
/// Errors that occur during the validation of
/// [build_request::BuildConstraints] messages.
#[derive(Debug, thiserror::Error)]
@ -235,7 +331,90 @@ impl build_request::BuildConstraints {
}
}
/// Moves the key and value of a protobuf `EnvVar` into the stricter
/// [crate::buildservice::EnvVar]. No validation happens here.
impl From<build_request::EnvVar> for crate::buildservice::EnvVar {
    fn from(proto_var: build_request::EnvVar) -> Self {
        let (key, value) = (proto_var.key, proto_var.value);
        Self { key, value }
    }
}
/// Moves the key and value of a [crate::buildservice::EnvVar] back into its
/// protobuf representation.
impl From<crate::buildservice::EnvVar> for build_request::EnvVar {
    fn from(env_var: crate::buildservice::EnvVar) -> Self {
        let crate::buildservice::EnvVar { key, value } = env_var;
        Self { key, value }
    }
}
/// Turns a protobuf `AdditionalFile` into the stricter
/// [crate::buildservice::AdditionalFile]; the string path becomes a
/// [PathBuf]. No validation happens here.
impl From<build_request::AdditionalFile> for crate::buildservice::AdditionalFile {
    fn from(proto_file: build_request::AdditionalFile) -> Self {
        let path: PathBuf = proto_file.path.into();
        let contents = proto_file.contents;
        Self { path, contents }
    }
}
/// Turns a [crate::buildservice::AdditionalFile] back into its protobuf
/// representation, which stores the path as a string.
///
/// # Panics
///
/// Panics if the path is not valid UTF-8.
impl From<crate::buildservice::AdditionalFile> for build_request::AdditionalFile {
    fn from(file: crate::buildservice::AdditionalFile) -> Self {
        let crate::buildservice::AdditionalFile { path, contents } = file;
        let path = path
            .to_str()
            .expect("Tvix bug: expected a valid path")
            .to_owned();
        Self { path, contents }
    }
}
/// Validates a protobuf `BuildConstraints` message and flattens it into a set
/// of [crate::buildservice::BuildConstraints].
///
/// The resulting set always holds a `System` and a `MinMemory` entry, one
/// `AvailableReadOnlyPath` per listed path, and `NetworkAccess` /
/// `ProvideBinSh` only when the respective flag is set.
///
/// Errors are reported in this order: empty system, first unclean
/// read-only path (with its index), unsorted read-only paths.
impl TryFrom<build_request::BuildConstraints> for HashSet<crate::buildservice::BuildConstraints> {
    type Error = ValidateBuildConstraintsError;

    fn try_from(value: build_request::BuildConstraints) -> Result<Self, Self::Error> {
        use crate::buildservice::BuildConstraints;

        // The system must be specified.
        if value.system.is_empty() {
            return Err(ValidateBuildConstraintsError::InvalidSystem);
        }

        // Every read-only path has to be a clean absolute path ...
        let first_unclean = value
            .available_ro_paths
            .iter()
            .position(|p| !is_clean_absolute_path(p));
        if let Some(i) = first_unclean {
            return Err(ValidateBuildConstraintsError::InvalidAvailableRoPaths(i));
        }
        // ... and the list as a whole has to be sorted.
        if !is_sorted(value.available_ro_paths.iter().map(|e| e.as_bytes())) {
            return Err(ValidateBuildConstraintsError::AvailableRoPathsNotSorted);
        }

        // All checks passed, assemble the set.
        let mut build_constraints = HashSet::from([
            BuildConstraints::System(value.system),
            BuildConstraints::MinMemory(value.min_memory),
        ]);
        build_constraints.extend(
            value
                .available_ro_paths
                .iter()
                .map(|p| BuildConstraints::AvailableReadOnlyPath(PathBuf::from(p))),
        );
        if value.network_access {
            build_constraints.insert(BuildConstraints::NetworkAccess);
        }
        if value.provide_bin_sh {
            build_constraints.insert(BuildConstraints::ProvideBinSh);
        }

        Ok(build_constraints)
    }
}
#[cfg(test)]
// TODO: add testcases for constraints special cases. The default cases in the protos
// should result in the constraints not being added. For example min_memory 0 can be omitted.
// Also interesting testcases are "merging semantics". MimMemory(1) and MinMemory(100) will
// result in mim_memory 100, multiple AvailableReadOnlyPaths need to be merged. Contradicting
// system constraints need to fail somewhere (maybe an assertion, as only buggy code can construct it)
mod tests {
use super::{is_clean_path, is_clean_relative_path};
use rstest::rstest;