Support building on target nodes

This partially addresses #33, and allows Colmena to be used more
easily on bandwidth-constrained hosts and macOS.

With `deployment.buildOnTarget = true;` deployment works fine from
macOS without designated builders, except when IFD is involved.
This commit is contained in:
Zhaofeng Li 2022-01-01 16:41:35 -08:00
parent 872f944743
commit 98897bf4de
15 changed files with 326 additions and 47 deletions

View file

@ -0,0 +1,35 @@
# Integration test for building system profiles on the target nodes.
#
# The test script deploys to `alpha` and `beta` from `deployer` and uses
# `nix-store -qR` on the deployer to tell whether a built profile exists in
# the deployer's own store.
{ pkgs ? import ../nixpkgs.nix }:
let
# All three machines are declared as deployers and there are no plain targets.
tools = pkgs.callPackage ../tools.nix {
deployers = [ "deployer" "alpha" "beta" ];
targets = [];
};
in tools.makeTest {
name = "colmena-build-on-target";
# The directory containing this file is used as the test bundle.
bundle = ./.;
# The script below also exercises the `--build-on-target` and
# `--no-build-on-target` command-line overrides.
testScript = ''
# The actual build will be initiated on alpha
deployer.succeed("cd /tmp/bundle && ${tools.colmenaExec} apply --on alpha")
with subtest("Check that the new configurations are indeed applied"):
alpha.succeed("grep SUCCESS /etc/deployment")
alpha_profile = alpha.succeed("readlink /run/current-system")
with subtest("Check that the built profile is not on the deployer"):
deployer.fail(f"nix-store -qR {alpha_profile}")
with subtest("Check that we can override per-node settings and build locally"):
deployer.succeed("cd /tmp/bundle && ${tools.colmenaExec} build --on alpha --no-build-on-target")
deployer.succeed(f"nix-store -qR {alpha_profile}")
with subtest("Check that we can override per-node settings and build remotely"):
deployer.succeed("cd /tmp/bundle && ${tools.colmenaExec} apply --on beta --build-on-target")
beta.succeed("grep SUCCESS /etc/deployment")
profile = beta.succeed("readlink /run/current-system")
deployer.fail(f"nix-store -qR {profile}")
'';
}

View file

@ -0,0 +1,33 @@
# Hive with three nodes: `deployer`, `alpha` (builds on the target by default)
# and `beta` (builds on the deploying host by default).
let
tools = import ./tools.nix {
insideVm = true;
deployers = [ "deployer" "alpha" "beta" ];
targets = [];
};
in {
meta = {
nixpkgs = tools.pkgs;
};
# Every node gets an /etc/deployment file containing "SUCCESS".
defaults = {
environment.etc."deployment".text = "SUCCESS";
};
deployer = tools.getStandaloneConfigFor "deployer";
alpha = {
imports = [
(tools.getStandaloneConfigFor "alpha")
];
# Per-node default: build the system profile on alpha itself.
deployment.buildOnTarget = true;
};
beta = {
imports = [
(tools.getStandaloneConfigFor "beta")
];
# Per-node default: do not build on beta itself.
deployment.buildOnTarget = false;
};
}

View file

@ -1,6 +1,7 @@
{
apply = import ./apply {};
apply-local = import ./apply-local {};
build-on-target = import ./build-on-target {};
exec = import ./exec {};
flakes = import ./flakes {};
parallel = import ./parallel {};

View file

@ -55,6 +55,11 @@ let
];
};
services.openssh.enable = true;
users.users.root.openssh.authorizedKeys.keys = [
sshKeys.snakeOilPublicKey
];
environment.systemPackages = with pkgs; [
git # for git flake tests

View file

@ -10,6 +10,7 @@
- [Secrets](./features/keys.md)
- [Ad Hoc Evaluation](./features/eval.md)
- [Parallelism](./features/parallelism.md)
- [Remote Builds](./features/remote-builds.md)
- [Examples](./examples/index.md)
- [Multi-Architecture Deployments](./examples/multi-arch.md)
- [Reference](./reference/index.md)

View file

@ -7,3 +7,4 @@ This section introduces the main features in Colmena:
- **[Secrets](keys.md)** - Deploying sensitive files separate from the main configuration
- **[Ad Hoc Evaluation](eval.md)** - Evaluating a Nix expression with access to your configuration
- **[Parallelism](parallelism.md)** - Controlling how Colmena parallelizes the deployment process
- **[Remote Builds](remote-builds.md)** - Building system profiles on remote machines

View file

@ -0,0 +1,21 @@
# Remote Builds
If the host running Colmena is not powerful enough, consider offloading the actual builds to remote machines.
Colmena supports two ways to achieve this:
## Using Colmena's `deployment.buildOnTarget`
If you set [`deployment.buildOnTarget = true;`](../reference/deployment.md#deploymentbuildontarget) for a node, then the actual build process will be initiated on the node itself.
Colmena will evaluate the configuration locally before copying the derivations to the target node.
You can temporarily enable this for all nodes by passing `--build-on-target` on the command line, or disable it with `--no-build-on-target`.
This is most useful in scenarios where the machine running Colmena is bandwidth-constrained, or it's inconvenient to configure designated builders beforehand.
With this method, the build results will _not_ be copied back to the local machine or otherwise shared across the target nodes.
If you have custom packages used on multiple nodes, the work required to build those packages will be duplicated across the nodes.
## Using the native distributed build feature in Nix
When [distributed builds](https://nixos.org/manual/nix/unstable/advanced-topics/distributed-builds.html) are enabled, Nix will transparently forward builds to the configured builders.
After the builds are done, Nix will copy the results back to the local machine.
Builders can either be configured globally or in your configuration with [`meta.machinesFile`](../reference/meta.md#machinesFile).

View file

@ -1,7 +1,7 @@
use std::env;
use std::path::PathBuf;
use clap::{Arg, App, SubCommand, ArgMatches};
use clap::{Arg, App, SubCommand, ArgMatches, ArgSettings};
use crate::nix::deployment::{
Deployment,
@ -89,6 +89,20 @@ To upload keys without building or deploying the rest of the configuration, use
.help("Do not use gzip")
.long_help("Disables the use of gzip when copying closures to the remote host.")
.takes_value(false))
// `--build-on-target`: boolean flag that forces building on the target nodes
// for this invocation, overriding each node's `deployment.buildOnTarget`.
.arg(Arg::with_name("build-on-target")
    .long("build-on-target")
    .help("Build the system profiles on the target nodes")
    .long_help(r#"Build the system profiles on the target nodes themselves.
If enabled, the system profiles will be built on the target nodes themselves, not on the host running Colmena itself.
This overrides per-node preferences set in `deployment.buildOnTarget`.
To temporarily disable remote build on all nodes, use `--no-build-on-target`.
"#)
    .takes_value(false))
.arg(Arg::with_name("no-build-on-target")
.long("no-build-on-target")
.set(ArgSettings::Hidden)
.takes_value(false))
.arg(Arg::with_name("force-replace-unknown-profiles")
.long("force-replace-unknown-profiles")
.help("Ignore all targeted nodes deployment.replaceUnknownProfiles setting")
@ -146,6 +160,12 @@ pub async fn run(_global_args: &ArgMatches<'_>, local_args: &ArgMatches<'_>) ->
options.set_create_gc_roots(true);
}
if local_args.is_present("no-build-on-target") {
options.set_force_build_on_target(false);
} else if local_args.is_present("build-on-target") {
options.set_force_build_on_target(true);
}
options
};

View file

@ -31,6 +31,7 @@ use super::{
Profile,
ProfileDerivation,
CopyDirection,
CopyOptions,
key::{Key, UploadAt as UploadKeyAt},
};
use super::host;
@ -229,8 +230,29 @@ impl Deployment {
let mut futures = Vec::new();
for (name, profile_drv) in profile_drvs.iter() {
let target = chunk.remove(name).unwrap();
futures.push(self.clone().deploy_node(parent.clone(), target, profile_drv.clone()));
let mut target = chunk.remove(name).unwrap();
if let Some(force_build_on_target) = self.options.force_build_on_target {
target.config.set_build_on_target(force_build_on_target);
}
let job_handle = parent.clone();
let arc_self = self.clone();
futures.push(async move {
let (target, profile) = {
if target.config.build_on_target() {
arc_self.clone().build_on_node(job_handle.clone(), target, profile_drv.clone()).await?
} else {
arc_self.clone().build_and_push_node(job_handle.clone(), target, profile_drv.clone()).await?
}
};
if arc_self.goal.requires_activation() {
arc_self.activate_node(job_handle, target, profile).await
} else {
Ok(())
}
});
}
join_all(futures).await
@ -273,14 +295,45 @@ impl Deployment {
}).await
}
/// Builds, pushes, and optionally activates a system profile on a node.
///
/// This will also upload keys to the node.
async fn deploy_node(self: DeploymentHandle, parent: JobHandle, mut target: TargetNode, profile_drv: ProfileDerivation)
-> NixResult<()>
/// Builds a system profile directly on the node itself.
///
/// The closure of the profile derivation is copied to the node
/// (`CopyDirection::ToRemote`) and the derivation is then realized there with
/// `realize_remote`. The node is handed back together with the built profile.
///
/// # Errors
///
/// Returns `NixError::Unsupported` when the node has no host. Errors from
/// creating the job, copying the closure, or realizing the derivation are
/// propagated unchanged.
async fn build_on_node(self: DeploymentHandle, parent: JobHandle, mut target: TargetNode, profile_drv: ProfileDerivation)
    -> NixResult<(TargetNode, Profile)>
{
    // Held until the build job has finished (or until an early return drops it).
    let permit = self.parallelism_limit.apply.acquire().await.unwrap();

    let build_job = parent.create_job(JobType::Build, vec![target.name.clone()])?;
    let (target, profile) = build_job.run(|job| async move {
        // A node without a host cannot build anything itself.
        let host = target.host.as_mut().ok_or(NixError::Unsupported)?;
        host.set_job(Some(job.clone()));

        host.copy_closure(
            profile_drv.as_store_path(),
            CopyDirection::ToRemote,
            CopyOptions::default().include_outputs(true),
        ).await?;

        let profile = profile_drv.realize_remote(host).await?;

        job.success_with_message(format!("Built {:?} on target node", profile.as_path()))?;
        Ok((target, profile))
    }).await?;

    drop(permit);

    Ok((target, profile))
}
/// Builds and pushes a system profile on a node.
async fn build_and_push_node(self: DeploymentHandle, parent: JobHandle, mut target: TargetNode, profile_drv: ProfileDerivation)
-> NixResult<(TargetNode, Profile)>
{
let nodes = vec![target.name.clone()];
let target_name = target.name.clone();
let permit = self.parallelism_limit.apply.acquire().await.unwrap();
@ -292,21 +345,21 @@ impl Deployment {
let mut builder = host::local(arc_self.nix_options.clone());
builder.set_job(Some(job.clone()));
let profile = profile_drv.realize(&mut *builder).await?;
let profile = profile_drv.realize(&mut builder).await?;
job.success_with_message(format!("Built {:?}", profile.as_path()))?;
Ok(profile)
}).await?;
if self.goal == Goal::Build {
return Ok(());
return Ok((target, profile));
}
// Push closure to remote
let push_job = parent.create_job(JobType::Push, nodes.clone())?;
let push_profile = profile.clone();
let arc_self = self.clone();
let mut target = push_job.run(|job| async move {
let target = push_job.run(|job| async move {
if target.host.is_none() {
return Err(NixError::Unsupported);
}
@ -321,11 +374,22 @@ impl Deployment {
Ok(target)
}).await?;
if !self.goal.requires_activation() {
// We are done here :)
return Ok(());
drop(permit);
Ok((target, profile))
}
/// Activates a system profile on a node.
///
/// This will also upload keys to the node.
async fn activate_node(self: DeploymentHandle, parent: JobHandle, mut target: TargetNode, profile: Profile)
-> NixResult<()>
{
let nodes = vec![target.name.clone()];
let target_name = target.name.clone();
let permit = self.parallelism_limit.apply.acquire().await.unwrap();
// Upload pre-activation keys
let mut target = if self.options.upload_keys {
let job = parent.create_job(JobType::UploadKeys, nodes.clone())?;
@ -386,7 +450,7 @@ impl Deployment {
}).await?;
// Upload post-activation keys
if self.options.upload_keys {
let target = if self.options.upload_keys {
let job = parent.create_job(JobType::UploadKeys, nodes.clone())?;
job.run_waiting(|job| async move {
let keys = target.config.keys.iter()
@ -396,7 +460,7 @@ impl Deployment {
if keys.is_empty() {
job.noop("No post-activation keys to upload".to_string())?;
return Ok(());
return Ok(target);
}
job.state(JobState::Running)?;
@ -407,15 +471,21 @@ impl Deployment {
host.upload_keys(&keys, true).await?;
job.success_with_message("Uploaded keys (post-activation)".to_string())?;
Ok(())
}).await?;
}
Ok(target)
}).await?
} else {
target
};
// Create GC root
if self.options.create_gc_roots {
let job = parent.create_job(JobType::CreateGcRoots, nodes.clone())?;
let arc_self = self.clone();
job.run_waiting(|job| async move {
if target.config.build_on_target() {
job.noop("The system profile was built on target node itself".to_string())?;
}
if let Some(dir) = arc_self.hive.context_dir() {
job.state(JobState::Running)?;
let path = dir.join(".gcroots").join(format!("node-{}", &*target_name));

View file

@ -20,6 +20,9 @@ pub struct Options {
/// directory if it exists.
pub(super) create_gc_roots: bool,
/// Whether to override per-node setting to build on the nodes themselves.
pub(super) force_build_on_target: Option<bool>,
/// Ignore the node-level `deployment.replaceUnknownProfiles` option.
pub(super) force_replace_unknown_profiles: bool,
}
@ -41,6 +44,10 @@ impl Options {
self.create_gc_roots = enable;
}
/// Overrides the per-node build-on-target setting for every node.
///
/// `true` forces builds onto the target nodes, `false` forces them off.
/// Until this is called the field stays `None` and no override is applied.
pub fn set_force_build_on_target(&mut self, enable: bool) {
self.force_build_on_target = Some(enable);
}
pub fn set_force_replace_unknown_profiles(&mut self, enable: bool) {
self.force_replace_unknown_profiles = enable;
}
@ -61,6 +68,7 @@ impl Default for Options {
gzip: true,
upload_keys: true,
create_gc_roots: false,
force_build_on_target: None,
force_replace_unknown_profiles: false,
}
}

View file

@ -67,6 +67,11 @@ let
See https://nixos.org/manual/nix/stable/#chap-distributed-builds
for the machine specification format.
This option is ignored when builds are initiated on the remote nodes
themselves via `deployment.buildOnTarget` or `--build-on-target`. To
still use the Nix distributed build functionality, configure the
builders on the target nodes with `nix.buildMachines`.
'';
default = null;
apply = value: if value == null then null else toString value;
@ -138,6 +143,26 @@ let
type = types.bool;
default = false;
};
buildOnTarget = lib.mkOption {
description = ''
Whether to build the system profiles on the target node itself.
When enabled, Colmena will copy the derivation to the target
node and initiate the build there. This avoids copying back the
build results involved with the native distributed build
feature. Furthermore, the `build` goal will be equivalent to
the `push` goal. Since builds happen on the target node, the
results are automatically "pushed" and won't exist in the local
Nix store.
You can temporarily override per-node settings by passing
`--build-on-target` (enable for all nodes) or
`--no-build-on-target` (disable for all nodes) on the command
line.
'';
type = types.bool;
default = false;
};
tags = lib.mkOption {
description = ''
A list of tags for the node.

View file

@ -40,18 +40,17 @@ impl Host for Ssh {
self.run_command(command).await
}
async fn realize_remote(&mut self, derivation: &StorePath) -> NixResult<Vec<StorePath>> {
// FIXME
let paths = self.ssh(&["nix-store", "--no-gc-warning", "--realise", derivation.as_path().to_str().unwrap()])
.capture_output()
.await;
let command = self.ssh(&["nix-store", "--no-gc-warning", "--realise", derivation.as_path().to_str().unwrap()]);
let mut execution = CommandExecution::new(command);
execution.set_job(self.job.clone());
let paths = execution
.capture_output()
.await?;
match paths {
Ok(paths) => {
paths.lines().map(|p| p.to_string().try_into()).collect()
}
Err(e) => Err(e),
}
}
async fn upload_keys(&mut self, keys: &HashMap<String, Key>, require_ownership: bool) -> NixResult<()> {
for (name, key) in keys {
self.upload_key(name, key, require_ownership).await?;

View file

@ -24,7 +24,7 @@ pub mod hive;
pub use hive::{Hive, HivePath};
pub mod store;
pub use store::{StorePath, StoreDerivation};
pub use store::{StorePath, StoreDerivation, BuildResult};
pub mod key;
pub use key::Key;
@ -158,6 +158,10 @@ pub struct NodeConfig {
#[serde(rename = "allowLocalDeployment")]
allow_local_deployment: bool,
#[serde(rename = "buildOnTarget")]
build_on_target: bool,
tags: Vec<String>,
#[serde(rename = "replaceUnknownProfiles")]
@ -223,6 +227,11 @@ impl NodeConfig {
pub fn tags(&self) -> &[String] { &self.tags }
pub fn allows_local_deployment(&self) -> bool { self.allow_local_deployment }
/// Returns whether the system profile should be built on the node itself
/// (deserialized from `buildOnTarget`).
pub fn build_on_target(&self) -> bool { self.build_on_target }
/// Replaces the node's `buildOnTarget` value with `enable`.
pub fn set_build_on_target(&mut self, enable: bool) {
self.build_on_target = enable;
}
pub fn to_ssh_host(&self) -> Option<Ssh> {
self.target_host.as_ref().map(|target_host| {
let username =

View file

@ -10,6 +10,7 @@ use super::{
NixError,
StorePath,
StoreDerivation,
BuildResult,
};
pub type ProfileDerivation = StoreDerivation<Profile>;
@ -75,12 +76,18 @@ impl Profile {
Ok(())
}
/// Wraps `path` in a `Profile` without performing any checks on it.
fn from_store_path_unchecked(path: StorePath) -> Self {
Self(path)
}
}
impl TryFrom<Vec<StorePath>> for Profile {
impl TryFrom<BuildResult<Profile>> for Profile {
type Error = NixError;
fn try_from(paths: Vec<StorePath>) -> NixResult<Self> {
fn try_from(result: BuildResult<Self>) -> NixResult<Self> {
let paths = result.paths();
if paths.is_empty() {
return Err(NixError::BadOutput {
output: String::from("There is no store path"),
@ -93,7 +100,9 @@ impl TryFrom<Vec<StorePath>> for Profile {
});
}
let path = paths.into_iter().next().unwrap();
Self::from_store_path(path)
let path = paths.into_iter().next()
.unwrap().to_owned();
Ok(Self::from_store_path_unchecked(path))
}
}

View file

@ -13,6 +13,19 @@ use super::{Host, NixCommand, NixResult, NixError};
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StorePath(PathBuf);
/// A store derivation (.drv) that will result in a T when built.
///
/// `T` is the type the build result is converted into; it must be
/// constructible from a `BuildResult<T>`.
#[derive(Debug)]
pub struct StoreDerivation<T: TryFrom<BuildResult<T>>>{
// Store path of the derivation itself.
path: StorePath,
// Zero-sized marker tying the derivation to its result type `T`.
_target: PhantomData<T>,
}
/// Results of a build/realization.
///
/// Holds the store paths reported by a host after realizing a derivation
/// whose result type is `T`.
pub struct BuildResult<T: TryFrom<BuildResult<T>>> {
// Store paths returned by the realization.
results: Vec<StorePath>,
// Zero-sized marker recording the result type `T`.
_derivation: PhantomData<T>,
}
impl StorePath {
/// Returns the raw store path.
pub fn as_path(&self) -> &Path {
@ -41,7 +54,7 @@ impl StorePath {
}
/// Converts the store path into a store derivation.
pub fn into_derivation<T: TryFrom<Vec<StorePath>>>(self) -> NixResult<StoreDerivation<T>> {
pub fn into_derivation<T: TryFrom<BuildResult<T>>>(self) -> NixResult<StoreDerivation<T>> {
if self.is_derivation() {
Ok(StoreDerivation::<T>::from_store_path_unchecked(self))
} else {
@ -76,14 +89,21 @@ impl From<StorePath> for PathBuf {
}
}
/// A store derivation (.drv) that will result in a T when built.
#[derive(Debug, Clone)]
pub struct StoreDerivation<T: TryFrom<Vec<StorePath>>>{
path: StorePath,
_target: PhantomData<T>,
impl<T: TryFrom<BuildResult<T>>> Clone for StoreDerivation<T> {
    /// Duplicates the derivation by cloning its store path.
    ///
    /// Implemented by hand so that `T` itself does not have to be `Clone`,
    /// which `#[derive(Clone)]` would require.
    fn clone(&self) -> Self {
        let path = self.path.clone();

        Self { path, _target: PhantomData }
    }
}
impl<T: TryFrom<BuildResult<T>>> StoreDerivation<T> {
/// Returns the store path.
pub fn as_store_path(&self) -> &StorePath {
&self.path
}
impl<T: TryFrom<Vec<StorePath>>> StoreDerivation<T> {
fn from_store_path_unchecked(path: StorePath) -> Self {
Self {
path,
@ -92,16 +112,38 @@ impl<T: TryFrom<Vec<StorePath>>> StoreDerivation<T> {
}
}
impl<T: TryFrom<Vec<StorePath>, Error=NixError>> StoreDerivation<T> {
impl<T: TryFrom<BuildResult<T>, Error=NixError>> StoreDerivation<T> {
/// Builds the store derivation on a host, resulting in a T.
pub async fn realize(&self, host: &mut dyn Host) -> NixResult<T> {
pub async fn realize(&self, host: &mut Box<dyn Host>) -> NixResult<T> {
let paths: Vec<StorePath> = host.realize(&self.path).await?;
paths.try_into()
let result = BuildResult {
results: paths,
_derivation: PhantomData,
};
result.try_into()
}
/// Realizes the derivation on `host` through `Host::realize_remote`, leaving
/// the results on that host, and converts the reported store paths into a `T`.
pub async fn realize_remote(&self, host: &mut Box<dyn Host>) -> NixResult<T> {
    let results = host.realize_remote(&self.path).await?;
    let built = BuildResult {
        results,
        _derivation: PhantomData,
    };

    built.try_into()
}
}
impl<T: TryFrom<Vec<StorePath>>> fmt::Display for StoreDerivation<T> {
/// Displays the derivation using the `Debug` representation of its store path.
impl<T: TryFrom<BuildResult<T>>> fmt::Display for StoreDerivation<T> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{:?}", self.path)
}
}
impl<T: TryFrom<BuildResult<T>, Error=NixError>> BuildResult<T> {
    /// Borrows the store paths held by this build result.
    pub fn paths(&self) -> &[StorePath] {
        &self.results
    }
}