fix(tvix): Avoid buffering file into memory in builtins.hashFile

Right now `builtins.hashFile` always reads the entire file into memory
before hashing, which is not ideal for large files. This replaces
`read_to_string` with `open_file` which allows calculating the hash of
the file without buffering it entirely into memory. Other callers can
continue to buffer into memory if they choose, but they still use the
`open_file` VM request and then call `read_to_string` or `read_to_end`
on the returned `std::io::Read` implementation.

Fixes b/380

Change-Id: Ifa1c8324bcee8f751604b0b449feab875c632fda
Reviewed-on: https://cl.tvl.fyi/c/depot/+/11236
Reviewed-by: flokli <flokli@flokli.de>
Tested-by: BuildkiteCI
This commit is contained in:
Connor Brewster 2024-03-22 18:52:21 -05:00
parent 17849c5c00
commit 63116d8c21
9 changed files with 80 additions and 74 deletions

View file

@ -177,9 +177,9 @@ mod import_builtins {
})
.transpose()?;
// FUTUREWORK(performance): this reads the file instead of using a stat-like
// system call to the file, this degrades very badly on large files.
if !recursive_ingestion && state.read_to_end(path.as_ref()).is_err() {
// FUTUREWORK(performance): this opens the file instead of using a stat-like
// system call to the file.
if !recursive_ingestion && state.open(path.as_ref()).is_err() {
Err(ImportError::FlatImportOfNonFile(
path.to_string_lossy().to_string(),
))?;

View file

@ -8,7 +8,7 @@
//! otherwise fundamental features like nixpkgs bootstrapping and hash
//! calculation will not work.
use std::io;
use std::io::{self, Cursor};
use std::path::{Path, PathBuf};
use tvix_eval::{EvalIO, FileType};
@ -44,7 +44,7 @@ where
self.actual.as_ref().path_exists(path)
}
fn read_to_end(&self, path: &Path) -> io::Result<Vec<u8>> {
fn open(&self, path: &Path) -> io::Result<Box<dyn io::Read>> {
// Bundled version of corepkgs/fetchurl.nix. The counterpart
// of this happens in [crate::configure_nix_path], where the `nix_path`
// of the evaluation has `nix=/__corepkgs__` added to it.
@ -52,13 +52,12 @@ where
// This workaround is similar to what cppnix does for passing
// the path through.
//
// TODO: this comparison is bad and allocates, we should use
// the sane path library.
// TODO: this comparison is bad; we should use the sane path library.
if path.starts_with("/__corepkgs__/fetchurl.nix") {
return Ok(include_bytes!("fetchurl.nix").to_vec());
return Ok(Box::new(Cursor::new(include_bytes!("fetchurl.nix"))));
}
self.actual.as_ref().read_to_end(path)
self.actual.as_ref().open(path)
}
fn read_dir(&self, path: &Path) -> io::Result<Vec<(bytes::Bytes, FileType)>> {

View file

@ -17,7 +17,7 @@ use std::{
path::{Path, PathBuf},
sync::Arc,
};
use tokio::io::AsyncReadExt;
use tokio_util::io::SyncIoBridge;
use tracing::{error, instrument, warn, Level};
use tvix_build::buildservice::BuildService;
use tvix_eval::{ErrorKind, EvalIO, FileType, StdIO};
@ -478,7 +478,7 @@ impl EvalIO for TvixStoreIO {
}
#[instrument(skip(self), err)]
fn read_to_end(&self, path: &Path) -> io::Result<Vec<u8>> {
fn open(&self, path: &Path) -> io::Result<Box<dyn io::Read>> {
if let Ok((store_path, sub_path)) =
StorePath::from_absolute_path_full(&path.to_string_lossy())
{
@ -509,27 +509,24 @@ impl EvalIO for TvixStoreIO {
})?;
self.tokio_handle.block_on(async {
let mut reader = {
let resp = self.blob_service.as_ref().open_read(&digest).await?;
match resp {
Some(blob_reader) => blob_reader,
None => {
error!(
blob.digest = %digest,
"blob not found",
);
Err(io::Error::new(
io::ErrorKind::NotFound,
format!("blob {} not found", &digest),
))?
}
let resp = self.blob_service.as_ref().open_read(&digest).await?;
match resp {
Some(blob_reader) => {
// The VM Response needs a sync [std::io::Read].
Ok(Box::new(SyncIoBridge::new(blob_reader))
as Box<dyn io::Read>)
}
};
let mut buf = Vec::new();
reader.read_to_end(&mut buf).await?;
Ok(buf)
None => {
error!(
blob.digest = %digest,
"blob not found",
);
Err(io::Error::new(
io::ErrorKind::NotFound,
format!("blob {} not found", &digest),
))
}
}
})
}
Node::Symlink(_symlink_node) => Err(io::Error::new(
@ -540,11 +537,11 @@ impl EvalIO for TvixStoreIO {
} else {
// As tvix-store doesn't manage /nix/store on the filesystem,
// we still need to also ask self.std_io here.
self.std_io.read_to_end(path)
self.std_io.open(path)
}
} else {
// The store path is no store path, so do regular StdIO.
self.std_io.read_to_end(path)
self.std_io.open(path)
}
}