fix(tvix): Avoid buffering file into memory in builtins.hashFile

Right now `builtins.hashFile` always reads the entire file into memory
before hashing, which is not ideal for large files. This replaces the
`read_to_string` VM request with `open_file`, which allows calculating
the hash of the file without buffering it entirely into memory. Other
callers can still buffer the contents into memory if they choose: they
issue the same `open_file` VM request and then call `read_to_string` or
`read_to_end` on the returned `std::io::Read`.

Fixes b/380

Change-Id: Ifa1c8324bcee8f751604b0b449feab875c632fda
Reviewed-on: https://cl.tvl.fyi/c/depot/+/11236
Reviewed-by: flokli <flokli@flokli.de>
Tested-by: BuildkiteCI
This commit is contained in:
Connor Brewster 2024-03-22 18:52:21 -05:00
parent 17849c5c00
commit 63116d8c21
9 changed files with 80 additions and 74 deletions

View file

@ -6,18 +6,22 @@ use sha2::{digest::Output, Digest, Sha256, Sha512};
use crate::ErrorKind;
fn hash<D: Digest>(b: &[u8]) -> Output<D> {
/// Reads through all data from the passed reader, and returns the resulting [Digest].
/// The exact hash function used is left generic over all [Digest].
fn hash<D: Digest + std::io::Write>(mut r: impl std::io::Read) -> Result<Output<D>, ErrorKind> {
let mut hasher = D::new();
hasher.update(b);
hasher.finalize()
std::io::copy(&mut r, &mut hasher)?;
Ok(hasher.finalize())
}
pub fn hash_nix_string(algo: impl AsRef<[u8]>, s: impl AsRef<[u8]>) -> Result<String, ErrorKind> {
/// For a given algo "string" and reader for data, calculate the digest
/// and return it as a hexlower encoded [String].
pub fn hash_nix_string(algo: impl AsRef<[u8]>, s: impl std::io::Read) -> Result<String, ErrorKind> {
match algo.as_ref() {
b"md5" => Ok(HEXLOWER.encode(hash::<Md5>(s.as_ref()).as_bstr())),
b"sha1" => Ok(HEXLOWER.encode(hash::<Sha1>(s.as_ref()).as_bstr())),
b"sha256" => Ok(HEXLOWER.encode(hash::<Sha256>(s.as_ref()).as_bstr())),
b"sha512" => Ok(HEXLOWER.encode(hash::<Sha512>(s.as_ref()).as_bstr())),
b"md5" => Ok(HEXLOWER.encode(hash::<Md5>(s)?.as_bstr())),
b"sha1" => Ok(HEXLOWER.encode(hash::<Sha1>(s)?.as_bstr())),
b"sha256" => Ok(HEXLOWER.encode(hash::<Sha256>(s)?.as_bstr())),
b"sha512" => Ok(HEXLOWER.encode(hash::<Sha512>(s)?.as_bstr())),
_ => Err(ErrorKind::UnknownHashType(
algo.as_ref().as_bstr().to_string(),
)),

View file

@ -31,14 +31,13 @@ mod impure_builtins {
}
#[builtin("hashFile")]
#[allow(non_snake_case)]
async fn builtin_hashFile(co: GenCo, algo: Value, path: Value) -> Result<Value, ErrorKind> {
async fn builtin_hash_file(co: GenCo, algo: Value, path: Value) -> Result<Value, ErrorKind> {
let path = match coerce_value_to_path(&co, path).await? {
Err(cek) => return Ok(Value::from(cek)),
Ok(p) => p,
};
let s = generators::request_read_to_string(&co, path).await;
hash_nix_string(algo.to_str()?, s.to_str()?).map(Value::from)
let r = generators::request_open_file(&co, path).await;
Ok(hash_nix_string(algo.to_str()?, r).map(Value::from)?)
}
#[builtin("pathExists")]
@ -79,7 +78,13 @@ mod impure_builtins {
async fn builtin_read_file(co: GenCo, path: Value) -> Result<Value, ErrorKind> {
match coerce_value_to_path(&co, path).await? {
Err(cek) => Ok(Value::from(cek)),
Ok(path) => Ok(generators::request_read_to_string(&co, path).await),
Ok(path) => {
let mut buf = Vec::new();
generators::request_open_file(&co, path)
.await
.read_to_end(&mut buf)?;
Ok(Value::from(buf))
}
}
}
}

View file

@ -773,9 +773,8 @@ mod pure_builtins {
}
#[builtin("hashString")]
#[allow(non_snake_case)]
async fn builtin_hashString(co: GenCo, algo: Value, s: Value) -> Result<Value, ErrorKind> {
hash_nix_string(algo.to_str()?, s.to_str()?).map(Value::from)
async fn builtin_hash_string(co: GenCo, algo: Value, s: Value) -> Result<Value, ErrorKind> {
hash_nix_string(algo.to_str()?, std::io::Cursor::new(s.to_str()?)).map(Value::from)
}
#[builtin("head")]

View file

@ -6,7 +6,6 @@
//! instance, or observers).
use super::GlobalsMap;
use bstr::ByteSlice;
use genawaiter::rc::Gen;
use std::rc::Weak;
@ -39,9 +38,11 @@ async fn import_impl(
return Ok(cached);
}
// TODO(tazjin): make this return a string directly instead
let contents: Value = generators::request_read_to_string(&co, path.clone()).await;
let contents = contents.to_str()?.to_str()?.to_owned();
let mut reader = generators::request_open_file(&co, path.clone()).await;
// We read to a String instead of a Vec<u8> because rnix only supports
// string source files.
let mut contents = String::new();
reader.read_to_string(&mut contents)?;
let parsed = rnix::ast::Root::parse(&contents);
let errors = parsed.errors();

View file

@ -16,6 +16,7 @@
//! how store paths are opened and so on.
use std::{
fs::File,
io,
path::{Path, PathBuf},
};
@ -48,13 +49,8 @@ pub trait EvalIO {
/// * `builtins.pathExists :: path -> bool`
fn path_exists(&self, path: &Path) -> io::Result<bool>;
/// Read the file at the specified path to a `Vec<u8>`.
///
/// This is used for the following language evaluation cases:
///
/// * `builtins.readFile :: path -> string`
/// * `builtins.import :: path -> any`
fn read_to_end(&self, path: &Path) -> io::Result<Vec<u8>>;
/// Open the file at the specified path and return an `io::Read` for its contents.
fn open(&self, path: &Path) -> io::Result<Box<dyn io::Read>>;
/// Read the directory at the specified path and return the names
/// of its entries associated with their [`FileType`].
@ -99,8 +95,8 @@ impl EvalIO for StdIO {
path.try_exists()
}
fn read_to_end(&self, path: &Path) -> io::Result<Vec<u8>> {
std::fs::read(path)
fn open(&self, path: &Path) -> io::Result<Box<dyn io::Read>> {
Ok(Box::new(File::open(path)?))
}
fn read_dir(&self, path: &Path) -> io::Result<Vec<(bytes::Bytes, FileType)>> {
@ -145,7 +141,7 @@ impl EvalIO for DummyIO {
))
}
fn read_to_end(&self, _: &Path) -> io::Result<Vec<u8>> {
fn open(&self, _: &Path) -> io::Result<Box<dyn io::Read>> {
Err(io::Error::new(
io::ErrorKind::Unsupported,
"I/O methods are not implemented in DummyIO",

View file

@ -102,8 +102,8 @@ pub enum VMRequest {
/// Request that the VM imports the given path through its I/O interface.
PathImport(PathBuf),
/// Request that the VM reads the given path to a string.
ReadToString(PathBuf),
/// Request that the VM opens the specified file and provides a reader.
OpenFile(PathBuf),
/// Request that the VM checks whether the given path exists.
PathExists(PathBuf),
@ -170,8 +170,8 @@ impl Display for VMRequest {
write!(f, "import_cache_put({})", p.to_string_lossy())
}
VMRequest::PathImport(p) => write!(f, "path_import({})", p.to_string_lossy()),
VMRequest::ReadToString(p) => {
write!(f, "read_to_string({})", p.to_string_lossy())
VMRequest::OpenFile(p) => {
write!(f, "open_file({})", p.to_string_lossy())
}
VMRequest::PathExists(p) => write!(f, "path_exists({})", p.to_string_lossy()),
VMRequest::ReadDir(p) => write!(f, "read_dir({})", p.to_string_lossy()),
@ -199,6 +199,9 @@ pub enum VMResponse {
/// VM response with a span to use at the current point.
Span(LightSpan),
/// [std::io::Read] produced by the VM in response to some IO operation.
Reader(Box<dyn std::io::Read>),
}
impl Display for VMResponse {
@ -209,6 +212,7 @@ impl Display for VMResponse {
VMResponse::Path(p) => write!(f, "path({})", p.to_string_lossy()),
VMResponse::Directory(d) => write!(f, "dir(len = {})", d.len()),
VMResponse::Span(_) => write!(f, "span"),
VMResponse::Reader(_) => write!(f, "reader"),
}
}
}
@ -425,18 +429,18 @@ where
message = VMResponse::Path(imported);
}
VMRequest::ReadToString(path) => {
let content = self
VMRequest::OpenFile(path) => {
let reader = self
.io_handle
.as_ref()
.read_to_end(&path)
.open(&path)
.map_err(|e| ErrorKind::IO {
path: Some(path),
error: e.into(),
})
.with_span(&span, self)?;
message = VMResponse::Value(content.into())
message = VMResponse::Reader(reader)
}
VMRequest::PathExists(path) => {
@ -730,9 +734,10 @@ pub(crate) async fn request_path_import(co: &GenCo, path: PathBuf) -> PathBuf {
}
}
pub(crate) async fn request_read_to_string(co: &GenCo, path: PathBuf) -> Value {
match co.yield_(VMRequest::ReadToString(path)).await {
VMResponse::Value(value) => value,
/// Request that the VM open a [std::io::Read] for the specified file.
pub async fn request_open_file(co: &GenCo, path: PathBuf) -> Box<dyn std::io::Read> {
match co.yield_(VMRequest::OpenFile(path)).await {
VMResponse::Reader(value) => value,
msg => panic!(
"Tvix bug: VM responded with incorrect generator message: {}",
msg