2023-05-25 16:52:08 +02:00
|
|
|
use crate::blobservice::BlobService;
|
2023-09-21 21:32:44 +02:00
|
|
|
use crate::directoryservice::DirectoryPutter;
|
2023-05-25 16:52:08 +02:00
|
|
|
use crate::directoryservice::DirectoryService;
|
2023-09-21 21:32:44 +02:00
|
|
|
use crate::proto::node::Node;
|
|
|
|
use crate::proto::Directory;
|
|
|
|
use crate::proto::DirectoryNode;
|
|
|
|
use crate::proto::FileNode;
|
|
|
|
use crate::proto::SymlinkNode;
|
|
|
|
use crate::Error as CastoreError;
|
2023-07-18 18:37:25 +02:00
|
|
|
use std::os::unix::ffi::OsStrExt;
|
2023-02-27 13:03:53 +01:00
|
|
|
use std::{
|
|
|
|
collections::HashMap,
|
|
|
|
fmt::Debug,
|
|
|
|
os::unix::prelude::PermissionsExt,
|
|
|
|
path::{Path, PathBuf},
|
|
|
|
};
|
|
|
|
use tracing::instrument;
|
|
|
|
use walkdir::WalkDir;
|
|
|
|
|
|
|
|
/// Errors that can occur while ingesting a filesystem tree.
#[derive(Debug, thiserror::Error)]
pub enum Error {
    /// Uploading an assembled [Directory] to the [DirectoryService] failed.
    #[error("failed to upload directory at {0}: {1}")]
    UploadDirectoryError(PathBuf, CastoreError),

    /// A path component was not representable (e.g. invalid bytes in a name).
    #[error("invalid encoding encountered for entry {0:?}")]
    InvalidEncoding(PathBuf),

    /// stat()/metadata (or readlink) on the given path failed.
    #[error("unable to stat {0}: {1}")]
    UnableToStat(PathBuf, std::io::Error),

    /// Opening the given path (or walking into it) failed.
    #[error("unable to open {0}: {1}")]
    UnableToOpen(PathBuf, std::io::Error),

    /// Reading file contents (or flushing them to the blob writer) failed.
    #[error("unable to read {0}: {1}")]
    UnableToRead(PathBuf, std::io::Error),
}
|
|
|
|
|
2023-09-21 21:32:44 +02:00
|
|
|
// NOTE(review): this `From` impl unconditionally panics instead of converting.
// Castore-level errors reaching this layer are treated as unrecoverable bugs
// (hence the "tvix bug" message). A `From` that can never return is surprising
// for callers using `?` — confirm this is intentional, and consider mapping
// StorageError into a proper Error variant instead of `panic!("error")`.
impl From<CastoreError> for Error {
    fn from(value: CastoreError) -> Self {
        match value {
            // An invalid request at this point indicates a programming error.
            CastoreError::InvalidRequest(_) => panic!("tvix bug"),
            // Storage failures are currently not mapped; they abort the process.
            CastoreError::StorageError(_) => panic!("error"),
        }
    }
}
|
|
|
|
|
2024-01-08 09:45:55 +01:00
|
|
|
impl From<Error> for std::io::Error {
|
|
|
|
fn from(value: Error) -> Self {
|
|
|
|
std::io::Error::new(std::io::ErrorKind::Other, value)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-01-06 00:39:21 +01:00
|
|
|
/// This processes a given [walkdir::DirEntry] and returns a
/// proto::node::Node, depending on the type of the entry.
///
/// If the entry is a file, its contents are uploaded.
/// If the entry is a directory, the Directory is uploaded as well.
/// For this to work, it relies on the caller to provide the directory object
/// with the previously returned (child) nodes.
///
/// It assumes entries to be returned in "contents first" order, means this
/// will only be called with a directory if all children of it have been
/// visited. If the entry is indeed a directory, it'll also upload that
/// directory to the store. For this, the so-far-assembled Directory object for
/// this path needs to be passed in.
///
/// It assumes the caller adds returned nodes to the directories it assembles.
#[instrument(skip_all, fields(entry.file_type=?&entry.file_type(),entry.path=?entry.path()))]
async fn process_entry<'a, BS>(
    blob_service: BS,
    directory_putter: &'a mut Box<dyn DirectoryPutter>,
    entry: &'a walkdir::DirEntry,
    maybe_directory: Option<Directory>,
) -> Result<Node, Error>
where
    BS: AsRef<dyn BlobService> + Clone,
{
    let file_type = entry.file_type();

    if file_type.is_dir() {
        // The caller must have assembled the Directory for this path already
        // (guaranteed by the contents-first walk order); anything else is a bug.
        let directory = maybe_directory
            .expect("tvix bug: must be called with some directory in the case of directory");
        // Capture digest/size before `directory` is moved into the putter below.
        let directory_digest = directory.digest();
        let directory_size = directory.size();

        // upload this directory
        directory_putter
            .put(directory)
            .await
            .map_err(|e| Error::UploadDirectoryError(entry.path().to_path_buf(), e))?;

        return Ok(Node::Directory(DirectoryNode {
            name: entry.file_name().as_bytes().to_owned().into(),
            digest: directory_digest.into(),
            size: directory_size,
        }));
    }

    if file_type.is_symlink() {
        // Read the symlink target as raw bytes (no UTF-8 requirement).
        // NOTE(review): a readlink failure is reported as UnableToStat — confirm
        // that's the intended error variant.
        let target: bytes::Bytes = std::fs::read_link(entry.path())
            .map_err(|e| Error::UnableToStat(entry.path().to_path_buf(), e))?
            .as_os_str()
            .as_bytes()
            .to_owned()
            .into();

        return Ok(Node::Symlink(SymlinkNode {
            name: entry.file_name().as_bytes().to_owned().into(),
            target,
        }));
    }

    if file_type.is_file() {
        // Needed for the file size and the executable bit below.
        let metadata = entry
            .metadata()
            .map_err(|e| Error::UnableToStat(entry.path().to_path_buf(), e.into()))?;

        let mut file = tokio::fs::File::open(entry.path())
            .await
            .map_err(|e| Error::UnableToOpen(entry.path().to_path_buf(), e))?;

        // Stream the file contents into a new blob; the digest is only known
        // once the writer is closed.
        let mut writer = blob_service.as_ref().open_write().await;

        if let Err(e) = tokio::io::copy(&mut file, &mut writer).await {
            return Err(Error::UnableToRead(entry.path().to_path_buf(), e));
        };

        let digest = writer
            .close()
            .await
            .map_err(|e| Error::UnableToRead(entry.path().to_path_buf(), e))?;

        return Ok(Node::File(FileNode {
            name: entry.file_name().as_bytes().to_vec().into(),
            digest: digest.into(),
            size: metadata.len(),
            // If it's executable by the user, it'll become executable.
            // This matches nix's dump() function behaviour.
            // 64 == 0o100 (S_IXUSR, the owner-execute permission bit).
            executable: metadata.permissions().mode() & 64 != 0,
        }));
    }
    // Anything that is neither dir, symlink nor regular file (sockets, fifos,
    // device nodes, ...) is currently unsupported and panics here.
    todo!("handle other types")
}
|
|
|
|
|
2023-05-17 13:21:26 +02:00
|
|
|
/// Ingests the contents at the given path into the tvix store,
/// interacting with a [BlobService] and [DirectoryService].
/// It returns the root node or an error.
///
/// It does not follow symlinks at the root, they will be ingested as actual
/// symlinks.
///
/// It's not interacting with a PathInfoService (from tvix-store), or anything
/// else giving it a "non-content-addressed name".
/// It's up to the caller to possibly register it somewhere (and potentially
/// rename it based on some naming scheme)
#[instrument(skip(blob_service, directory_service), fields(path=?p), err)]
pub async fn ingest_path<'a, BS, DS, P>(
    blob_service: BS,
    directory_service: DS,
    p: P,
) -> Result<Node, Error>
where
    P: AsRef<Path> + Debug,
    BS: AsRef<dyn BlobService> + Clone,
    DS: AsRef<dyn DirectoryService>,
{
    // Partially-assembled Directory objects, keyed by their on-disk path.
    // Children are added here as they are visited; the parent reads its own
    // entry once the walk reaches it (contents-first order).
    let mut directories: HashMap<PathBuf, Directory> = HashMap::default();

    // Single upload handle reused for all directories; closed once at the root.
    let mut directory_putter = directory_service.as_ref().put_multiple_start();

    for entry in WalkDir::new(p.as_ref())
        .follow_links(false)
        .follow_root_links(false)
        // We need to process a directory's children before processing
        // the directory itself in order to have all the data needed
        // to compute the hash.
        .contents_first(true)
        .sort_by_file_name()
    {
        // Entry could be a NotFound, if the root path specified does not exist.
        let entry = entry.map_err(|e| {
            Error::UnableToOpen(
                PathBuf::from(p.as_ref()),
                e.into_io_error().expect("walkdir err must be some"),
            )
        })?;

        // process_entry wants an Option<Directory> in case the entry points to a directory.
        // make sure to provide it.
        // If the directory has contents, we already have it in
        // `directories` due to the use of contents_first on WalkDir.
        let maybe_directory: Option<Directory> = {
            if entry.file_type().is_dir() {
                Some(
                    directories
                        .entry(entry.path().to_path_buf())
                        .or_default()
                        .clone(),
                )
            } else {
                None
            }
        };

        let node = process_entry(
            blob_service.clone(),
            &mut directory_putter,
            &entry,
            maybe_directory,
        )
        .await?;

        // depth 0 == the root itself; we're done once it has been processed.
        if entry.depth() == 0 {
            // Make sure all the directories are flushed.
            if entry.file_type().is_dir() {
                directory_putter.close().await?;
            }
            return Ok(node);
        } else {
            // calculate the parent path, and make sure we register the node there.
            // NOTE: entry.depth() > 0
            let parent_path = entry.path().parent().unwrap().to_path_buf();

            // record node in parent directory, creating a new [proto:Directory] if not there yet.
            let parent_directory = directories.entry(parent_path).or_default();
            match node {
                Node::Directory(e) => parent_directory.directories.push(e),
                Node::File(e) => parent_directory.files.push(e),
                Node::Symlink(e) => parent_directory.symlinks.push(e),
            }
        }
    }
    // unreachable, we already bailed out before if root doesn't exist.
    unreachable!()
}
|