fix(tvix/castore/import): check small blobs first

ConcurrentBlobUploader buffers small blobs in memory and then uploads
them to the BlobService in the background.

In these cases we already know the hash of the whole blob, so we can
check whether it exists before uploading it.
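
For illustration, the pattern looks roughly like this (a minimal
sketch; the simplified BlobService trait, the raw [u8; 32] digest type
and the helper name are assumptions made for this sketch, not the
actual tvix-castore API):

  use std::io;

  // Simplified stand-in for the real BlobService trait (assumption
  // for illustration; the actual tvix-castore trait differs).
  trait BlobService {
      async fn has(&self, digest: &[u8; 32]) -> io::Result<bool>;
      async fn put(&self, data: Vec<u8>) -> io::Result<[u8; 32]>;
  }

  // Check-before-upload: the digest of a fully buffered blob is
  // already known, so ask the service first and skip the write on a
  // hit.
  async fn upload_small_blob(
      blob_service: &impl BlobService,
      expected_digest: [u8; 32],
      buffer: Vec<u8>,
  ) -> io::Result<()> {
      if blob_service.has(&expected_digest).await? {
          // Already present (e.g. the empty blob): no duplicate write.
          return Ok(());
      }
      let digest = blob_service.put(buffer).await?;
      assert_eq!(digest, expected_digest, "blob digest mismatch");
      Ok(())
  }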

We were, however, not doing this, which caused rate-limiting issues in
GCS: it enforces an update limit of one write per second to the same
object, which we ran into especially frequently with the empty blob.

This reduces the number of writes of the same blob considerably.

In the future, we might be able to drop this, as our chunked blob
upload protocol gets smarter and covers these cases.

Change-Id: Icf482df815812f80a0b65cec0426f8e686308abb
Reviewed-on: https://cl.tvl.fyi/c/depot/+/12497
Tested-by: BuildkiteCI
Autosubmit: flokli <flokli@flokli.de>
Reviewed-by: Connor Brewster <cbrewster@hey.com>
Florian Klink 2024-09-19 11:27:51 +03:00 committed by clbot
parent 1f5a20736a
commit 21e5fc024d

@@ -28,6 +28,9 @@ pub enum Error {
     #[error("unable to read blob contents for {0}: {1}")]
     BlobRead(PathBuf, std::io::Error),
 
+    #[error("unable to check whether blob at {0} already exists: {1}")]
+    BlobCheck(PathBuf, std::io::Error),
+
     // FUTUREWORK: proper error for blob finalize
     #[error("unable to finalize blob {0}: {1}")]
     BlobFinalize(PathBuf, std::io::Error),
@@ -118,6 +121,16 @@ where
                 let path = path.to_owned();
                 let r = Cursor::new(buffer);
                 async move {
+                    // We know the blob digest already, check it exists before sending it.
+                    if blob_service
+                        .has(&expected_digest)
+                        .await
+                        .map_err(|e| Error::BlobCheck(path.clone(), e))?
+                    {
+                        drop(permit);
+                        return Ok(());
+                    }
+
                     let digest = upload_blob(&blob_service, &path, expected_size, r).await?;
                     assert_eq!(digest, expected_digest, "Tvix bug: blob digest mismatch");