feat(tvix/castore/blobsvc/grpc): read data in chunks

Whenever this encounters an open_read(), it'll first check for more
granular chunking. If there's more granular chunking data available, a
ChunkedReader is constructed (which supports seeking backwards).

This currently is still a bit stupid, and doesn't compose, as
`ChunkedReader` uses `self` as the `BlobService` to ask for the
individual chunks.

In store composition future, we might want to compose this differently,
essentially constructing `ChunkedReader` with another `BlobService`
representing the entire hierarchy, so there's a chance to locally cache
things, and do less requests.

Change-Id: I22e0df4d6245f666d083b4f0b7114d3ac41d1dce
Reviewed-on: https://cl.tvl.fyi/c/depot/+/11185
Tested-by: BuildkiteCI
Reviewed-by: Connor Brewster <cbrewster@hey.com>
This commit is contained in:
Florian Klink 2024-03-18 14:33:08 +02:00 committed by flokli
parent 50c81d7838
commit 82f8ce8b7d

View file

@ -1,10 +1,10 @@
use super::{naive_seeker::NaiveSeeker, BlobReader, BlobService, BlobWriter};
use super::{naive_seeker::NaiveSeeker, BlobReader, BlobService, BlobWriter, ChunkedReader};
use crate::{
proto::{self, stat_blob_response::ChunkMeta},
B3Digest,
};
use futures::sink::SinkExt;
use std::{io, pin::pin, task::Poll};
use std::{io, pin::pin, sync::Arc, task::Poll};
use tokio::io::AsyncWriteExt;
use tokio::task::JoinHandle;
use tokio_stream::{wrappers::ReceiverStream, StreamExt};
@ -54,9 +54,20 @@ impl BlobService for GRPCBlobService {
#[instrument(skip(self, digest), fields(blob.digest=%digest), err)]
async fn open_read(&self, digest: &B3Digest) -> io::Result<Option<Box<dyn BlobReader>>> {
// First try to get a list of chunks. In case there's only one chunk returned,
// or the backend does not support chunking, return a NaiveSeeker.
// Otherwise use a ChunkedReader.
// TODO: we should check if we want to replace NaiveSeeker with a simple
// Cursor on the raw `Vec<u8>`, as seeking backwards is something that
// clients generally do.
match self.chunks(digest).await {
Ok(None) => Ok(None),
Ok(Some(chunks)) => {
if chunks.is_empty() || chunks.len() == 1 {
// No more granular chunking info, treat this as an individual chunk.
// Get a stream of [proto::BlobChunk], or return an error if the blob
// doesn't exist.
match self
return match self
.grpc_client
.clone()
.read(proto::ReadBlobRequest {
@ -65,8 +76,6 @@ impl BlobService for GRPCBlobService {
.await
{
Ok(stream) => {
// on success, this is a stream of tonic::Result<proto::BlobChunk>,
// so access .data and map errors into std::io::Error.
let data_stream = stream.into_inner().map(|e| {
e.map(|c| c.data)
.map_err(|s| std::io::Error::new(io::ErrorKind::InvalidData, s))
@ -79,6 +88,25 @@ impl BlobService for GRPCBlobService {
}
Err(e) if e.code() == Code::NotFound => Ok(None),
Err(e) => Err(io::Error::new(io::ErrorKind::Other, e)),
};
}
// The chunked case. Let ChunkedReader do individual reads.
// TODO: we should store the chunking data in some local cache,
// so `ChunkedReader` doesn't call `self.chunks` *again* for every chunk.
// Think about how store composition will fix this.
let chunked_reader = ChunkedReader::from_chunks(
chunks.into_iter().map(|chunk| {
(
chunk.digest.try_into().expect("invalid b3 digest"),
chunk.size,
)
}),
Arc::new(self.clone()) as Arc<dyn BlobService>,
);
Ok(Some(Box::new(chunked_reader)))
}
Err(e) => Err(e)?,
}
}