feat(tvix/nix-compat): NAR reader
Change-Id: I50d51baf62c0419eaf17f0dc262f728aaff9794d
Reviewed-on: https://cl.tvl.fyi/c/depot/+/9688
Reviewed-by: raitobezarius <tvl@lahfa.xyz>
Tested-by: BuildkiteCI
Reviewed-by: flokli <flokli@flokli.de>

parent 08b98b7503, commit 6638f4d4ea
8 changed files with 530 additions and 4 deletions

@@ -1 +1,4 @@
+mod wire;
+
+pub mod reader;
 pub mod writer;

tvix/nix-compat/src/nar/reader/mod.rs (new file, 229 lines added)
@@ -0,0 +1,229 @@
//! Parser for the Nix archive format, aka NAR.
//!
//! NAR files (and their hashed representations) are used in C++ Nix for
//! a variety of things, including addressing fixed-output derivations
//! and transferring store paths between Nix stores.

use std::io::{
    self,
    ErrorKind::{InvalidData, UnexpectedEof},
    Read,
};

// Required reading for understanding this module.
use crate::nar::wire;

mod read;

pub type Reader<'a> = dyn Read + Send + 'a;

/// Start reading a NAR file from `reader`.
pub fn open<'a, 'r>(reader: &'a mut Reader<'r>) -> io::Result<Node<'a, 'r>> {
    read::token(reader, &wire::TOK_NAR)?;
    Node::new(reader)
}

pub enum Node<'a, 'r> {
    Symlink {
        target: Vec<u8>,
    },
    File {
        executable: bool,
        reader: FileReader<'a, 'r>,
    },
    Directory(DirReader<'a, 'r>),
}

impl<'a, 'r> Node<'a, 'r> {
    /// Start reading a [Node], matching the next [wire::Node].
    ///
    /// Reading the terminating [wire::TOK_PAR] is done immediately for [Node::Symlink],
    /// but is otherwise left to [DirReader] or [FileReader].
    fn new(reader: &'a mut Reader<'r>) -> io::Result<Self> {
        Ok(match read::tag(reader)? {
            wire::Node::Sym => {
                let target = read::bytes(reader, wire::MAX_TARGET_LEN)?;

                if target.is_empty() || target.contains(&0) {
                    return Err(InvalidData.into());
                }

                read::token(reader, &wire::TOK_PAR)?;

                Node::Symlink { target }
            }
            tag @ (wire::Node::Reg | wire::Node::Exe) => {
                let len = read::u64(reader)?;

                Node::File {
                    executable: tag == wire::Node::Exe,
                    reader: FileReader::new(reader, len)?,
                }
            }
            wire::Node::Dir => Node::Directory(DirReader::new(reader)),
        })
    }
}

/// File contents, readable through the [Read] trait.
///
/// It comes with some caveats:
/// * You must always read the entire file, unless you intend to abandon the entire archive reader.
/// * You must abandon the entire archive reader upon the first error.
///
/// It's fine to read exactly `reader.len()` bytes without ever seeing an explicit EOF.
///
/// TODO(edef): enforce these in `#[cfg(debug_assertions)]`
pub struct FileReader<'a, 'r> {
    reader: &'a mut Reader<'r>,
    len: u64,
    /// Truncated original file length for padding computation.
    /// We only care about the 3 least significant bits; semantically, this is a u3.
    pad: u8,
}

impl<'a, 'r> FileReader<'a, 'r> {
    /// Instantiate a new reader, starting after [wire::TOK_REG] or [wire::TOK_EXE].
    /// We handle the terminating [wire::TOK_PAR] on semantic EOF.
    fn new(reader: &'a mut Reader<'r>, len: u64) -> io::Result<Self> {
        // For zero-length files, we have to read the terminating TOK_PAR
        // immediately, since FileReader::read may never be called; we've
        // already reached semantic EOF by definition.
        if len == 0 {
            read::token(reader, &wire::TOK_PAR)?;
        }

        Ok(Self {
            reader,
            len,
            pad: len as u8,
        })
    }

    pub fn is_empty(&self) -> bool {
        self.len == 0
    }

    pub fn len(&self) -> u64 {
        self.len
    }
}

impl Read for FileReader<'_, '_> {
    fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
        if buf.is_empty() || self.is_empty() {
            return Ok(0);
        }

        if buf.len() as u64 > self.len {
            buf = &mut buf[..self.len as usize];
        }

        let n = self.reader.read(buf)?;
        self.len -= n as u64;

        if n == 0 {
            return Err(UnexpectedEof.into());
        }

        // If we've reached semantic EOF, consume and verify the padding and terminating TOK_PAR.
        // Files are padded to 64 bits (8 bytes), just like any other byte string in the wire format.
        if self.is_empty() {
            let pad = (self.pad & 7) as usize;

            if pad != 0 {
                let mut buf = [0; 8];
                self.reader.read_exact(&mut buf[pad..])?;

                if buf != [0; 8] {
                    return Err(InvalidData.into());
                }
            }

            read::token(self.reader, &wire::TOK_PAR)?;
        }

        Ok(n)
    }
}

/// A directory iterator, yielding a sequence of [Node]s.
/// It must be fully consumed before reading further from the [DirReader] that produced it, if any.
pub struct DirReader<'a, 'r> {
    reader: &'a mut Reader<'r>,
    /// Previous directory entry name.
    /// We have to hang onto this to enforce name monotonicity.
    prev_name: Option<Vec<u8>>,
}

pub struct Entry<'a, 'r> {
    pub name: Vec<u8>,
    pub node: Node<'a, 'r>,
}

impl<'a, 'r> DirReader<'a, 'r> {
    fn new(reader: &'a mut Reader<'r>) -> Self {
        Self {
            reader,
            prev_name: None,
        }
    }

    /// Read the next [Entry] from the directory.
    ///
    /// We explicitly don't implement [Iterator], since treating this as
    /// a regular Rust iterator will surely lead you astray.
    ///
    /// * You must always consume the entire iterator, unless you abandon the entire archive reader.
    /// * You must abandon the entire archive reader on the first error.
    /// * You must abandon the directory reader upon the first [None].
    /// * Even if you know the amount of elements up front, you must keep reading until you encounter [None].
    ///
    /// TODO(edef): enforce these in `#[cfg(debug_assertions)]`
    #[allow(clippy::should_implement_trait)]
    pub fn next(&mut self) -> io::Result<Option<Entry>> {
        // COME FROM the previous iteration: if we've already read an entry,
        // read its terminating TOK_PAR here.
        if self.prev_name.is_some() {
            read::token(self.reader, &wire::TOK_PAR)?;
        }

        // Determine if there are more entries to follow
        if let wire::Entry::None = read::tag(self.reader)? {
            // We've reached the end of this directory.
            return Ok(None);
        }

        let name = read::bytes(self.reader, wire::MAX_NAME_LEN)?;

        if name.is_empty()
            || name.contains(&0)
            || name.contains(&b'/')
            || name == b"."
            || name == b".."
        {
            return Err(InvalidData.into());
        }

        // Enforce strict monotonicity of directory entry names.
        match &mut self.prev_name {
            None => {
                self.prev_name = Some(name.clone());
            }
            Some(prev_name) => {
                if *prev_name >= name {
                    return Err(InvalidData.into());
                }

                name[..].clone_into(prev_name);
            }
        }

        read::token(self.reader, &wire::TOK_NOD)?;

        Ok(Some(Entry {
            name,
            node: Node::new(&mut self.reader)?,
        }))
    }
}
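
Purely as an illustration of how these pieces compose, here is a minimal walker over a NAR archive against the API above. It is a sketch, not part of the change: the crate path `nix_compat::nar::reader`, the `walk` helper, and the `example.nar` input are assumptions. It follows the documented caveats on [FileReader] and [DirReader::next]: read every file to the end, drain every directory until `None`, and give up entirely on the first error.

use std::io;

use nix_compat::nar::reader::{self as nar, Node};

/// Recursively consume a parsed [Node], printing one line per entry.
fn walk(node: Node<'_, '_>, path: &mut Vec<u8>) -> io::Result<()> {
    match node {
        Node::Symlink { target } => {
            println!(
                "{} -> {}",
                String::from_utf8_lossy(path),
                String::from_utf8_lossy(&target)
            );
        }
        Node::File { executable, mut reader } => {
            // The file body must be drained in full before returning to the parent.
            let size = io::copy(&mut reader, &mut io::sink())?;
            println!(
                "{} ({size} bytes, executable: {executable})",
                String::from_utf8_lossy(path)
            );
        }
        Node::Directory(mut dir) => {
            // Keep calling next() until it yields Ok(None).
            while let Some(entry) = dir.next()? {
                let prev_len = path.len();
                path.push(b'/');
                path.extend_from_slice(&entry.name);
                walk(entry.node, path)?;
                path.truncate(prev_len);
            }
        }
    }
    Ok(())
}

fn main() -> io::Result<()> {
    // `example.nar` is a placeholder input for this sketch.
    let mut file = std::fs::File::open("example.nar")?;
    let root = nar::open(&mut file)?;
    walk(root, &mut b".".to_vec())
}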

tvix/nix-compat/src/nar/reader/read.rs (new file, 109 lines added)
@@ -0,0 +1,109 @@
//! Helpers for reading [crate::nar::wire] format.

use std::io::{
    self,
    ErrorKind::{Interrupted, InvalidData, UnexpectedEof},
};

use super::Reader;
use crate::nar::wire::Tag;

/// Consume a little-endian [u64] from the reader.
pub fn u64(reader: &mut Reader) -> io::Result<u64> {
    let mut buf = [0; 8];
    reader.read_exact(&mut buf)?;
    Ok(u64::from_le_bytes(buf))
}

/// Consume a byte string of up to `max_len` bytes from the reader.
pub fn bytes(reader: &mut Reader, max_len: usize) -> io::Result<Vec<u8>> {
    assert!(max_len <= isize::MAX as usize);

    // read the length, and reject excessively large values
    let len = self::u64(reader)?;
    if len > max_len as u64 {
        return Err(InvalidData.into());
    }
    // we know the length fits in a usize now
    let len = len as usize;

    // read the data and padding into a buffer
    let buf_len = (len + 7) & !7;
    let mut buf = vec![0; buf_len];
    reader.read_exact(&mut buf)?;

    // verify that the padding is all zeroes
    for b in buf.drain(len..) {
        if b != 0 {
            return Err(InvalidData.into());
        }
    }

    Ok(buf)
}

/// Consume a known token from the reader.
pub fn token<const N: usize>(reader: &mut Reader, token: &[u8; N]) -> io::Result<()> {
    let mut buf = [0u8; N];

    // This implements something similar to [Read::read_exact], but verifies that
    // the input data matches the token while we read it. These two slices respectively
    // represent the remaining token to be verified, and the remaining input buffer.
    let mut token = &token[..];
    let mut buf = &mut buf[..];

    while !token.is_empty() {
        match reader.read(buf) {
            Ok(0) => {
                return Err(UnexpectedEof.into());
            }
            Ok(n) => {
                let (t, b);
                (t, token) = token.split_at(n);
                (b, buf) = buf.split_at_mut(n);

                if t != b {
                    return Err(InvalidData.into());
                }
            }
            Err(e) => {
                if e.kind() != Interrupted {
                    return Err(e);
                }
            }
        }
    }

    Ok(())
}

/// Consume a [Tag] from the reader.
pub fn tag<T: Tag>(reader: &mut Reader) -> io::Result<T> {
    let mut buf = T::make_buf();
    let buf = buf.as_mut();

    // first read the known minimum length…
    reader.read_exact(&mut buf[..T::MIN])?;

    // then decide which tag we're expecting
    let tag = T::from_u8(buf[T::OFF]).ok_or(InvalidData)?;
    let (head, tail) = tag.as_bytes().split_at(T::MIN);

    // make sure what we've read so far is valid
    if buf[..T::MIN] != *head {
        return Err(InvalidData.into());
    }

    // …then read the rest, if any
    if !tail.is_empty() {
        let rest = tail.len();
        reader.read_exact(&mut buf[..rest])?;

        // and make sure it's what we expect
        if buf[..rest] != *tail {
            return Err(InvalidData.into());
        }
    }

    Ok(tag)
}
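
For reference, the framing that `u64` and `bytes` consume is a little-endian 64-bit length, the payload, then zero padding up to the next 8-byte boundary, mirroring the `(len + 7) & !7` computation above. The following test-style sketch is illustrative only; `nar_bytes` is a hypothetical helper written for this example, not part of the change.

/// Hypothetical encoder for a single NAR byte string, for illustration only.
fn nar_bytes(payload: &[u8]) -> Vec<u8> {
    let mut out = Vec::new();
    // little-endian u64 length prefix
    out.extend_from_slice(&(payload.len() as u64).to_le_bytes());
    // payload, then zero padding to the next multiple of 8
    out.extend_from_slice(payload);
    out.resize(8 + ((payload.len() + 7) & !7), 0);
    out
}

#[test]
fn byte_string_framing() {
    // A 13-byte payload is padded to 16 bytes, preceded by the 8-byte length prefix.
    let framed = nar_bytes(b"nix-archive-1");
    assert_eq!(framed.len(), 8 + 16);
    assert_eq!(&framed[..8], &13u64.to_le_bytes());
    assert_eq!(&framed[8..21], &b"nix-archive-1"[..]);
    assert_eq!(&framed[21..], &[0u8, 0, 0]);
}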

@@ -108,3 +108,25 @@ fn tokens() {
         assert_eq!(tok, token(xs));
     }
 }
+
+pub use tag::Tag;
+mod tag;
+
+tag::make! {
+    /// These are the node tokens, succeeding [TOK_NAR] or [TOK_NOD],
+    /// and preceding the next variable-length element.
+    pub enum Node[16] {
+        Sym = TOK_SYM,
+        Reg = TOK_REG,
+        Exe = TOK_EXE,
+        Dir = TOK_DIR,
+    }
+
+    /// Directory entry or terminator
+    pub enum Entry[0] {
+        /// End of directory
+        None = TOK_PAR,
+        /// Directory entry, which must be followed by [Node]
+        Some = TOK_ENT,
+    }
+}

tvix/nix-compat/src/nar/wire/tag.rs (new file, 165 lines added)
@@ -0,0 +1,165 @@
/// A type implementing Tag represents a static hash set of byte strings,
/// with a very simple perfect hash function: every element has a unique
/// discriminant at a common byte offset. The values of the type represent
/// the members by this single discriminant byte; they are indices into the
/// hash set.
pub trait Tag: Sized {
    /// Discriminant offset
    const OFF: usize;
    /// Minimum variant length
    const MIN: usize;

    /// Minimal suitably sized buffer for reading the wire representation
    /// HACK: This is a workaround for const generics limitations.
    type Buf: AsMut<[u8]> + Send;

    /// Make an instance of [Self::Buf]
    fn make_buf() -> Self::Buf;

    /// Convert a discriminant into the corresponding variant
    fn from_u8(x: u8) -> Option<Self>;

    /// Convert a variant back into the wire representation
    fn as_bytes(&self) -> &'static [u8];
}

/// Generate an enum implementing [Tag], enforcing at compile time that
/// the discriminant values are distinct.
macro_rules! make {
    (
        $(
            $(#[doc = $doc:expr])*
            $vis:vis enum $Enum:ident[$off:expr] {
                $(
                    $(#[doc = $var_doc:expr])*
                    $Var:ident = $TOK:ident,
                )+
            }
        )*
    ) => {
        $(
            $(#[doc = $doc])*
            #[derive(Debug, PartialEq, Eq)]
            #[repr(u8)]
            $vis enum $Enum {
                $(
                    $(#[doc = $var_doc])*
                    $Var = $TOK[$Enum::OFF]
                ),+
            }

            impl Tag for $Enum {
                /// Discriminant offset
                const OFF: usize = $off;
                /// Minimum variant length
                const MIN: usize = tag::min_of(&[$($TOK.len()),+]);

                /// Minimal suitably sized buffer for reading the wire representation
                type Buf = [u8; tag::buf_of(&[$($TOK.len()),+])];

                /// Make an instance of [Self::Buf]
                #[inline(always)]
                fn make_buf() -> Self::Buf {
                    [0u8; tag::buf_of(&[$($TOK.len()),+])]
                }

                /// Convert a discriminant into the corresponding variant
                #[inline(always)]
                fn from_u8(x: u8) -> Option<Self> {
                    #[allow(non_upper_case_globals)]
                    mod __variant {
                        $(
                            pub const $Var: u8 = super::$Enum::$Var as u8;
                        )+
                    }

                    match x {
                        $(__variant::$Var => Some(Self::$Var),)+
                        _ => None
                    }
                }

                /// Convert a variant back into the wire representation
                #[inline(always)]
                fn as_bytes(&self) -> &'static [u8] {
                    match self {
                        $(Self::$Var => &$TOK,)+
                    }
                }
            }
        )*
    };
}

pub(crate) use make;

#[cfg(test)]
mod test {
    use super::super::tag::{self, Tag};

    const TOK_A: [u8; 3] = [0xed, 0xef, 0x1c];
    const TOK_B: [u8; 3] = [0xed, 0xf0, 0x1c];

    const OFFSET: usize = 1;

    make! {
        enum Token[OFFSET] {
            A = TOK_A,
            B = TOK_B,
        }
    }

    #[test]
    fn example() {
        assert_eq!(Token::from_u8(0xed), None);

        let tag = Token::from_u8(0xef).unwrap();
        assert_eq!(tag.as_bytes(), &TOK_A[..]);

        let tag = Token::from_u8(0xf0).unwrap();
        assert_eq!(tag.as_bytes(), &TOK_B[..]);
    }
}

// The following functions are written somewhat unusually,
// since they're const functions that cannot use iterators.

/// Maximum element of a slice
const fn max_of(mut xs: &[usize]) -> usize {
    let mut y = usize::MIN;
    while let &[x, ref tail @ ..] = xs {
        y = if x > y { x } else { y };
        xs = tail;
    }
    y
}

/// Minimum element of a slice
pub const fn min_of(mut xs: &[usize]) -> usize {
    let mut y = usize::MAX;
    while let &[x, ref tail @ ..] = xs {
        y = if x < y { x } else { y };
        xs = tail;
    }
    y
}

/// Minimum buffer size to contain either of `0..Tag::MIN` and `Tag::MIN..`
/// at a particular time, for all possible tag wire representations, given
/// the sizes of all wire representations.
///
/// # Example
///
/// ```plain
/// OFF = 16
/// MIN = 24
/// MAX = 64
///
/// BUF = max(MIN, MAX-MIN)
///     = max(24, 64-24)
///     = max(24, 40)
///     = 40
/// ```
pub const fn buf_of(xs: &[usize]) -> usize {
    max_of(&[min_of(xs), max_of(xs) - min_of(xs)])
}
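
As a worked example of these helpers (a sketch only, reusing the numbers from the `buf_of` doc comment), a compile-time check along these lines could sit next to them in this module:

// Hypothetical compile-time sanity check, not part of the change:
// lengths 24 and 64 give MIN = 24 and a scratch buffer of max(24, 64 - 24) = 40.
const _: () = {
    assert!(min_of(&[24, 64]) == 24);
    assert!(max_of(&[24, 64]) == 64);
    assert!(buf_of(&[24, 64]) == 40);
};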

@@ -30,7 +30,7 @@
 //! # });
 //! ```
 
-use super::wire;
+use crate::nar::wire;
 use bstr::ByteSlice;
 use futures_util::{AsyncBufRead, AsyncBufReadExt, AsyncWrite, AsyncWriteExt};
 use std::{

@@ -1,7 +1,5 @@
 pub use sync::*;
 
-mod wire;
-
 pub mod sync;
 
 #[cfg(feature = "async")]

@@ -28,7 +28,7 @@
 //! # Ok::<(), std::io::Error>(())
 //! ```
 
-use super::wire;
+use crate::nar::wire;
 use bstr::ByteSlice;
 use std::io::{
     self, BufRead,