feat(tvix/nix-compat): NAR reader

Change-Id: I50d51baf62c0419eaf17f0dc262f728aaff9794d
Reviewed-on: https://cl.tvl.fyi/c/depot/+/9688
Reviewed-by: raitobezarius <tvl@lahfa.xyz>
Tested-by: BuildkiteCI
Reviewed-by: flokli <flokli@flokli.de>
This commit is contained in:
edef 2023-10-15 14:59:39 +00:00
parent 08b98b7503
commit 6638f4d4ea
8 changed files with 530 additions and 4 deletions

View file

@ -1 +1,4 @@
mod wire;
pub mod reader;
pub mod writer;

View file

@ -0,0 +1,229 @@
//! Parser for the Nix archive format, aka NAR.
//!
//! NAR files (and their hashed representations) are used in C++ Nix for
//! a variety of things, including addressing fixed-output derivations
//! and transferring store paths between Nix stores.
use std::io::{
self,
ErrorKind::{InvalidData, UnexpectedEof},
Read,
};
// Required reading for understanding this module.
use crate::nar::wire;
mod read;
pub type Reader<'a> = dyn Read + Send + 'a;
/// Start reading a NAR file from `reader`.
pub fn open<'a, 'r>(reader: &'a mut Reader<'r>) -> io::Result<Node<'a, 'r>> {
read::token(reader, &wire::TOK_NAR)?;
Node::new(reader)
}
pub enum Node<'a, 'r> {
Symlink {
target: Vec<u8>,
},
File {
executable: bool,
reader: FileReader<'a, 'r>,
},
Directory(DirReader<'a, 'r>),
}
impl<'a, 'r> Node<'a, 'r> {
/// Start reading a [Node], matching the next [wire::Node].
///
/// Reading the terminating [wire::TOK_PAR] is done immediately for [Node::Symlink],
/// but is otherwise left to [DirReader] or [FileReader].
fn new(reader: &'a mut Reader<'r>) -> io::Result<Self> {
Ok(match read::tag(reader)? {
wire::Node::Sym => {
let target = read::bytes(reader, wire::MAX_TARGET_LEN)?;
if target.is_empty() || target.contains(&0) {
return Err(InvalidData.into());
}
read::token(reader, &wire::TOK_PAR)?;
Node::Symlink { target }
}
tag @ (wire::Node::Reg | wire::Node::Exe) => {
let len = read::u64(reader)?;
Node::File {
executable: tag == wire::Node::Exe,
reader: FileReader::new(reader, len)?,
}
}
wire::Node::Dir => Node::Directory(DirReader::new(reader)),
})
}
}
/// File contents, readable through the [Read] trait.
///
/// It comes with some caveats:
/// * You must always read the entire file, unless you intend to abandon the entire archive reader.
/// * You must abandon the entire archive reader upon the first error.
///
/// It's fine to read exactly `reader.len()` bytes without ever seeing an explicit EOF.
///
/// TODO(edef): enforce these in `#[cfg(debug_assertions)]`
pub struct FileReader<'a, 'r> {
reader: &'a mut Reader<'r>,
len: u64,
/// Truncated original file length for padding computation.
/// We only care about the 3 least significant bits; semantically, this is a u3.
pad: u8,
}
impl<'a, 'r> FileReader<'a, 'r> {
/// Instantiate a new reader, starting after [wire::TOK_REG] or [wire::TOK_EXE].
/// We handle the terminating [wire::TOK_PAR] on semantic EOF.
fn new(reader: &'a mut Reader<'r>, len: u64) -> io::Result<Self> {
// For zero-length files, we have to read the terminating TOK_PAR
// immediately, since FileReader::read may never be called; we've
// already reached semantic EOF by definition.
if len == 0 {
read::token(reader, &wire::TOK_PAR)?;
}
Ok(Self {
reader,
len,
pad: len as u8,
})
}
pub fn is_empty(&self) -> bool {
self.len == 0
}
pub fn len(&self) -> u64 {
self.len
}
}
impl Read for FileReader<'_, '_> {
fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
if buf.is_empty() || self.is_empty() {
return Ok(0);
}
if buf.len() as u64 > self.len {
buf = &mut buf[..self.len as usize];
}
let n = self.reader.read(buf)?;
self.len -= n as u64;
if n == 0 {
return Err(UnexpectedEof.into());
}
// If we've reached semantic EOF, consume and verify the padding and terminating TOK_PAR.
// Files are padded to 64 bits (8 bytes), just like any other byte string in the wire format.
if self.is_empty() {
let pad = (self.pad & 7) as usize;
if pad != 0 {
let mut buf = [0; 8];
self.reader.read_exact(&mut buf[pad..])?;
if buf != [0; 8] {
return Err(InvalidData.into());
}
}
read::token(self.reader, &wire::TOK_PAR)?;
}
Ok(n)
}
}
/// A directory iterator, yielding a sequence of [Node]s.
/// It must be fully consumed before reading further from the [DirReader] that produced it, if any.
pub struct DirReader<'a, 'r> {
reader: &'a mut Reader<'r>,
/// Previous directory entry name.
/// We have to hang onto this to enforce name monotonicity.
prev_name: Option<Vec<u8>>,
}
pub struct Entry<'a, 'r> {
pub name: Vec<u8>,
pub node: Node<'a, 'r>,
}
impl<'a, 'r> DirReader<'a, 'r> {
fn new(reader: &'a mut Reader<'r>) -> Self {
Self {
reader,
prev_name: None,
}
}
/// Read the next [Entry] from the directory.
///
/// We explicitly don't implement [Iterator], since treating this as
/// a regular Rust iterator will surely lead you astray.
///
/// * You must always consume the entire iterator, unless you abandon the entire archive reader.
/// * You must abandon the entire archive reader on the first error.
/// * You must abandon the directory reader upon the first [None].
/// * Even if you know the amount of elements up front, you must keep reading until you encounter [None].
///
/// TODO(edef): enforce these in `#[cfg(debug_assertions)]`
#[allow(clippy::should_implement_trait)]
pub fn next(&mut self) -> io::Result<Option<Entry>> {
// COME FROM the previous iteration: if we've already read an entry,
// read its terminating TOK_PAR here.
if self.prev_name.is_some() {
read::token(self.reader, &wire::TOK_PAR)?;
}
// Determine if there are more entries to follow
if let wire::Entry::None = read::tag(self.reader)? {
// We've reached the end of this directory.
return Ok(None);
}
let name = read::bytes(self.reader, wire::MAX_NAME_LEN)?;
if name.is_empty()
|| name.contains(&0)
|| name.contains(&b'/')
|| name == b"."
|| name == b".."
{
return Err(InvalidData.into());
}
// Enforce strict monotonicity of directory entry names.
match &mut self.prev_name {
None => {
self.prev_name = Some(name.clone());
}
Some(prev_name) => {
if *prev_name >= name {
return Err(InvalidData.into());
}
name[..].clone_into(prev_name);
}
}
read::token(self.reader, &wire::TOK_NOD)?;
Ok(Some(Entry {
name,
node: Node::new(&mut self.reader)?,
}))
}
}

View file

@ -0,0 +1,109 @@
//! Helpers for reading [crate::nar::wire] format.
use std::io::{
self,
ErrorKind::{Interrupted, InvalidData, UnexpectedEof},
};
use super::Reader;
use crate::nar::wire::Tag;
/// Consume a little-endian [u64] from the reader.
pub fn u64(reader: &mut Reader) -> io::Result<u64> {
let mut buf = [0; 8];
reader.read_exact(&mut buf)?;
Ok(u64::from_le_bytes(buf))
}
/// Consume a byte string of up to `max_len` bytes from the reader.
pub fn bytes(reader: &mut Reader, max_len: usize) -> io::Result<Vec<u8>> {
assert!(max_len <= isize::MAX as usize);
// read the length, and reject excessively large values
let len = self::u64(reader)?;
if len > max_len as u64 {
return Err(InvalidData.into());
}
// we know the length fits in a usize now
let len = len as usize;
// read the data and padding into a buffer
let buf_len = (len + 7) & !7;
let mut buf = vec![0; buf_len];
reader.read_exact(&mut buf)?;
// verify that the padding is all zeroes
for b in buf.drain(len..) {
if b != 0 {
return Err(InvalidData.into());
}
}
Ok(buf)
}
/// Consume a known token from the reader.
pub fn token<const N: usize>(reader: &mut Reader, token: &[u8; N]) -> io::Result<()> {
let mut buf = [0u8; N];
// This implements something similar to [Read::read_exact], but verifies that
// the input data matches the token while we read it. These two slices respectively
// represent the remaining token to be verified, and the remaining input buffer.
let mut token = &token[..];
let mut buf = &mut buf[..];
while !token.is_empty() {
match reader.read(buf) {
Ok(0) => {
return Err(UnexpectedEof.into());
}
Ok(n) => {
let (t, b);
(t, token) = token.split_at(n);
(b, buf) = buf.split_at_mut(n);
if t != b {
return Err(InvalidData.into());
}
}
Err(e) => {
if e.kind() != Interrupted {
return Err(e);
}
}
}
}
Ok(())
}
/// Consume a [Tag] from the reader.
pub fn tag<T: Tag>(reader: &mut Reader) -> io::Result<T> {
let mut buf = T::make_buf();
let buf = buf.as_mut();
// first read the known minimum length…
reader.read_exact(&mut buf[..T::MIN])?;
// then decide which tag we're expecting
let tag = T::from_u8(buf[T::OFF]).ok_or(InvalidData)?;
let (head, tail) = tag.as_bytes().split_at(T::MIN);
// make sure what we've read so far is valid
if buf[..T::MIN] != *head {
return Err(InvalidData.into());
}
// …then read the rest, if any
if !tail.is_empty() {
let rest = tail.len();
reader.read_exact(&mut buf[..rest])?;
// and make sure it's what we expect
if buf[..rest] != *tail {
return Err(InvalidData.into());
}
}
Ok(tag)
}

View file

@ -108,3 +108,25 @@ fn tokens() {
assert_eq!(tok, token(xs));
}
}
pub use tag::Tag;
mod tag;
tag::make! {
/// These are the node tokens, succeeding [TOK_NAR] or [TOK_NOD],
/// and preceding the next variable-length element.
pub enum Node[16] {
Sym = TOK_SYM,
Reg = TOK_REG,
Exe = TOK_EXE,
Dir = TOK_DIR,
}
/// Directory entry or terminator
pub enum Entry[0] {
/// End of directory
None = TOK_PAR,
/// Directory entry, which must be followed by [Node]
Some = TOK_ENT,
}
}

View file

@ -0,0 +1,165 @@
/// A type implementing Tag represents a static hash set of byte strings,
/// with a very simple perfect hash function: every element has a unique
/// discriminant at a common byte offset. The values of the type represent
/// the members by this single discriminant byte; they are indices into the
/// hash set.
pub trait Tag: Sized {
/// Discriminant offset
const OFF: usize;
/// Minimum variant length
const MIN: usize;
/// Minimal suitably sized buffer for reading the wire representation
/// HACK: This is a workaround for const generics limitations.
type Buf: AsMut<[u8]> + Send;
/// Make an instance of [Self::Buf]
fn make_buf() -> Self::Buf;
/// Convert a discriminant into the corresponding variant
fn from_u8(x: u8) -> Option<Self>;
/// Convert a variant back into the wire representation
fn as_bytes(&self) -> &'static [u8];
}
/// Generate an enum implementing [Tag], enforcing at compile time that
/// the discriminant values are distinct.
macro_rules! make {
(
$(
$(#[doc = $doc:expr])*
$vis:vis enum $Enum:ident[$off:expr] {
$(
$(#[doc = $var_doc:expr])*
$Var:ident = $TOK:ident,
)+
}
)*
) => {
$(
$(#[doc = $doc])*
#[derive(Debug, PartialEq, Eq)]
#[repr(u8)]
$vis enum $Enum {
$(
$(#[doc = $var_doc])*
$Var = $TOK[$Enum::OFF]
),+
}
impl Tag for $Enum {
/// Discriminant offset
const OFF: usize = $off;
/// Minimum variant length
const MIN: usize = tag::min_of(&[$($TOK.len()),+]);
/// Minimal suitably sized buffer for reading the wire representation
type Buf = [u8; tag::buf_of(&[$($TOK.len()),+])];
/// Make an instance of [Self::Buf]
#[inline(always)]
fn make_buf() -> Self::Buf {
[0u8; tag::buf_of(&[$($TOK.len()),+])]
}
/// Convert a discriminant into the corresponding variant
#[inline(always)]
fn from_u8(x: u8) -> Option<Self> {
#[allow(non_upper_case_globals)]
mod __variant {
$(
pub const $Var: u8 = super::$Enum::$Var as u8;
)+
}
match x {
$(__variant::$Var => Some(Self::$Var),)+
_ => None
}
}
/// Convert a variant back into the wire representation
#[inline(always)]
fn as_bytes(&self) -> &'static [u8] {
match self {
$(Self::$Var => &$TOK,)+
}
}
}
)*
};
}
pub(crate) use make;
#[cfg(test)]
mod test {
use super::super::tag::{self, Tag};
const TOK_A: [u8; 3] = [0xed, 0xef, 0x1c];
const TOK_B: [u8; 3] = [0xed, 0xf0, 0x1c];
const OFFSET: usize = 1;
make! {
enum Token[OFFSET] {
A = TOK_A,
B = TOK_B,
}
}
#[test]
fn example() {
assert_eq!(Token::from_u8(0xed), None);
let tag = Token::from_u8(0xef).unwrap();
assert_eq!(tag.as_bytes(), &TOK_A[..]);
let tag = Token::from_u8(0xf0).unwrap();
assert_eq!(tag.as_bytes(), &TOK_B[..]);
}
}
// The following functions are written somewhat unusually,
// since they're const functions that cannot use iterators.
/// Maximum element of a slice
const fn max_of(mut xs: &[usize]) -> usize {
let mut y = usize::MIN;
while let &[x, ref tail @ ..] = xs {
y = if x > y { x } else { y };
xs = tail;
}
y
}
/// Minimum element of a slice
pub const fn min_of(mut xs: &[usize]) -> usize {
let mut y = usize::MAX;
while let &[x, ref tail @ ..] = xs {
y = if x < y { x } else { y };
xs = tail;
}
y
}
/// Minimum buffer size to contain either of `0..Tag::MIN` and `Tag::MIN..`
/// at a particular time, for all possible tag wire representations, given
/// the sizes of all wire representations.
///
/// # Example
///
/// ```plain
/// OFF = 16
/// MIN = 24
/// MAX = 64
///
/// BUF = max(MIN, MAX-MIN)
/// = max(24, 64-24)
/// = max(24, 40)
/// = 40
/// ```
pub const fn buf_of(xs: &[usize]) -> usize {
max_of(&[min_of(xs), max_of(xs) - min_of(xs)])
}

View file

@ -30,7 +30,7 @@
//! # });
//! ```
use super::wire;
use crate::nar::wire;
use bstr::ByteSlice;
use futures_util::{AsyncBufRead, AsyncBufReadExt, AsyncWrite, AsyncWriteExt};
use std::{

View file

@ -1,7 +1,5 @@
pub use sync::*;
mod wire;
pub mod sync;
#[cfg(feature = "async")]

View file

@ -28,7 +28,7 @@
//! # Ok::<(), std::io::Error>(())
//! ```
use super::wire;
use crate::nar::wire;
use bstr::ByteSlice;
use std::io::{
self, BufRead,