feat(Profpatsch): dump netencode spec & parser

The netencode standard, a no-nonsense extension of netstrings for
structured data.

Includes a nix generator module and a rust parsing library.

Imported from
e409df3861/pkgs/profpatsch/netencode

Original license GPLv3, but I’m the sole author, so I transfer it to
whatever license depot uses.

Change-Id: I4f6fa97120a0fd861eeef35085a3dd642ab7c407
Reviewed-on: https://cl.tvl.fyi/c/depot/+/2319
Tested-by: BuildkiteCI
Reviewed-by: Profpatsch <mail@profpatsch.de>
This commit is contained in:
Profpatsch 2021-01-02 14:59:23 +01:00
parent 1261616bff
commit f1c38e2560
4 changed files with 761 additions and 0 deletions

View file

@ -0,0 +1,48 @@
# Builds the netencode Rust library (with and without its unit tests),
# together with the crates it needs: nom, and nom's own dependencies.
{ pkgs, depot, ... }:
let
imports = {
inherit (depot.users.Profpatsch)
writers;
};
# Build-time dependency of nom (see `buildDependencies` below).
version-check = pkgs.buildRustCrate {
pname = "version-check";
version = "0.9.2";
crateName = "version-check";
sha256 = "1vwvc1mzwv8ana9jv8z933p2xzgj1533qwwl5zr8mi89azyhq21v";
};
# Regular dependency of nom (see `dependencies` below).
memchr = pkgs.buildRustCrate {
pname = "memchr";
version = "2.3.3";
crateName = "memchr";
sha256 = "1ivxvlswglk6wd46gadkbbsknr94gwryk6y21v64ja7x4icrpihw";
};
# Parser combinator crate used by netencode.rs.
nom = pkgs.buildRustCrate {
pname = "nom";
version = "5.1.1";
crateName = "nom";
sha256 = "1gb4r6mjwd645jqh02nhn60i7qkw8cgy3xq1r4clnmvz3cmkv1l0";
dependencies = [ memchr ];
buildDependencies = [ version-check ];
features = [ "std" "alloc" ];
};
# Shared definition of the netencode library; `tests` is passed through as
# `buildTests`, so the same source yields either the library or its tests.
# The source text is read from ./netencode.rs at evaluation time.
netencode-rs-common = tests: imports.writers.rustSimpleLib {
name = "netencode";
dependencies = [ nom ];
buildTests = tests;
release = false;
verbose = true;
} (builtins.readFile ./netencode.rs);
netencode-rs-tests = netencode-rs-common true;
netencode-rs = netencode-rs-common false;
in {
inherit
netencode-rs
netencode-rs-tests
;
}

View file

@ -0,0 +1,49 @@
# Nix generators for netencode values.
# Every function returns the encoded value as a string.
let

  # Length-prefixed value: `<tag><length of s>:<s><suffix>`.
  netstring = tag: suffix: s:
    "${tag}${toString (builtins.stringLength s)}:${s}${suffix}";

  # The unit value.
  unit = "u,";

  # Booleans are encoded as 1-bit naturals.
  n1 = b: if b then "n1:1," else "n1:0,";

  # Naturals and integers are NOT length-prefixed: the digit after `n`/`i`
  # is the bit-size exponent and is followed directly by `:<value>,`
  # (e.g. `n3:42,`, `i3:-42,`). Routing them through `netstring` would
  # wrongly insert the string length of the value (`n32:42,`).
  n = i: n: "n${toString i}:${toString n},";
  i = i: n: "i${toString i}:${toString n},";

  # Fixed-size shorthands: 8 bit (2^3), 64 bit (2^6) and 128 bit (2^7).
  n3 = n 3;
  n6 = n 6;
  n7 = n 7;

  i3 = i 3;
  i6 = i 6;
  i7 = i 7;

  # Text and binary share the netstring layout and differ only in the prefix.
  text = netstring "t" ",";
  binary = netstring "b" ",";

  # A tag is the length-prefixed tag name directly followed by the
  # (already encoded) value `val`.
  tag = key: val: (netstring "<" "|" key) + val;

  concatStrings = builtins.concatStringsSep "";

  # `lokv` is a list of `{ key, val }` attrsets; every `val` must already be
  # an encoded netencode value.
  record = lokv: netstring "{" "}"
    (concatStrings (map (kv: tag kv.key kv.val) lokv));

  # `l` is a list of already encoded netencode values.
  list = l: netstring "[" "]" (concatStrings l);

in {
  inherit
    unit
    n1
    n3
    n6
    n7
    i3
    i6
    i7
    text
    binary
    tag
    record
    list
    ;
}

View file

@ -0,0 +1,553 @@
extern crate nom;
use std::collections::HashMap;
use std::io::Write;
/// A netencode value in fully owned form.
///
/// Text, binary data, tags, records and lists all own their contents
/// (`String`, `Vec`, `HashMap`), so a `T` does not borrow from the input
/// it was parsed from.
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum T {
// Unit
Unit,
// Boolean
N1(bool),
// Naturals
N3(u8),
N6(u64),
N7(u128),
// Integers
I3(i8),
I6(i64),
I7(i128),
// Text
// TODO: make into &str
Text(String),
// Arbitrary bytes
Binary(Vec<u8>),
// Tags
// A single tagged value (sum type)
// TODO: make into &str
Sum(Tag<String, Box<T>>),
// Record: tag name -> value
// TODO: make into &str
Record(HashMap<String, Box<T>>),
// List of values
List(Box<Vec<T>>),
}
/// A netencode value that borrows from a byte buffer with lifetime `'a`.
///
/// Text, binary and list contents are kept as byte slices; in particular
/// `List` holds the still-encoded bytes of the list content and `Text` is
/// not checked to be UTF-8 by this type. This is the type accepted by
/// `encode`.
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum U<'a> {
// Unit
Unit,
// Boolean
N1(bool),
// Naturals
N3(u8),
N6(u64),
N7(u128),
// Integers
I3(i8),
I6(i64),
I7(i128),
// Text
// Text as raw bytes
Text(&'a [u8]),
// Arbitrary bytes
Binary(&'a [u8]),
// Tags
// A single tagged value (sum type)
Sum(Tag<&'a str, Box<U<'a>>>),
// Record fields in the order they were given
Record(Vec<(&'a str, Box<U<'a>>)>),
// Encoded list content, not decoded into elements
List(&'a [u8]),
}
/// A value of type `A` labelled with a tag name of type `S`.
#[derive(Debug, PartialEq, Eq, Clone)]
pub struct Tag<S, A> {
    /// The tag name.
    pub tag: S,
    /// The tagged value.
    pub val: A
}

impl<S, A> Tag<S, A> {
    /// Applies `f` to the tagged value and leaves the tag name untouched.
    fn map<F, B>(self, f: F) -> Tag<S, B>
    where
        F: Fn(A) -> B,
    {
        let Tag { tag, val } = self;
        Tag { tag, val: f(val) }
    }
}
/// Writes `tag` as a length-prefixed tag name (`<LEN:TAG|`), directly
/// followed by the encoding of `val`.
fn encode_tag<W: Write>(w: &mut W, tag: &str, val: U) -> std::io::Result<()> {
    write!(w, "<{}:{}|", tag.len(), tag)?;
    encode(w, val)
}

/// Writes `open`, the byte length of `content`, `:`, the content itself and
/// finally `close`.
///
/// `write_all` is used for the payload: a bare `write` may accept only part
/// of the buffer, which would silently truncate the encoded value.
fn write_sized<W: Write>(
    w: &mut W,
    open: char,
    content: &[u8],
    close: char,
) -> std::io::Result<()> {
    write!(w, "{}{}:", open, content.len())?;
    w.write_all(content)?;
    write!(w, "{}", close)
}

/// Encodes `u` as netencode and writes it to `w`.
///
/// # Errors
///
/// Returns the first I/O error reported by `w`; no write error is ignored.
pub fn encode<W: Write>(w: &mut W, u: U) -> std::io::Result<()> {
    match u {
        U::Unit => write!(w, "u,"),
        U::N1(b) => w.write_all(if b { b"n1:1," } else { b"n1:0," }),
        U::N3(n) => write!(w, "n3:{},", n),
        U::N6(n) => write!(w, "n6:{},", n),
        U::N7(n) => write!(w, "n7:{},", n),
        U::I3(i) => write!(w, "i3:{},", i),
        U::I6(i) => write!(w, "i6:{},", i),
        U::I7(i) => write!(w, "i7:{},", i),
        U::Text(s) => write_sized(w, 't', s, ','),
        U::Binary(s) => write_sized(w, 'b', s, ','),
        U::Sum(Tag { tag, val }) => encode_tag(w, tag, *val),
        U::Record(m) => {
            // The length prefix covers the whole encoded content, so the
            // fields have to be encoded into a buffer first.
            let mut fields: Vec<u8> = Vec::new();
            for (k, v) in m {
                encode_tag(&mut fields, k, *v)?;
            }
            write_sized(w, '{', &fields, '}')
        }
        // The list content is already encoded; only the frame is added.
        U::List(l) => write_sized(w, '[', l, ']'),
    }
}
pub fn text(s: String) -> T {
T::Text(s)
}
pub mod parse {
use super::{T, Tag, U};
use std::str::FromStr;
use std::ops::Neg;
use std::collections::HashMap;
use nom::{IResult};
use nom::bytes::complete::{tag, take};
use nom::branch::{alt};
use nom::character::complete::{digit1, char};
use nom::sequence::{tuple};
use nom::combinator::{map, map_res, flat_map, map_parser, opt};
use nom::error::{context, ErrorKind, ParseError};
/// Parses the unit value `u,`.
fn unit_t(s: &[u8]) -> IResult<&[u8], ()> {
    let (rest, _matched) = context("unit", tag("u,"))(s)?;
    Ok((rest, ()))
}
/// Parses a run of ASCII digits into a `usize` (used for length prefixes).
fn usize_t(s: &[u8]) -> IResult<&[u8], usize> {
    // digits -> &str -> usize; either conversion failing is a parse error
    let as_str = map_res(digit1, |n| std::str::from_utf8(n));
    let as_usize = map_res(as_str, |digits| digits.parse::<usize>());
    context("usize", as_usize)(s)
}
fn sized(begin: char, end: char) -> impl Fn(&[u8]) -> IResult<&[u8], &[u8]> {
move |s: &[u8]| {
let (s, (_, len, _)) = tuple((
char(begin),
usize_t,
char(':')
))(s)?;
let (s, (res, _)) = tuple((
take(len),
char(end)
))(s)?;
Ok((s, res))
}
}
/// Builds a parser for a natural number `<t>:<digits>,` where `t` is the
/// type prefix (for example `"n3"`); the digits are parsed as `I`.
fn uint_t<'a, I: FromStr + 'a>(t: &'static str) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], I> {
    move |input: &'a [u8]| {
        let (input, _) = tag(t.as_bytes())(input)?;
        let (input, _) = char(':')(input)?;
        // digits -> &str -> I; a value that does not fit `I` is a parse error
        let (input, value) = map_res(
            map_res(digit1, |n: &[u8]| std::str::from_utf8(n)),
            |digits| digits.parse::<I>()
        )(input)?;
        let (input, _) = char(',')(input)?;
        Ok((input, value))
    }
}
/// Builds a parser for booleans: `n1:0,` is `false`, `n1:1,` is `true`.
fn bool_t<'a>() -> impl Fn(&'a [u8]) -> IResult<&'a [u8], bool> {
    let parse_false = map(tag("n1:0,"), |_| false);
    let parse_true = map(tag("n1:1,"), |_| true);
    context("bool", alt((parse_false, parse_true)))
}
fn int_t<'a, I: FromStr + Neg<Output=I>>(t: &'static str) -> impl Fn(&'a [u8]) -> IResult<&[u8], I> {
context(
t,
move |s: &'a [u8]| {
let (s, (_, _, neg, int, _)) = tuple((
tag(t.as_bytes()),
char(':'),
opt(char('-')),
map_res(
map_res(digit1, |n: &[u8]| std::str::from_utf8(n)),
|s| s.parse::<I>()
),
char(',')
))(s)?;
let res = match neg {
Some(_) => -int,
None => int,
};
Ok((s, res))
}
)
}
/// Parses a tagged value into its owned form (owned tag name, boxed value).
fn tag_t(s: &[u8]) -> IResult<&[u8], Tag<String, Box<T>>> {
    // the tagged value recurses into the main parser
    let (rest, Tag { tag, val }) = tag_g(t_t)(s)?;
    Ok((rest, Tag {
        tag: tag.to_string(),
        val: Box::new(val)
    }))
}
/// Builds a parser for a tag `<LEN:NAME|` followed by a value that is parsed
/// with `inner`.
///
/// The tag name has to be valid UTF-8; if it is not, a `Failure` is returned
/// after the value has been parsed.
fn tag_g<'a, P, O>(inner: P) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], Tag<&'a str, O>>
where
    P: Fn(&'a [u8]) -> IResult<&'a [u8], O>
{
    move |s: &[u8]| {
        let (s, name) = sized('<', '|')(s)?;
        let (s, val) = inner(s)?;
        match std::str::from_utf8(name) {
            Ok(tag) => Ok((s, Tag { tag, val })),
            Err(_) => Err(nom::Err::Failure((s, ErrorKind::Char))),
        }
    }
}
/// parse text scalar (`t5:hello,`)
fn text(s: &[u8]) -> IResult<&[u8], T> {
    let (rest, bytes) = text_g()(s)?;
    // text has to be valid UTF-8; anything else is a hard failure
    match std::str::from_utf8(bytes) {
        Ok(valid) => Ok((rest, T::Text(valid.to_string()))),
        Err(_) => Err(nom::Err::Failure((rest, ErrorKind::Char))),
    }
}
/// Parser for the frame of a text scalar (`t<len>:<bytes>,`); returns the
/// content bytes without checking that they are UTF-8.
fn text_g() -> impl Fn(&[u8]) -> IResult<&[u8], &[u8]> {
sized('t', ',')
}
/// Builds a parser for a binary scalar (`b5:hello,`) that copies the content
/// into an owned `T::Binary`.
fn binary<'a>() -> impl Fn(&'a [u8]) -> IResult<&'a [u8], T> {
    map(binary_g(), |bytes| T::Binary(bytes.to_vec()))
}
/// Parser for the frame of a binary scalar (`b<len>:<bytes>,`); returns the
/// content bytes.
fn binary_g() -> impl Fn(&[u8]) -> IResult<&[u8], &[u8]> {
sized('b', ',')
}
/// Parses a list (`[<len>:<values>]`) and all of its elements into owned
/// values.
///
/// The element parser has to consume the complete list content. `many0`
/// alone stops at the first byte sequence that is not a valid value and
/// still succeeds, which would silently drop malformed trailing content
/// (for example `[3:xyz]` would parse as the empty list).
fn list_t(s: &[u8]) -> IResult<&[u8], Vec<T>> {
    map_parser(
        list_g(),
        nom::combinator::all_consuming(nom::multi::many0(t_t))
    )(s)
}
/// Parser for the frame of a list (`[<len>:<bytes>]`); returns the encoded
/// list content without decoding the elements.
fn list_g() -> impl Fn(&[u8]) -> IResult<&[u8], &[u8]> {
sized('[', ']')
}
/// Builds a parser that parses one value and throws the result away,
/// returning only the remaining input.
///
/// The alternatives are tried in order: text, unit, list frame, and finally
/// the full `t_t` parser. Per the TODOs below this is unfinished; no caller
/// is visible in this file.
fn skip() -> impl Fn(&[u8]) -> IResult<&[u8], ()> {
move |s: &[u8]| {
let (s, ()) = alt((
// TODO: only use the sized parsers here
map(text, |_| ()),
map(unit_t, |_| ()),
map(list_g(), |_| ()),
map(t_t, |_| ()),
// TODO: add rest of parsers
))(s)?;
Ok((s, ()))
}
}
/// Builds a parser that reads a whole list frame but decodes exactly the
/// first `n` elements (as `U`); any list content after them is ignored.
/// Fails if the list holds fewer than `n` parseable elements.
fn list_take<'a>(n: usize) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], Vec<U<'a>>> {
map_parser(list_g(), nom::multi::many_m_n(n, n, u_u))
}
/// Parses a record into a map from owned field name to value.
///
/// If a field name occurs more than once, the first occurrence wins.
fn record_t<'a>(s: &'a [u8]) -> IResult<&'a [u8], HashMap<String, Box<T>>> {
    let (rest, fields) = record_g(t_t)(s)?;
    let mut record = HashMap::new();
    // Insert back to front: an insert overwrites the previous entry, so the
    // earliest occurrence of a duplicated name is the one that remains.
    for (name, val) in fields.into_iter().rev() {
        record.insert(name.to_string(), val);
    }
    Ok((rest, record))
}
/// Builds a parser for a record (`{<len>:<tags>}`): one or more tags whose
/// values are parsed with `inner`. Fields are returned in input order,
/// duplicates included.
///
/// The tags have to make up the complete record content. `fold_many1` alone
/// stops at the first byte sequence that is not a valid tag and still
/// succeeds, which would silently drop malformed trailing content.
fn record_g<'a, P, O>(inner: P) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], Vec<(&'a str, Box<O>)>>
where
    O: Clone,
    P: Fn(&'a [u8]) -> IResult<&'a [u8], O>
{
    map_parser(
        sized('{', '}'),
        nom::combinator::all_consuming(nom::multi::fold_many1(
            tag_g(inner),
            Vec::new(),
            |mut acc: Vec<_>, Tag { tag, val }| {
                acc.push((tag, Box::new(val)));
                acc
            }
        ))
    )
}
/// Parses one netencode value into the borrowing representation `U`.
///
/// Tagged values and record fields are parsed recursively; the content of a
/// list is returned as its encoded bytes.
pub fn u_u(s: &[u8]) -> IResult<&[u8], U> {
    alt((
        // length-prefixed scalars and the unit
        map(text_g(), U::Text),
        map(binary_g(), U::Binary),
        map(unit_t, |()| U::Unit),
        // tags, lists and records
        map(tag_g(u_u), |sum| U::Sum(sum.map(Box::new))),
        map(list_g(), U::List),
        map(record_g(u_u), U::Record),
        // booleans, then 8, 64 and 128 bit numbers
        map(bool_t(), |b| U::N1(b)),
        map(uint_t("n3"), |nat| U::N3(nat)),
        map(uint_t("n6"), |nat| U::N6(nat)),
        map(uint_t("n7"), |nat| U::N7(nat)),
        map(int_t("i3"), |int| U::I3(int)),
        map(int_t("i6"), |int| U::I6(int)),
        map(int_t("i7"), |int| U::I7(int)),
        // less common sizes are widened to the next supported size
        map(uint_t("n2"), |nat| U::N3(nat)),
        map(uint_t("n4"), |nat| U::N6(nat)),
        map(uint_t("n5"), |nat| U::N6(nat)),
        map(int_t("i1"), |int| U::I3(int)),
        map(int_t("i2"), |int| U::I3(int)),
        map(int_t("i4"), |int| U::I6(int)),
        map(int_t("i5"), |int| U::I6(int)),
        // TODO: 8, 9 not supported
    ))(s)
}
fn t_t(s: &[u8]) -> IResult<&[u8], T> {
alt((
text,
binary(),
map(unit_t, |_| T::Unit),
map(tag_t, |t| T::Sum(t)),
map(list_t, |l| T::List(Box::new(l))),
map(record_t, |p| T::Record(p)),
map(bool_t(), |u| T::N1(u)),
// 8, 64 and 128 bit
map(uint_t("n3"), |u| T::N3(u)),
map(uint_t("n6"), |u| T::N6(u)),
map(uint_t("n7"), |u| T::N7(u)),
map(int_t("i3"), |u| T::I3(u)),
map(int_t("i6"), |u| T::I6(u)),
map(int_t("i7"), |u| T::I7(u)),
// less common
map(uint_t("n2"), |u| T::N3(u)),
map(uint_t("n4"), |u| T::N6(u)),
map(uint_t("n5"), |u| T::N6(u)),
map(int_t("i1"), |u| T::I3(u)),
map(int_t("i2"), |u| T::I3(u)),
map(int_t("i4"), |u| T::I6(u)),
map(int_t("i5"), |u| T::I6(u)),
// TODO: 8, 9 not supported
))(s)
}
// Unit tests for the parsers of this module. Each parser is fed an encoded
// byte string and the result (remaining input, parsed value) is compared.
#[cfg(test)]
mod tests {
use super::*;
// The unit value is exactly `u,`.
#[test]
fn test_parse_unit_t() {
assert_eq!(
unit_t("u,".as_bytes()),
Ok(("".as_bytes(), ()))
);
}
// Booleans are the two 1-bit naturals.
#[test]
fn test_parse_bool_t() {
assert_eq!(
bool_t()("n1:0,".as_bytes()),
Ok(("".as_bytes(), false))
);
assert_eq!(
bool_t()("n1:1,".as_bytes()),
Ok(("".as_bytes(), true))
);
}
// The length parser stops at the first non-digit.
#[test]
fn test_parse_usize_t() {
assert_eq!(
usize_t("32foo".as_bytes()),
Ok(("foo".as_bytes(), 32))
);
}
// Naturals and integers, including a value that is too big for its type,
// a negative number and a number with a leading zero.
#[test]
fn test_parse_int_t() {
assert_eq!(
uint_t::<u8>("n3")("n3:42,abc".as_bytes()),
Ok(("abc".as_bytes(), 42))
);
// 1024 does not fit into a u8
assert_eq!(
uint_t::<u8>("n3")("n3:1024,abc".as_bytes()),
Err(nom::Err::Error(("1024,abc".as_bytes(), nom::error::ErrorKind::MapRes)))
);
assert_eq!(
int_t::<i64>("i6")("i6:-23,abc".as_bytes()),
Ok(("abc".as_bytes(), -23))
);
assert_eq!(
int_t::<i128>("i3")("i3:0,:abc".as_bytes()),
Ok((":abc".as_bytes(), 0))
);
// a leading zero is accepted by the parser
assert_eq!(
uint_t::<u8>("n7")("n7:09,".as_bytes()),
Ok(("".as_bytes(), 9))
);
// assert_eq!(
// length("c"),
// Err(nom::Err::Error(("c", nom::error::ErrorKind::Digit)))
// );
// assert_eq!(
// length(":"),
// Err(nom::Err::Error((":", nom::error::ErrorKind::Digit)))
// );
}
// Text: plain ASCII, a length that exceeds the input, and multi-byte UTF-8
// (the length counts bytes).
#[test]
fn test_parse_text() {
assert_eq!(
text("t5:hello,".as_bytes()),
Ok(("".as_bytes(), T::Text("hello".to_owned())))
);
assert_eq!(
text("t4:fo,".as_bytes()),
// TODO: way better parse error messages
Err(nom::Err::Error(("fo,".as_bytes(), nom::error::ErrorKind::Eof)))
);
assert_eq!(
text("t9:今日は,".as_bytes()),
Ok(("".as_bytes(), T::Text("今日は".to_owned())))
);
}
// Binary: same cases as for text, the content is returned as bytes.
#[test]
fn test_parse_binary() {
assert_eq!(
binary()("b5:hello,".as_bytes()),
Ok(("".as_bytes(), T::Binary(Vec::from("hello".to_owned()))))
);
assert_eq!(
binary()("b4:fo,".as_bytes()),
// TODO: way better parse error messages
Err(nom::Err::Error(("fo,".as_bytes(), nom::error::ErrorKind::Eof)))
);
assert_eq!(
binary()("b9:今日は,".as_bytes()),
Ok(("".as_bytes(), T::Binary(Vec::from("今日は".as_bytes()))))
);
}
// Lists: empty, flat, partially decoded via `list_take`, and nested.
#[test]
fn test_list() {
assert_eq!(
list_t("[0:]".as_bytes()),
Ok(("".as_bytes(), vec![]))
);
assert_eq!(
list_t("[6:u,u,u,]".as_bytes()),
Ok(("".as_bytes(), vec![
T::Unit,
T::Unit,
T::Unit,
]))
);
// only the first two of the three elements are decoded
assert_eq!(
list_take(2)("[6:u,u,u,]".as_bytes()),
Ok(("".as_bytes(), vec![
U::Unit,
U::Unit,
]))
);
assert_eq!(
list_t("[15:u,[7:t3:foo,]u,]".as_bytes()),
Ok(("".as_bytes(), vec![
T::Unit,
T::List(Box::new(vec![T::Text("foo".to_owned())])),
T::Unit,
]))
);
}
// Records: distinct fields, and a repeated field name.
#[test]
fn test_record() {
assert_eq!(
record_t("{21:<1:a|u,<1:b|u,<1:c|u,}".as_bytes()),
Ok(("".as_bytes(), vec![
("a".to_owned(), Box::new(T::Unit)),
("b".to_owned(), Box::new(T::Unit)),
("c".to_owned(), Box::new(T::Unit)),
].into_iter().collect::<HashMap<String, Box<T>>>()))
);
// duplicated keys are ignored (first is taken)
assert_eq!(
record_t("{25:<1:a|u,<1:b|u,<1:a|i1:-1,}".as_bytes()),
Ok(("".as_bytes(), vec![
("a".to_owned(), Box::new(T::Unit)),
("b".to_owned(), Box::new(T::Unit)),
].into_iter().collect::<HashMap<_,_>>()))
);
}
// The main parser: a scalar, a text, a tagged value and a nested structure.
#[test]
fn test_parse() {
assert_eq!(
t_t("n3:255,".as_bytes()),
Ok(("".as_bytes(), T::N3(255)))
);
assert_eq!(
t_t("t6:halloo,".as_bytes()),
Ok(("".as_bytes(), T::Text("halloo".to_owned())))
);
assert_eq!(
t_t("<3:foo|t6:halloo,".as_bytes()),
Ok(("".as_bytes(), T::Sum (Tag {
tag: "foo".to_owned(),
val: Box::new(T::Text("halloo".to_owned()))
})))
);
// { a: Unit
// , foo: List <A: Unit | B: List i3> }
assert_eq!(
t_t("{52:<1:a|u,<3:foo|[33:<1:A|u,<1:A|n1:1,<1:B|[7:i3:127,]]}".as_bytes()),
Ok(("".as_bytes(), T::Record(vec![
("a".to_owned(), Box::new(T::Unit)),
("foo".to_owned(), Box::new(T::List(Box::new(vec![
T::Sum(Tag { tag: "A".to_owned(), val: Box::new(T::Unit) }),
T::Sum(Tag { tag: "A".to_owned(), val: Box::new(T::N1(true)) }),
T::Sum(Tag { tag: "B".to_owned(), val: Box::new(T::List(Box::new(vec![T::I3(127)]))) }),
]))))
].into_iter().collect::<HashMap<String, Box<T>>>())))
);
}
}
}

View file

@ -0,0 +1,111 @@
# netencode 0.1-unreleased
[bencode][] and [netstring][]-inspired pipe format that should be trivial to generate correctly in every context (only requires a `byte_length()` and a `printf()`), easy to parse (100 lines of code or less), mostly human-decipherable for easy debugging, and support nested record and sum types.
## scalars
Scalars have the format `[type prefix][size]:[value],`, where size is a natural number without leading zeroes.
### unit
The unit (`u`) has only one value.
* The unit is: `u,`
### numbers
Naturals (`n`) and Integers (`i`), with a maximum size in bits.
Bit sizes are specified in 2^n increments, 1 to 9 (`n1`..`n9`, `i1`..`i9`).
* Natural `1234` that fits in 32 bits (2^5): `n5:1234,`
* Integer `-42` that fits in 8 bits (2^3): `i3:-42,`
* Integer `23` that fits in 64 bits (2^6): `i6:23,`
* Integer `-1` that fits in 512 bits (2^9): `i9:-1,`
* Natural `0` that fits in 1 bit (2^1): `n1:0,`
An implementation can define the biggest numbers it supports, and has to throw an error for anything bigger. It has to support everything smaller, so for example if you support up to i6/n6, you have to support 16 as well. An implementation could support up to the current architecture’s word size, for example.
Floats are not supported, you can implement fixed-size decimals or ratios using integers.
### booleans
A boolean is represented as `n1`.
* `n1:0,`: false
* `n1:1,`: true
TODO: should we add `f,` and `t,`?
### text
Text (`t`) that *must* be encoded as UTF-8, starting with its length in bytes:
* The string `hello world` (11 bytes): `t11:hello world,`
* The string `今日は` (9 bytes): `t9:今日は,`
* The string `:,` (2 bytes): `t2::,,`
* The empty string `` (0 bytes): `t0:,`
### binary
Arbitrary binary strings (`b`) that can contain any data, starting with its length in bytes.
* The ASCII string `hello world` as binary data (11 bytes): `b11:hello world,`
* The empty binary string (0 bytes): `b0:,`
* The bytestring with `^D` (1 byte): `b1:,`
Since the binary strings are length-prefixed, they can contain `\0` and no escaping is required. Care has to be taken in languages with `\0`-terminated bytestrings.
Use text (`t`) if you have utf-8 encoded data.
## tagged values
### tags
A tag (`<`) gives a value a name. The tag is UTF-8 encoded, starting with its length in bytes and proceeding with the value.
* The tag `foo` (3 bytes) tagging the text `hello` (5 bytes): `<3:foo|t5:hello,`
* The tag `` (0 bytes) tagging the 8-bit integer 0: `<0:|i3:0,`
### records (products/records), also maps
A record (`{`) is a concatenation of tags (`<`). It needs to be closed with `}`.
If tag names repeat, the later ones should be ignored. Ordering does not matter.
Similar to text, records start with the length of their *whole encoded content*, in bytes. This makes it possible to treat their contents as opaque bytestrings.
* There is no empty record. (TODO: make the empty record the unit type, remove `u,`?)
* A record with one empty field, `foo`: `{9:<3:foo|u,}`
* A record with two fields, `foo` and `x`: `{21:<3:foo|u,<1:x|t3:baz,}`
* The same record: `{21:<1:x|t3:baz,<3:foo|u,}`
* The same record (later occurrences of fields are ignored): `{28:<1:x|t3:baz,<3:foo|u,<1:x|u,}`
### sums (tagged unions)
Simply a tagged value. The tag marker `<` indicates it is a sum if it appears outside of a record.
## lists
A list (`[`) imposes an ordering on a sequence of values. It needs to be closed with `]`. Values in it are simply concatenated.
Similar to records, lists start with the length of their whole encoded content.
* The empty list: `[0:]`
* The list with one element, the string `foo`: `[7:t3:foo,]`
* The list with text `foo` followed by i3 `-42`: `[14:t3:foo,i3:-42,]`
* The list with `Some` and `None` tags: `[35:<4:Some|t3:foo,<4:None|u,<4:None|u,]`
## motivation
TODO
## guarantees
TODO: do I want unique representation (bijection like bencode?) This would put more restrictions on the generator, like sorting records in lexicographic order, but would make it possible to compare without decoding
[bencode]: https://en.wikipedia.org/wiki/Bencode
[netstring]: https://en.wikipedia.org/wiki/Netstring