feat(users/Profpatsch/netencode): fully streaming parser

In order to arbitrarily split netencode over multiple reads, we need
to make the parser completely streaming, so that it recognizes all
cases where it needs more input.

Luckily, this is fairly trivial, after working around a bunch of
overeager parsing.

The tricky part was the giant `alt`, where inner parsers would start
consuming input and thus become incomplete when they fail afterwards.
Since the format *always* starts the different types with one
discriminator char, we can use that to instantly return from the parser and
try the next one instead.

The other tricky part was that lists and records would parse all inner
elements and then choke on the empty string after the last element,
because the inner parser would consume at least the discriminator, and
an empty string is always `Incomplete`. We wrap these into a small
combinator which plays nice with `many0` in that regard.

Change-Id: Ib8d15d9a7cab19d432c6b24a35fcad6a5a72b246
Reviewed-on: https://cl.tvl.fyi/c/depot/+/2704
Tested-by: BuildkiteCI
Reviewed-by: Profpatsch <mail@profpatsch.de>
Reviewed-by: sterni <sternenseemann@systemli.org>
This commit is contained in:
Profpatsch 2021-03-30 04:49:43 +02:00
parent 53d8dd6a1e
commit 59a9955d75

View file

@ -210,9 +210,9 @@ pub mod parse {
use std::collections::HashMap; use std::collections::HashMap;
use nom::{IResult}; use nom::{IResult};
use nom::bytes::complete::{tag, take};
use nom::branch::{alt}; use nom::branch::{alt};
use nom::character::complete::{digit1, char}; use nom::bytes::streaming::{tag, take};
use nom::character::streaming::{digit1, char};
use nom::sequence::{tuple}; use nom::sequence::{tuple};
use nom::combinator::{map, map_res, flat_map, map_parser, opt}; use nom::combinator::{map, map_res, flat_map, map_parser, opt};
use nom::error::{context, ErrorKind, ParseError}; use nom::error::{context, ErrorKind, ParseError};
@ -233,8 +233,10 @@ pub mod parse {
fn sized(begin: char, end: char) -> impl Fn(&[u8]) -> IResult<&[u8], &[u8]> { fn sized(begin: char, end: char) -> impl Fn(&[u8]) -> IResult<&[u8], &[u8]> {
move |s: &[u8]| { move |s: &[u8]| {
let (s, (_, len, _)) = tuple(( // This is the point where we check the discriminator;
char(begin), // if the beginning char does not match, we can immediately return.
let (s, _) = char(begin)(s)?;
let (s, (len, _)) = tuple((
usize_t, usize_t,
char(':') char(':')
))(s)?; ))(s)?;
@ -344,14 +346,33 @@ pub mod parse {
list_g(t_t)(s) list_g(t_t)(s)
} }
/// Wrap the inner parser of a `many0`/`fold_many0`, so that the parser
/// is not called when the `s` is empty already, preventing it from
/// returning `Incomplete` on streaming parsing.
fn inner_no_empty_string<'a, P, O>(inner: P) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], O>
where
O: Clone,
P: Fn(&'a [u8]) -> IResult<&'a [u8], O>,
{
move |s: &'a [u8]| {
if s.is_empty() {
// This is a bit hacky, `many0` considers the inside done
// when a parser returns `Err::Error`, ignoring the actual error content
Err(nom::Err::Error((s, nom::error::ErrorKind::Many0)))
} else {
inner(s)
}
}
}
fn list_g<'a, P, O>(inner: P) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], Vec<O>> fn list_g<'a, P, O>(inner: P) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], Vec<O>>
where where
O: Clone, O: Clone,
P: Fn(&'a [u8]) -> IResult<&'a [u8], O> P: Fn(&'a [u8]) -> IResult<&'a [u8], O>,
{ {
map_parser( map_parser(
sized('[', ']'), sized('[', ']'),
nom::multi::many0(inner) nom::multi::many0(inner_no_empty_string(inner))
) )
} }
@ -368,21 +389,29 @@ pub mod parse {
O: Clone, O: Clone,
P: Fn(&'a [u8]) -> IResult<&'a [u8], O> P: Fn(&'a [u8]) -> IResult<&'a [u8], O>
{ {
map_parser( move |s: &'a [u8]| {
sized('{', '}'), let (s, map) = map_parser(
nom::multi::fold_many1( sized('{', '}'),
tag_g(inner), nom::multi::fold_many0(
HashMap::new(), inner_no_empty_string(tag_g(&inner)),
|mut acc: HashMap<_,_>, Tag { tag, mut val }| { HashMap::new(),
// ignore duplicated tag names that appear later |mut acc: HashMap<_,_>, Tag { tag, mut val }| {
// according to netencode spec // ignore duplicated tag names that appear later
if ! acc.contains_key(tag) { // according to netencode spec
acc.insert(tag, *val); if ! acc.contains_key(tag) {
acc.insert(tag, *val);
}
acc
} }
acc )
} )(s)?;
) if map.is_empty() {
) // records must not be empty, according to the spec
Err(nom::Err::Failure((s,nom::error::ErrorKind::Many1)))
} else {
Ok((s, map))
}
}
} }
pub fn u_u(s: &[u8]) -> IResult<&[u8], U> { pub fn u_u(s: &[u8]) -> IResult<&[u8], U> {
@ -512,16 +541,19 @@ pub mod parse {
fn test_parse_text() { fn test_parse_text() {
assert_eq!( assert_eq!(
text("t5:hello,".as_bytes()), text("t5:hello,".as_bytes()),
Ok(("".as_bytes(), T::Text("hello".to_owned()))) Ok(("".as_bytes(), T::Text("hello".to_owned()))),
"{}", r"t5:hello,"
); );
assert_eq!( assert_eq!(
text("t4:fo,".as_bytes()), text("t4:fo".as_bytes()),
// TODO: way better parse error messages // The content of the text should be 4 long
Err(nom::Err::Error(("fo,".as_bytes(), nom::error::ErrorKind::Eof))) Err(nom::Err::Incomplete(nom::Needed::Size(4))),
"{}", r"t4:fo,"
); );
assert_eq!( assert_eq!(
text("t9:今日は,".as_bytes()), text("t9:今日は,".as_bytes()),
Ok(("".as_bytes(), T::Text("今日は".to_owned()))) Ok(("".as_bytes(), T::Text("今日は".to_owned()))),
"{}", r"t9:今日は,"
); );
} }
@ -529,16 +561,25 @@ pub mod parse {
fn test_parse_binary() { fn test_parse_binary() {
assert_eq!( assert_eq!(
binary()("b5:hello,".as_bytes()), binary()("b5:hello,".as_bytes()),
Ok(("".as_bytes(), T::Binary(Vec::from("hello".to_owned())))) Ok(("".as_bytes(), T::Binary(Vec::from("hello".to_owned())))),
"{}", r"b5:hello,"
); );
assert_eq!( assert_eq!(
binary()("b4:fo,".as_bytes()), binary()("b4:fo".as_bytes()),
// TODO: way better parse error messages // The content of the byte should be 4 long
Err(nom::Err::Error(("fo,".as_bytes(), nom::error::ErrorKind::Eof))) Err(nom::Err::Incomplete(nom::Needed::Size(4))),
"{}", r"b4:fo,"
); );
assert_eq!(
binary()("b4:foob".as_bytes()),
// The content is 4 bytes now, but the finishing , is missing
Err(nom::Err::Incomplete(nom::Needed::Size(1))),
"{}", r"b4:fo,"
);
assert_eq!( assert_eq!(
binary()("b9:今日は,".as_bytes()), binary()("b9:今日は,".as_bytes()),
Ok(("".as_bytes(), T::Binary(Vec::from("今日は".as_bytes())))) Ok(("".as_bytes(), T::Binary(Vec::from("今日は".as_bytes())))),
"{}", r"b9:今日は,"
); );
} }
@ -546,7 +587,8 @@ pub mod parse {
fn test_list() { fn test_list() {
assert_eq!( assert_eq!(
list_t("[0:]".as_bytes()), list_t("[0:]".as_bytes()),
Ok(("".as_bytes(), vec![])) Ok(("".as_bytes(), vec![])),
"{}", r"[0:]"
); );
assert_eq!( assert_eq!(
list_t("[6:u,u,u,]".as_bytes()), list_t("[6:u,u,u,]".as_bytes()),
@ -554,7 +596,8 @@ pub mod parse {
T::Unit, T::Unit,
T::Unit, T::Unit,
T::Unit, T::Unit,
])) ])),
"{}", r"[6:u,u,u,]"
); );
assert_eq!( assert_eq!(
list_t("[15:u,[7:t3:foo,]u,]".as_bytes()), list_t("[15:u,[7:t3:foo,]u,]".as_bytes()),
@ -562,7 +605,8 @@ pub mod parse {
T::Unit, T::Unit,
T::List(vec![T::Text("foo".to_owned())]), T::List(vec![T::Text("foo".to_owned())]),
T::Unit, T::Unit,
])) ])),
"{}", r"[15:u,[7:t3:foo,]u,]"
); );
} }
@ -574,7 +618,8 @@ pub mod parse {
("a".to_owned(), T::Unit), ("a".to_owned(), T::Unit),
("b".to_owned(), T::Unit), ("b".to_owned(), T::Unit),
("c".to_owned(), T::Unit), ("c".to_owned(), T::Unit),
].into_iter().collect::<HashMap<String, T>>())) ].into_iter().collect::<HashMap<String, T>>())),
"{}", r"{21:<1:a|u,<1:b|u,<1:c|u,}"
); );
// duplicated keys are ignored (first is taken) // duplicated keys are ignored (first is taken)
assert_eq!( assert_eq!(
@ -582,7 +627,14 @@ pub mod parse {
Ok(("".as_bytes(), vec![ Ok(("".as_bytes(), vec![
("a".to_owned(), T::Unit), ("a".to_owned(), T::Unit),
("b".to_owned(), T::Unit), ("b".to_owned(), T::Unit),
].into_iter().collect::<HashMap<_,_>>())) ].into_iter().collect::<HashMap<_,_>>())),
"{}", r"{25:<1:a|u,<1:b|u,<1:a|i1:-1,}"
);
// empty records are not allowed
assert_eq!(
record_t("{0:}".as_bytes()),
Err(nom::Err::Failure(("".as_bytes(), nom::error::ErrorKind::Many1))),
"{}", r"{0:}"
); );
} }
@ -590,18 +642,21 @@ pub mod parse {
fn test_parse() { fn test_parse() {
assert_eq!( assert_eq!(
t_t("n3:255,".as_bytes()), t_t("n3:255,".as_bytes()),
Ok(("".as_bytes(), T::N3(255))) Ok(("".as_bytes(), T::N3(255))),
"{}", r"n3:255,"
); );
assert_eq!( assert_eq!(
t_t("t6:halloo,".as_bytes()), t_t("t6:halloo,".as_bytes()),
Ok(("".as_bytes(), T::Text("halloo".to_owned()))) Ok(("".as_bytes(), T::Text("halloo".to_owned()))),
"{}", r"t6:halloo,"
); );
assert_eq!( assert_eq!(
t_t("<3:foo|t6:halloo,".as_bytes()), t_t("<3:foo|t6:halloo,".as_bytes()),
Ok(("".as_bytes(), T::Sum (Tag { Ok(("".as_bytes(), T::Sum (Tag {
tag: "foo".to_owned(), tag: "foo".to_owned(),
val: Box::new(T::Text("halloo".to_owned())) val: Box::new(T::Text("halloo".to_owned()))
}))) }))),
"{}", r"<3:foo|t6:halloo,"
); );
// { a: Unit // { a: Unit
// , foo: List <A: Unit | B: List i3> } // , foo: List <A: Unit | B: List i3> }
@ -614,7 +669,8 @@ pub mod parse {
T::Sum(Tag { tag: "A".to_owned(), val: Box::new(T::N1(true)) }), T::Sum(Tag { tag: "A".to_owned(), val: Box::new(T::N1(true)) }),
T::Sum(Tag { tag: "B".to_owned(), val: Box::new(T::List(vec![T::I3(127)])) }), T::Sum(Tag { tag: "B".to_owned(), val: Box::new(T::List(vec![T::I3(127)])) }),
])) ]))
].into_iter().collect::<HashMap<String, T>>()))) ].into_iter().collect::<HashMap<String, T>>()))),
"{}", r"{52:<1:a|u,<3:foo|[33:<1:A|u,<1:A|n1:1,<1:B|[7:i3:127,]]}"
); );
} }