From 59a9955d753d8f9deb705d36922f6e8d77307f1d Mon Sep 17 00:00:00 2001 From: Profpatsch Date: Tue, 30 Mar 2021 04:49:43 +0200 Subject: [PATCH] feat(users/Profpatsch/netencode): fully streaming parser In order to arbitrarily split netencode over multiple reads, we need to make the parser completely streaming, so that it recognizes all cases where it needs more input. Luckily, this is fairly trivial, after working around a bunch of overeager parsing. The tricky part was the giant `alt`, where inner parsers would start consuming input and thus become incomplete when they fail afterwards. Since the format *always* starts the different types with one discriminator char, we can use that to instantly return from the parser and try the next one instead. The other tricky part was that lists and records would parse all inner elements and then choke on the empty string after the last element, because the inner parser would consume at least the discriminator, and an empty string is always `Incomplete`. We wrap these into a small combinator which plays nice with `many0` in that regard. 
Change-Id: Ib8d15d9a7cab19d432c6b24a35fcad6a5a72b246 Reviewed-on: https://cl.tvl.fyi/c/depot/+/2704 Tested-by: BuildkiteCI Reviewed-by: Profpatsch Reviewed-by: sterni --- users/Profpatsch/netencode/netencode.rs | 134 +++++++++++++++++------- 1 file changed, 95 insertions(+), 39 deletions(-) diff --git a/users/Profpatsch/netencode/netencode.rs b/users/Profpatsch/netencode/netencode.rs index 280032609..249cc33ed 100644 --- a/users/Profpatsch/netencode/netencode.rs +++ b/users/Profpatsch/netencode/netencode.rs @@ -210,9 +210,9 @@ pub mod parse { use std::collections::HashMap; use nom::{IResult}; - use nom::bytes::complete::{tag, take}; use nom::branch::{alt}; - use nom::character::complete::{digit1, char}; + use nom::bytes::streaming::{tag, take}; + use nom::character::streaming::{digit1, char}; use nom::sequence::{tuple}; use nom::combinator::{map, map_res, flat_map, map_parser, opt}; use nom::error::{context, ErrorKind, ParseError}; @@ -233,8 +233,10 @@ pub mod parse { fn sized(begin: char, end: char) -> impl Fn(&[u8]) -> IResult<&[u8], &[u8]> { move |s: &[u8]| { - let (s, (_, len, _)) = tuple(( - char(begin), + // This is the point where we check the descriminator; + // if the beginning char does not match, we can immediately return. + let (s, _) = char(begin)(s)?; + let (s, (len, _)) = tuple(( usize_t, char(':') ))(s)?; @@ -344,14 +346,33 @@ pub mod parse { list_g(t_t)(s) } + /// Wrap the inner parser of an `many0`/`fold_many0`, so that the parser + /// is not called when the `s` is empty already, preventing it from + /// returning `Incomplete` on streaming parsing. 
+ fn inner_no_empty_string<'a, P, O>(inner: P) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], O> + where + O: Clone, + P: Fn(&'a [u8]) -> IResult<&'a [u8], O>, + { + move |s: &'a [u8]| { + if s.is_empty() { + // This is a bit hacky, `many0` considers the inside done + // when a parser returns `Err::Error`, ignoring the actual error content + Err(nom::Err::Error((s, nom::error::ErrorKind::Many0))) + } else { + inner(s) + } + } + } + fn list_g<'a, P, O>(inner: P) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], Vec> where O: Clone, - P: Fn(&'a [u8]) -> IResult<&'a [u8], O> + P: Fn(&'a [u8]) -> IResult<&'a [u8], O>, { map_parser( sized('[', ']'), - nom::multi::many0(inner) + nom::multi::many0(inner_no_empty_string(inner)) ) } @@ -368,21 +389,29 @@ pub mod parse { O: Clone, P: Fn(&'a [u8]) -> IResult<&'a [u8], O> { - map_parser( - sized('{', '}'), - nom::multi::fold_many1( - tag_g(inner), - HashMap::new(), - |mut acc: HashMap<_,_>, Tag { tag, mut val }| { - // ignore duplicated tag names that appear later - // according to netencode spec - if ! acc.contains_key(tag) { - acc.insert(tag, *val); + move |s: &'a [u8]| { + let (s, map) = map_parser( + sized('{', '}'), + nom::multi::fold_many0( + inner_no_empty_string(tag_g(&inner)), + HashMap::new(), + |mut acc: HashMap<_,_>, Tag { tag, mut val }| { + // ignore duplicated tag names that appear later + // according to netencode spec + if ! 
acc.contains_key(tag) { + acc.insert(tag, *val); + } + acc } - acc - } - ) - ) + ) + )(s)?; + if map.is_empty() { + // records must not be empty, according to the spec + Err(nom::Err::Failure((s,nom::error::ErrorKind::Many1))) + } else { + Ok((s, map)) + } + } } pub fn u_u(s: &[u8]) -> IResult<&[u8], U> { @@ -512,16 +541,19 @@ pub mod parse { fn test_parse_text() { assert_eq!( text("t5:hello,".as_bytes()), - Ok(("".as_bytes(), T::Text("hello".to_owned()))) + Ok(("".as_bytes(), T::Text("hello".to_owned()))), + "{}", r"t5:hello," ); assert_eq!( - text("t4:fo,".as_bytes()), - // TODO: way better parse error messages - Err(nom::Err::Error(("fo,".as_bytes(), nom::error::ErrorKind::Eof))) + text("t4:fo".as_bytes()), + // The content of the text should be 4 long + Err(nom::Err::Incomplete(nom::Needed::Size(4))), + "{}", r"t4:fo," ); assert_eq!( text("t9:今日は,".as_bytes()), - Ok(("".as_bytes(), T::Text("今日は".to_owned()))) + Ok(("".as_bytes(), T::Text("今日は".to_owned()))), + "{}", r"t9:今日は," ); } @@ -529,16 +561,25 @@ pub mod parse { fn test_parse_binary() { assert_eq!( binary()("b5:hello,".as_bytes()), - Ok(("".as_bytes(), T::Binary(Vec::from("hello".to_owned())))) + Ok(("".as_bytes(), T::Binary(Vec::from("hello".to_owned())))), + "{}", r"b5:hello," ); assert_eq!( - binary()("b4:fo,".as_bytes()), - // TODO: way better parse error messages - Err(nom::Err::Error(("fo,".as_bytes(), nom::error::ErrorKind::Eof))) + binary()("b4:fo".as_bytes()), + // The content of the byte should be 4 long + Err(nom::Err::Incomplete(nom::Needed::Size(4))), + "{}", r"b4:fo," ); + assert_eq!( + binary()("b4:foob".as_bytes()), + // The content is 4 bytes now, but the finishing , is missing + Err(nom::Err::Incomplete(nom::Needed::Size(1))), + "{}", r"b4:fo," + ); assert_eq!( binary()("b9:今日は,".as_bytes()), - Ok(("".as_bytes(), T::Binary(Vec::from("今日は".as_bytes())))) + Ok(("".as_bytes(), T::Binary(Vec::from("今日は".as_bytes())))), + "{}", r"b9:今日は," ); } @@ -546,7 +587,8 @@ pub mod parse { fn 
test_list() { assert_eq!( list_t("[0:]".as_bytes()), - Ok(("".as_bytes(), vec![])) + Ok(("".as_bytes(), vec![])), + "{}", r"[0:]" ); assert_eq!( list_t("[6:u,u,u,]".as_bytes()), @@ -554,7 +596,8 @@ pub mod parse { T::Unit, T::Unit, T::Unit, - ])) + ])), + "{}", r"[6:u,u,u,]" ); assert_eq!( list_t("[15:u,[7:t3:foo,]u,]".as_bytes()), @@ -562,7 +605,8 @@ pub mod parse { T::Unit, T::List(vec![T::Text("foo".to_owned())]), T::Unit, - ])) + ])), + "{}", r"[15:u,[7:t3:foo,]u,]" ); } @@ -574,7 +618,8 @@ pub mod parse { ("a".to_owned(), T::Unit), ("b".to_owned(), T::Unit), ("c".to_owned(), T::Unit), - ].into_iter().collect::>())) + ].into_iter().collect::>())), + "{}", r"{21:<1:a|u,<1:b|u,<1:c|u,}" ); // duplicated keys are ignored (first is taken) assert_eq!( @@ -582,7 +627,14 @@ pub mod parse { Ok(("".as_bytes(), vec![ ("a".to_owned(), T::Unit), ("b".to_owned(), T::Unit), - ].into_iter().collect::>())) + ].into_iter().collect::>())), + "{}", r"{25:<1:a|u,<1:b|u,<1:a|i1:-1,}" + ); + // empty records are not allowed + assert_eq!( + record_t("{0:}".as_bytes()), + Err(nom::Err::Failure(("".as_bytes(), nom::error::ErrorKind::Many1))), + "{}", r"{0:}" ); } @@ -590,18 +642,21 @@ pub mod parse { fn test_parse() { assert_eq!( t_t("n3:255,".as_bytes()), - Ok(("".as_bytes(), T::N3(255))) + Ok(("".as_bytes(), T::N3(255))), + "{}", r"n3:255," ); assert_eq!( t_t("t6:halloo,".as_bytes()), - Ok(("".as_bytes(), T::Text("halloo".to_owned()))) + Ok(("".as_bytes(), T::Text("halloo".to_owned()))), + "{}", r"t6:halloo," ); assert_eq!( t_t("<3:foo|t6:halloo,".as_bytes()), Ok(("".as_bytes(), T::Sum (Tag { tag: "foo".to_owned(), val: Box::new(T::Text("halloo".to_owned())) - }))) + }))), + "{}", r"<3:foo|t6:halloo," ); // { a: Unit // , foo: List } @@ -614,7 +669,8 @@ pub mod parse { T::Sum(Tag { tag: "A".to_owned(), val: Box::new(T::N1(true)) }), T::Sum(Tag { tag: "B".to_owned(), val: Box::new(T::List(vec![T::I3(127)])) }), ])) - ].into_iter().collect::>()))) + ].into_iter().collect::>()))), + 
"{}", r"{52:<1:a|u,<3:foo|[33:<1:A|u,<1:A|n1:1,<1:B|[7:i3:127,]]}" ); }