feat(users/Profpatsch/netencode): fully streaming parser
In order to arbitrarily split netencode over multiple reads, we need to make the parser completely streaming, so that it recognizes all cases where it needs more input. Luckily, this is fairly trivial, after working around a bunch of overeager parsing.

The tricky part was the giant `alt`, where inner parsers would start consuming input and thus become incomplete when they fail afterwards. Since the format *always* starts the different types with one discriminator char, we can use that to return from the parser instantly and try the next one instead.

The other tricky part was that lists and records would parse all inner elements and then choke on the empty string after the last element, because the inner parser would consume at least the discriminator, and an empty string is always `Incomplete`. We wrap these inner parsers in a small combinator which plays nice with `many0` in that regard.

Change-Id: Ib8d15d9a7cab19d432c6b24a35fcad6a5a72b246
Reviewed-on: https://cl.tvl.fyi/c/depot/+/2704
Tested-by: BuildkiteCI
Reviewed-by: Profpatsch <mail@profpatsch.de>
Reviewed-by: sterni <sternenseemann@systemli.org>
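To make the intent concrete, here is a minimal sketch (not part of this commit) of how a caller might drive a fully streaming parser: whenever nom answers `Incomplete`, the caller reads another chunk and retries. It assumes the crate exposes an owned value type `T` and a top-level parser `netencode::parse::t_t`, as the tests further down use them; the function name and buffer size are made up.

use std::io::Read;

use netencode::parse::t_t; // assumed to be public, as in the tests below
use netencode::T;

/// Read from `src` until one full netencode value could be parsed.
fn read_one_value<R: Read>(mut src: R) -> std::io::Result<Option<T>> {
    let mut buf: Vec<u8> = Vec::new();
    let mut chunk = [0u8; 4096];
    loop {
        // Try to parse what we have so far.
        let want_more = match t_t(&buf) {
            // A complete value; bytes left in `_rest` belong to the next value.
            Ok((_rest, val)) => return Ok(Some(val)),
            // The streaming parser tells us it needs more input before it can decide.
            Err(nom::Err::Incomplete(_)) => true,
            // A real parse error, not a lack of input.
            Err(_) => return Ok(None),
        };
        if want_more {
            let n = src.read(&mut chunk)?;
            if n == 0 {
                // EOF in the middle of a value.
                return Ok(None);
            }
            buf.extend_from_slice(&chunk[..n]);
        }
    }
}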
parent 53d8dd6a1e
commit 59a9955d75
1 changed file with 95 additions and 39 deletions
@@ -210,9 +210,9 @@ pub mod parse {
     use std::collections::HashMap;
 
     use nom::{IResult};
-    use nom::bytes::complete::{tag, take};
     use nom::branch::{alt};
-    use nom::character::complete::{digit1, char};
+    use nom::bytes::streaming::{tag, take};
+    use nom::character::streaming::{digit1, char};
     use nom::sequence::{tuple};
     use nom::combinator::{map, map_res, flat_map, map_parser, opt};
     use nom::error::{context, ErrorKind, ParseError};
@@ -233,8 +233,10 @@ pub mod parse {
 
     fn sized(begin: char, end: char) -> impl Fn(&[u8]) -> IResult<&[u8], &[u8]> {
         move |s: &[u8]| {
-            let (s, (_, len, _)) = tuple((
-                char(begin),
+            // This is the point where we check the discriminator;
+            // if the beginning char does not match, we can immediately return.
+            let (s, _) = char(begin)(s)?;
+            let (s, (len, _)) = tuple((
                 usize_t,
                 char(':')
             ))(s)?;
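The reason this small reordering matters under streaming parsing is how `alt` reacts to the two outcomes: a recoverable `Err::Error` makes it try the next alternative, while `Incomplete` is passed straight through to the caller as a request for more input. A minimal sketch of that behaviour (not from this commit; the parser names are made up):

use nom::branch::alt;
use nom::character::streaming::char;
use nom::IResult;

// Each netencode type starts with a single discriminator char,
// so checking it first is enough to decide whether a branch applies.
fn list_open(s: &[u8]) -> IResult<&[u8], char> {
    char('[')(s)
}

fn record_open(s: &[u8]) -> IResult<&[u8], char> {
    char('{')(s)
}

fn any_open(s: &[u8]) -> IResult<&[u8], char> {
    alt((list_open, record_open))(s)
}

// any_open(b"{...") -> Ok(..): `list_open` fails with a recoverable `Err::Error`
//                              on '{', so `alt` goes on to try `record_open`.
// any_open(b"")     -> Err(Incomplete(..)): not an error, just "feed me more bytes".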
@@ -344,14 +346,33 @@ pub mod parse {
         list_g(t_t)(s)
     }
 
+    /// Wrap the inner parser of a `many0`/`fold_many0`, so that the parser
+    /// is not called when the `s` is empty already, preventing it from
+    /// returning `Incomplete` on streaming parsing.
+    fn inner_no_empty_string<'a, P, O>(inner: P) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], O>
+        where
+        O: Clone,
+        P: Fn(&'a [u8]) -> IResult<&'a [u8], O>,
+    {
+        move |s: &'a [u8]| {
+            if s.is_empty() {
+                // This is a bit hacky, `many0` considers the inside done
+                // when a parser returns `Err::Error`, ignoring the actual error content
+                Err(nom::Err::Error((s, nom::error::ErrorKind::Many0)))
+            } else {
+                inner(s)
+            }
+        }
+    }
+
     fn list_g<'a, P, O>(inner: P) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], Vec<O>>
         where
         O: Clone,
-        P: Fn(&'a [u8]) -> IResult<&'a [u8], O>
+        P: Fn(&'a [u8]) -> IResult<&'a [u8], O>,
     {
         map_parser(
             sized('[', ']'),
-            nom::multi::many0(inner)
+            nom::multi::many0(inner_no_empty_string(inner))
         )
     }
 
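To see what the wrapper buys us: `map_parser` hands `many0` exactly the bytes between the brackets, so the last round calls the inner parser on an empty slice, which a streaming parser reports as `Incomplete`, and `many0` passes that straight through. The wrapped version turns the empty rest into a recoverable `Err::Error`, which is `many0`'s signal to stop. A small sketch (not from the diff; `unit` is a made-up inner parser):

use nom::character::streaming::char;
use nom::multi::many0;
use nom::IResult;

// A toy element parser: one 'u' followed by the ',' terminator.
fn unit(s: &[u8]) -> IResult<&[u8], ()> {
    let (s, _) = char('u')(s)?;
    let (s, _) = char(',')(s)?;
    Ok((s, ()))
}

// many0(unit)(b"u,u,")
//     parses two elements, then calls `unit` on the empty rest,
//     which answers `Incomplete`, so the whole list parse is `Incomplete`.
// many0(inner_no_empty_string(unit))(b"u,u,")
//     the wrapper answers `Err::Error` on the empty rest instead,
//     so `many0` stops and the result is Ok((b"", vec![(), ()])).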
@@ -368,21 +389,29 @@ pub mod parse {
         O: Clone,
         P: Fn(&'a [u8]) -> IResult<&'a [u8], O>
     {
-        map_parser(
-            sized('{', '}'),
-            nom::multi::fold_many1(
-                tag_g(inner),
-                HashMap::new(),
-                |mut acc: HashMap<_,_>, Tag { tag, mut val }| {
-                    // ignore duplicated tag names that appear later
-                    // according to netencode spec
-                    if ! acc.contains_key(tag) {
-                        acc.insert(tag, *val);
-                    }
-                    acc
-                }
-            )
-        )
+        move |s: &'a [u8]| {
+            let (s, map) = map_parser(
+                sized('{', '}'),
+                nom::multi::fold_many0(
+                    inner_no_empty_string(tag_g(&inner)),
+                    HashMap::new(),
+                    |mut acc: HashMap<_,_>, Tag { tag, mut val }| {
+                        // ignore duplicated tag names that appear later
+                        // according to netencode spec
+                        if ! acc.contains_key(tag) {
+                            acc.insert(tag, *val);
+                        }
+                        acc
+                    }
+                )
+            )(s)?;
+            if map.is_empty() {
+                // records must not be empty, according to the spec
+                Err(nom::Err::Failure((s,nom::error::ErrorKind::Many1)))
+            } else {
+                Ok((s, map))
+            }
+        }
     }
 
     pub fn u_u(s: &[u8]) -> IResult<&[u8], U> {
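A side note on the two error constructors visible in this hunk, as an illustration only (not part of the commit; it uses the same nom 5 style tuple errors as the diff): `Err::Error` is the recoverable kind that lets an outer `alt` or `many0` move on, while `Err::Failure` aborts the whole parse, which is why the "records must not be empty" rule uses it.

use nom::branch::alt;
use nom::IResult;

// A parser that fails recoverably: an outer `alt` will try the next branch.
fn soft(s: &[u8]) -> IResult<&[u8], &'static str> {
    Err(nom::Err::Error((s, nom::error::ErrorKind::Tag)))
}

// A parser that fails hard: `alt` gives up immediately.
fn hard(s: &[u8]) -> IResult<&[u8], &'static str> {
    Err(nom::Err::Failure((s, nom::error::ErrorKind::Many1)))
}

fn fallback(s: &[u8]) -> IResult<&[u8], &'static str> {
    Ok((s, "fallback"))
}

// alt((soft, fallback))(b"x") -> Ok((b"x", "fallback"))
// alt((hard, fallback))(b"x") -> Err(Failure(..)), `fallback` is never tried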
@@ -512,16 +541,19 @@ pub mod parse {
     fn test_parse_text() {
         assert_eq!(
             text("t5:hello,".as_bytes()),
-            Ok(("".as_bytes(), T::Text("hello".to_owned())))
+            Ok(("".as_bytes(), T::Text("hello".to_owned()))),
+            "{}", r"t5:hello,"
         );
         assert_eq!(
-            text("t4:fo,".as_bytes()),
-            // TODO: way better parse error messages
-            Err(nom::Err::Error(("fo,".as_bytes(), nom::error::ErrorKind::Eof)))
+            text("t4:fo".as_bytes()),
+            // The content of the text should be 4 long
+            Err(nom::Err::Incomplete(nom::Needed::Size(4))),
+            "{}", r"t4:fo,"
         );
         assert_eq!(
             text("t9:今日は,".as_bytes()),
-            Ok(("".as_bytes(), T::Text("今日は".to_owned())))
+            Ok(("".as_bytes(), T::Text("今日は".to_owned()))),
+            "{}", r"t9:今日は,"
         );
     }
 
@@ -529,16 +561,25 @@ pub mod parse {
     fn test_parse_binary() {
         assert_eq!(
             binary()("b5:hello,".as_bytes()),
-            Ok(("".as_bytes(), T::Binary(Vec::from("hello".to_owned()))))
+            Ok(("".as_bytes(), T::Binary(Vec::from("hello".to_owned())))),
+            "{}", r"b5:hello,"
         );
         assert_eq!(
-            binary()("b4:fo,".as_bytes()),
-            // TODO: way better parse error messages
-            Err(nom::Err::Error(("fo,".as_bytes(), nom::error::ErrorKind::Eof)))
+            binary()("b4:fo".as_bytes()),
+            // The content of the byte should be 4 long
+            Err(nom::Err::Incomplete(nom::Needed::Size(4))),
+            "{}", r"b4:fo,"
+        );
+        assert_eq!(
+            binary()("b4:foob".as_bytes()),
+            // The content is 4 bytes now, but the finishing , is missing
+            Err(nom::Err::Incomplete(nom::Needed::Size(1))),
+            "{}", r"b4:fo,"
         );
         assert_eq!(
             binary()("b9:今日は,".as_bytes()),
-            Ok(("".as_bytes(), T::Binary(Vec::from("今日は".as_bytes()))))
+            Ok(("".as_bytes(), T::Binary(Vec::from("今日は".as_bytes())))),
+            "{}", r"b9:今日は,"
         );
     }
 
@@ -546,7 +587,8 @@ pub mod parse {
     fn test_list() {
         assert_eq!(
             list_t("[0:]".as_bytes()),
-            Ok(("".as_bytes(), vec![]))
+            Ok(("".as_bytes(), vec![])),
+            "{}", r"[0:]"
         );
         assert_eq!(
             list_t("[6:u,u,u,]".as_bytes()),
|
@ -554,7 +596,8 @@ pub mod parse {
|
||||||
T::Unit,
|
T::Unit,
|
||||||
T::Unit,
|
T::Unit,
|
||||||
T::Unit,
|
T::Unit,
|
||||||
]))
|
])),
|
||||||
|
"{}", r"[6:u,u,u,]"
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
list_t("[15:u,[7:t3:foo,]u,]".as_bytes()),
|
list_t("[15:u,[7:t3:foo,]u,]".as_bytes()),
|
||||||
|
@ -562,7 +605,8 @@ pub mod parse {
|
||||||
T::Unit,
|
T::Unit,
|
||||||
T::List(vec![T::Text("foo".to_owned())]),
|
T::List(vec![T::Text("foo".to_owned())]),
|
||||||
T::Unit,
|
T::Unit,
|
||||||
]))
|
])),
|
||||||
|
"{}", r"[15:u,[7:t3:foo,]u,]"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -574,7 +618,8 @@ pub mod parse {
                 ("a".to_owned(), T::Unit),
                 ("b".to_owned(), T::Unit),
                 ("c".to_owned(), T::Unit),
-            ].into_iter().collect::<HashMap<String, T>>()))
+            ].into_iter().collect::<HashMap<String, T>>())),
+            "{}", r"{21:<1:a|u,<1:b|u,<1:c|u,}"
         );
         // duplicated keys are ignored (first is taken)
         assert_eq!(
@@ -582,7 +627,14 @@ pub mod parse {
             Ok(("".as_bytes(), vec![
                 ("a".to_owned(), T::Unit),
                 ("b".to_owned(), T::Unit),
-            ].into_iter().collect::<HashMap<_,_>>()))
+            ].into_iter().collect::<HashMap<_,_>>())),
+            "{}", r"{25:<1:a|u,<1:b|u,<1:a|i1:-1,}"
+        );
+        // empty records are not allowed
+        assert_eq!(
+            record_t("{0:}".as_bytes()),
+            Err(nom::Err::Failure(("".as_bytes(), nom::error::ErrorKind::Many1))),
+            "{}", r"{0:}"
         );
     }
 
@@ -590,18 +642,21 @@ pub mod parse {
     fn test_parse() {
         assert_eq!(
             t_t("n3:255,".as_bytes()),
-            Ok(("".as_bytes(), T::N3(255)))
+            Ok(("".as_bytes(), T::N3(255))),
+            "{}", r"n3:255,"
         );
         assert_eq!(
             t_t("t6:halloo,".as_bytes()),
-            Ok(("".as_bytes(), T::Text("halloo".to_owned())))
+            Ok(("".as_bytes(), T::Text("halloo".to_owned()))),
+            "{}", r"t6:halloo,"
        );
         assert_eq!(
             t_t("<3:foo|t6:halloo,".as_bytes()),
             Ok(("".as_bytes(), T::Sum (Tag {
                 tag: "foo".to_owned(),
                 val: Box::new(T::Text("halloo".to_owned()))
-            })))
+            }))),
+            "{}", r"<3:foo|t6:halloo,"
         );
         // { a: Unit
         // , foo: List <A: Unit | B: List i3> }
@@ -614,7 +669,8 @@ pub mod parse {
                 T::Sum(Tag { tag: "A".to_owned(), val: Box::new(T::N1(true)) }),
                 T::Sum(Tag { tag: "B".to_owned(), val: Box::new(T::List(vec![T::I3(127)])) }),
             ]))
-            ].into_iter().collect::<HashMap<String, T>>()))
+            ].into_iter().collect::<HashMap<String, T>>()))),
+            "{}", r"{52:<1:a|u,<3:foo|[33:<1:A|u,<1:A|n1:1,<1:B|[7:i3:127,]]}"
         );
     }
 