From 59a9955d753d8f9deb705d36922f6e8d77307f1d Mon Sep 17 00:00:00 2001
From: Profpatsch <mail@profpatsch.de>
Date: Tue, 30 Mar 2021 04:49:43 +0200
Subject: [PATCH] feat(users/Profpatsch/netencode): fully streaming parser

In order to arbitrarily split netencode over multiple reads, we need
to make the parser completely streaming, so that it recognizes all
cases where it needs more input.

Luckily, this is fairly trivial, after working around a bunch of
overeager parsing.

The tricky part was the giant `alt`, where inner parsers would start
consuming input and thus become incomplete when they fail afterwards.
Since the format *always* starts the different types with one
discriminator char, we can use that to instantly return the parser and
try the next one instead.

The other tricky part was that lists and records would parse all inner
elements and then choke on the empty string after the last element,
because the inner parser would consume at least the discriminator, and
an empty string is always `Incomplete`. We wrap these into a small
combinator which plays nice with `many0` in that regard.

Change-Id: Ib8d15d9a7cab19d432c6b24a35fcad6a5a72b246
Reviewed-on: https://cl.tvl.fyi/c/depot/+/2704
Tested-by: BuildkiteCI
Reviewed-by: Profpatsch <mail@profpatsch.de>
Reviewed-by: sterni <sternenseemann@systemli.org>
---
 users/Profpatsch/netencode/netencode.rs | 134 +++++++++++++++++-------
 1 file changed, 95 insertions(+), 39 deletions(-)
diff --git a/users/Profpatsch/netencode/netencode.rs b/users/Profpatsch/netencode/netencode.rs
index 280032609..249cc33ed 100644
--- a/users/Profpatsch/netencode/netencode.rs
+++ b/users/Profpatsch/netencode/netencode.rs
@@ -210,9 +210,9 @@ pub mod parse {
     use std::collections::HashMap;
 
     use nom::{IResult};
-    use nom::bytes::complete::{tag, take};
     use nom::branch::{alt};
-    use nom::character::complete::{digit1, char};
+    use nom::bytes::streaming::{tag, take};
+    use nom::character::streaming::{digit1, char};
     use nom::sequence::{tuple};
     use nom::combinator::{map, map_res, flat_map, map_parser, opt};
     use nom::error::{context, ErrorKind, ParseError};
@@ -233,8 +233,10 @@ pub mod parse {
 
     fn sized(begin: char, end: char) -> impl Fn(&[u8]) -> IResult<&[u8], &[u8]> {
         move |s: &[u8]| {
-            let (s, (_, len, _)) = tuple((
-                char(begin),
+            // This is the point where we check the discriminator;
+            // if the beginning char does not match, we can immediately return.
+            let (s, _) = char(begin)(s)?;
+            let (s, (len, _)) = tuple((
                 usize_t,
                 char(':')
             ))(s)?;
@@ -344,14 +346,33 @@ pub mod parse {
         list_g(t_t)(s)
     }
 
+    /// Wrap the inner parser of a `many0`/`fold_many0`, so that the parser
+    /// is not called when `s` is already empty, preventing it from
+    /// returning `Incomplete` on streaming parsing.
+    fn inner_no_empty_string<'a, P, O>(inner: P) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], O>
+    where
+        O: Clone,
+        P: Fn(&'a [u8]) -> IResult<&'a [u8], O>,
+    {
+        move |s: &'a [u8]| {
+            if s.is_empty() {
+                // This is a bit hacky, `many0` considers the inside done
+                // when a parser returns `Err::Error`, ignoring the actual error content
+                Err(nom::Err::Error((s, nom::error::ErrorKind::Many0)))
+            } else {
+                inner(s)
+            }
+        }
+    }
+
     fn list_g<'a, P, O>(inner: P) -> impl Fn(&'a [u8]) -> IResult<&'a [u8], Vec<O>>
     where
         O: Clone,
-        P: Fn(&'a [u8]) -> IResult<&'a [u8], O>
+        P: Fn(&'a [u8]) -> IResult<&'a [u8], O>,
     {
         map_parser(
             sized('[', ']'),
-            nom::multi::many0(inner)
+            nom::multi::many0(inner_no_empty_string(inner))
         )
     }
 
@@ -368,21 +389,29 @@ pub mod parse {
         O: Clone,
         P: Fn(&'a [u8]) -> IResult<&'a [u8], O>
     {
-        map_parser(
-            sized('{', '}'),
-            nom::multi::fold_many1(
-                tag_g(inner),
-                HashMap::new(),
-                |mut acc: HashMap<_,_>, Tag { tag, mut val }| {
-                    // ignore duplicated tag names that appear later
-                    // according to netencode spec
-                    if ! acc.contains_key(tag) {
-                        acc.insert(tag, *val);
+        move |s: &'a [u8]| {
+            let (s, map) = map_parser(
+                sized('{', '}'),
+                nom::multi::fold_many0(
+                    inner_no_empty_string(tag_g(&inner)),
+                    HashMap::new(),
+                    |mut acc: HashMap<_,_>, Tag { tag, mut val }| {
+                        // ignore duplicated tag names that appear later
+                        // according to netencode spec
+                        if ! acc.contains_key(tag) {
+                            acc.insert(tag, *val);
+                        }
+                        acc
                     }
-                    acc
-                }
-            )
-        )
+                )
+            )(s)?;
+            if map.is_empty() {
+                // records must not be empty, according to the spec
+                Err(nom::Err::Failure((s,nom::error::ErrorKind::Many1)))
+            } else {
+                Ok((s, map))
+            }
+        }
     }
 
     pub fn u_u(s: &[u8]) -> IResult<&[u8], U> {
@@ -512,16 +541,19 @@ pub mod parse {
         fn test_parse_text() {
             assert_eq!(
                 text("t5:hello,".as_bytes()),
-                Ok(("".as_bytes(), T::Text("hello".to_owned())))
+                Ok(("".as_bytes(), T::Text("hello".to_owned()))),
+                "{}", r"t5:hello,"
             );
             assert_eq!(
-                text("t4:fo,".as_bytes()),
-                // TODO: way better parse error messages
-                Err(nom::Err::Error(("fo,".as_bytes(), nom::error::ErrorKind::Eof)))
+                text("t4:fo".as_bytes()),
+                // The content of the text should be 4 long
+                Err(nom::Err::Incomplete(nom::Needed::Size(4))),
+                "{}", r"t4:fo,"
             );
             assert_eq!(
                 text("t9:今日は,".as_bytes()),
-                Ok(("".as_bytes(), T::Text("今日は".to_owned())))
+                Ok(("".as_bytes(), T::Text("今日は".to_owned()))),
+                "{}", r"t9:今日は,"
             );
         }
 
@@ -529,16 +561,25 @@ pub mod parse {
         fn test_parse_binary() {
             assert_eq!(
                 binary()("b5:hello,".as_bytes()),
-                Ok(("".as_bytes(), T::Binary(Vec::from("hello".to_owned()))))
+                Ok(("".as_bytes(), T::Binary(Vec::from("hello".to_owned())))),
+                "{}", r"b5:hello,"
             );
             assert_eq!(
-                binary()("b4:fo,".as_bytes()),
-                // TODO: way better parse error messages
-                Err(nom::Err::Error(("fo,".as_bytes(), nom::error::ErrorKind::Eof)))
+                binary()("b4:fo".as_bytes()),
+                // The content of the byte should be 4 long
+                Err(nom::Err::Incomplete(nom::Needed::Size(4))),
+                "{}", r"b4:fo,"
             );
+            assert_eq!(
+                binary()("b4:foob".as_bytes()),
+                // The content is 4 bytes now, but the finishing , is missing
+                Err(nom::Err::Incomplete(nom::Needed::Size(1))),
+                    "{}", r"b4:fo,"
+                );
             assert_eq!(
                 binary()("b9:今日は,".as_bytes()),
-                Ok(("".as_bytes(), T::Binary(Vec::from("今日は".as_bytes()))))
+                Ok(("".as_bytes(), T::Binary(Vec::from("今日は".as_bytes())))),
+                "{}", r"b9:今日は,"
             );
         }
 
@@ -546,7 +587,8 @@ pub mod parse {
         fn test_list() {
             assert_eq!(
                 list_t("[0:]".as_bytes()),
-                Ok(("".as_bytes(), vec![]))
+                Ok(("".as_bytes(), vec![])),
+                "{}", r"[0:]"
             );
             assert_eq!(
                 list_t("[6:u,u,u,]".as_bytes()),
@@ -554,7 +596,8 @@ pub mod parse {
                     T::Unit,
                     T::Unit,
                     T::Unit,
-                ]))
+                ])),
+                "{}", r"[6:u,u,u,]"
             );
             assert_eq!(
                 list_t("[15:u,[7:t3:foo,]u,]".as_bytes()),
@@ -562,7 +605,8 @@ pub mod parse {
                     T::Unit,
                     T::List(vec![T::Text("foo".to_owned())]),
                     T::Unit,
-                ]))
+                ])),
+                "{}", r"[15:u,[7:t3:foo,]u,]"
             );
         }
 
@@ -574,7 +618,8 @@ pub mod parse {
                     ("a".to_owned(), T::Unit),
                     ("b".to_owned(), T::Unit),
                     ("c".to_owned(), T::Unit),
-                ].into_iter().collect::<HashMap<String, T>>()))
+                ].into_iter().collect::<HashMap<String, T>>())),
+                "{}", r"{21:<1:a|u,<1:b|u,<1:c|u,}"
             );
             // duplicated keys are ignored (first is taken)
             assert_eq!(
@@ -582,7 +627,14 @@ pub mod parse {
                 Ok(("".as_bytes(), vec![
                     ("a".to_owned(), T::Unit),
                     ("b".to_owned(), T::Unit),
-                ].into_iter().collect::<HashMap<_,_>>()))
+                ].into_iter().collect::<HashMap<_,_>>())),
+                "{}", r"{25:<1:a|u,<1:b|u,<1:a|i1:-1,}"
+            );
+            // empty records are not allowed
+            assert_eq!(
+                record_t("{0:}".as_bytes()),
+                Err(nom::Err::Failure(("".as_bytes(), nom::error::ErrorKind::Many1))),
+                "{}", r"{0:}"
             );
         }
 
@@ -590,18 +642,21 @@ pub mod parse {
         fn test_parse() {
             assert_eq!(
                 t_t("n3:255,".as_bytes()),
-                Ok(("".as_bytes(), T::N3(255)))
+                Ok(("".as_bytes(), T::N3(255))),
+                "{}", r"n3:255,"
             );
             assert_eq!(
                 t_t("t6:halloo,".as_bytes()),
-                Ok(("".as_bytes(), T::Text("halloo".to_owned())))
+                Ok(("".as_bytes(), T::Text("halloo".to_owned()))),
+                "{}", r"t6:halloo,"
             );
             assert_eq!(
                 t_t("<3:foo|t6:halloo,".as_bytes()),
                 Ok(("".as_bytes(), T::Sum (Tag {
                     tag: "foo".to_owned(),
                     val: Box::new(T::Text("halloo".to_owned()))
-                })))
+                }))),
+                "{}", r"<3:foo|t6:halloo,"
             );
             // { a: Unit
             // , foo: List <A: Unit | B: List i3> }
@@ -614,7 +669,8 @@ pub mod parse {
                         T::Sum(Tag { tag: "A".to_owned(), val: Box::new(T::N1(true)) }),
                         T::Sum(Tag { tag: "B".to_owned(), val: Box::new(T::List(vec![T::I3(127)])) }),
                     ]))
-                ].into_iter().collect::<HashMap<String, T>>())))
+                ].into_iter().collect::<HashMap<String, T>>()))),
+                "{}", r"{52:<1:a|u,<3:foo|[33:<1:A|u,<1:A|n1:1,<1:B|[7:i3:127,]]}"
             );
         }