refactor(sterni/nix/utf8): let wellFormedByte check first byte

Previously we would check the first byte only when trying to figure out the predicate for the second byte. If the first byte was invalid, we'd then throw with a helpful error message. However, this made wellFormedByte a very weird function. At the expense of doing the same check twice, we now check the first byte when it is first passed, and always return a boolean. Change-Id: I32ab6051c844711849e5b4a115e2511b53682baa
2021-11-23 19:35:16 +01:00 · 2021-11-23 19:35:16 +01:00 · 0e9c770972
commit 0e9c770972
parent 87a0aaa77d
1 changed files with 14 additions and 17 deletions
--- a/users/sterni/nix/utf8/default.nix
+++ b/users/sterni/nix/utf8/default.nix
@@ -38,9 +38,7 @@ let
     Based on table 3-7. from The Unicode Standard,
     Version 13.0, section 3.9.

-     Throws if the first byte is invalid.
-
-     Type: integer -> integer -> (integer -> bool)
+     Type: integer -> integer -> integer -> bool
  */
  wellFormedByte =
    # first byte's integer value
@@ -49,16 +47,8 @@ let
    pos:
      let
        defaultRange = int.inRange 128 191;
-      in
-        # The first byte is either ASCII which requires no checks
-        # or we automatically check it when we check the subsequent
-        # bytes. The downside is that this may generate bad error
-        # messages in very rare cases.
-        if pos == 0
-        then lib.const true
-        else if pos > 1 # 3rd and 4th byte have only one validity rule
-        then defaultRange
-        else assert pos == 1; flow.switch first [
+
+        secondBytePredicate = flow.switch first [
          [ (int.inRange 194 223) defaultRange          ] # C2..DF
          [ 224                   (int.inRange 160 191) ] # E0
          [ (int.inRange 225 236) defaultRange          ] # E1..EC
@@ -67,12 +57,19 @@ let
          [ 240                   (int.inRange 144 191) ] # F0
          [ (int.inRange 241 243) defaultRange          ] # F1..F3
          [ 244                   (int.inRange 128 143) ] # F4
-          [
-            (fun.const true)
-            (builtins.throw "Invalid first byte ${int.toHex first}")
-          ]
+          [ (fun.const true)      null                  ]
        ];

+        firstBytePredicate = byte: assert first == byte;
+          first < 128 || secondBytePredicate != null;
+      in
+        # Either ASCII or in one of the byte ranges of Table 3-7.
+        if pos == 0 then firstBytePredicate
+        # return predicate according to Table 3-7.
+        else if pos == 1 then assert secondBytePredicate != null; secondBytePredicate
+        # 3rd and 4th byte have only one validity rule
+        else defaultRange;
+
  /* Iteration step for decoding an UTF-8 byte sequence.
     It decodes incrementally, i. e. it has to be fed
     one byte at a time and then returns either a