feat(sterni/nix/utf8): check if codepoint valid/encodeable

* Enforce the U+0000 to U+10FFFF range in `count` and throw an error if
  the given codepoint exceeds the range (encoding U+0000 won't work of
  course, but this is Nix's fault…).
* Check if the produced bytes are well formed and output an error if
  not. This indicates that the codepoint can't be encoded as UTF-8,
  like U+D800, which is reserved for UTF-16.

Change-Id: I18336e527484580f28cbfe784d51718ee15c5477
2021-11-23 19:58:15 +01:00 · 2021-11-23 19:58:15 +01:00 · 750ef6c693
commit 750ef6c693
parent 8dc54f89cd
2 changed files with 42 additions and 4 deletions
--- a/users/sterni/nix/utf8/default.nix
+++ b/users/sterni/nix/utf8/default.nix
@ -204,6 +204,15 @@ let
      ) iterResult
    );

+  /* Pretty prints a Unicode codepoint in the U+<HEX> notation.
+
+     The result is the literal "U+" followed by `int.toHex cp` passed
+     through `string.fit` with width 4 and fill character "0".
+     NOTE(review): presumably this left-pads short values to four hex
+     digits (e.g. U+0041) and leaves longer ones untouched — confirm
+     against the `string.fit` and `int.toHex` implementations.
+
+     Type: integer -> string
+  */
+  formatCodepoint = cp: "U+" + string.fit {
+    width = 4;
+    char = "0";
+  } (int.toHex cp);
+
  encodeCodepoint = cp:
    let
      # Find the amount of bytes needed to encode the given codepoint.
@ -213,9 +222,14 @@ let
        [ (int.inRange 0 127)         1 ] # 00000000 0xxxxxxx
        [ (int.inRange 128 2047)      2 ] # 00000yyy yyxxxxxx
        [ (int.inRange 2048 65535)    3 ] # zzzzyyyy yyxxxxxx
-        [ (int.inRange 65536 2097151) 4 ] # 000uuuuu zzzzyyyy yyxxxxxx
+        [ (int.inRange 65536 1114111) 4 ] # 000uuuuu zzzzyyyy yyxxxxxx,
+                                          # capped at U+10FFFF
+
+        [ (fun.const true) (builtins.throw invalidCodepointMsg) ]
      ];

+      invalidCodepointMsg = "${formatCodepoint cp} is not a Unicode codepoint";
+
      # Extract the bit ranges x, y, z and u from the given codepoint
      # according to Table 3-6. from The Unicode Standard, Version 13.0,
      # section 3.9. u is split into uh and ul since they are used in
@ -268,7 +282,20 @@ let
        (x + (if count > 1 then 128 else 0))
      ];

-    in string.fromBytes bytes;
+      firstByte = builtins.head bytes;
+
+      unableToEncodeMessage = "Can't encode ${formatCodepoint cp} as UTF-8";
+
+    in string.fromBytes (
+      builtins.genList (i:
+        let
+          byte = builtins.elemAt bytes i;
+        in
+          if wellFormedByte firstByte i byte
+          then byte
+          else builtins.throw unableToEncodeMessage
+      ) count
+    );

  /* Encode a list of Unicode codepoints into an UTF-8 string.

@ -281,5 +308,6 @@ in {
    encode
    decode
    step
+    formatCodepoint
    ;
 }