feat(sterni/nix/utf8): implement UTF-8 encoding
This implementation is still a bit rough: it does not check whether the produced string is valid UTF-8, which can happen if an invalid Unicode codepoint is passed.

Change-Id: Ibaa91dafa8937142ef704a175efe967b62e3ee7b
parent 9370ea5e33
commit 87a0aaa77d
2 changed files with 83 additions and 2 deletions
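For context, `encode` takes a list of Unicode codepoints (as integers) and produces the corresponding UTF-8 string. A rough usage sketch, assuming the library ends up exposed as depot.users.sterni.nix.utf8 (the attribute path suggested by the imports and the test file below):

    # Hypothetical REPL-style example, not part of the commit:
    # "Nix ❄" as codepoints: N = 78, i = 105, x = 120, space = 32, ❄ (U+2744) = 10052
    utf8.encode [ 78 105 120 32 10052 ]
    # => "Nix ❄" (bytes 0x4e 0x69 0x78 0x20 0xe2 0x9d 0x84)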
@@ -2,8 +2,6 @@
let

  # TODO(sterni): encode

  inherit (depot.users.sterni.nix)
    char
    flow
@@ -209,8 +207,81 @@ let
      ) iterResult
  );

  encodeCodepoint = cp:
    let
      # Find the amount of bytes needed to encode the given codepoint.
      # Note that this doesn't check if the Unicode codepoint is allowed,
      # but rather allows all theoretically UTF-8-encodeable ones.
      count = flow.switch cp [
        [ (int.inRange 0 127) 1 ]          # 00000000 0xxxxxxx
        [ (int.inRange 128 2047) 2 ]       # 00000yyy yyxxxxxx
        [ (int.inRange 2048 65535) 3 ]     # zzzzyyyy yyxxxxxx
        [ (int.inRange 65536 2097151) 4 ]  # 000uuuuu zzzzyyyy yyxxxxxx
      ];

      # Extract the bit ranges x, y, z and u from the given codepoint
      # according to Table 3-6. from The Unicode Standard, Version 13.0,
      # section 3.9. u is split into uh and ul since they are used in
      # different bytes in the end.
      components = lib.mapAttrs (_: { mask, offset }:
        int.bitAnd (int.bitShiftR cp offset) mask
      ) {
        x = {
          mask = if count > 1 then 63 else 127;
          offset = 0;
        };
        y = {
          mask = if count > 2 then 63 else 31;
          offset = 6;
        };
        z = {
          mask = 15;
          offset = 12;
        };
        # u which belongs into the second byte
        ul = {
          mask = 3;
          offset = 16;
        };
        # u which belongs into the first byte
        uh = {
          mask = 7;
          offset = 18;
        };
      };
      inherit (components) x y z ul uh;

      # Finally construct the byte sequence for the given codepoint. This is
      # usually done by using the component and adding a few bits as a prefix
      # which depends on the length of the sequence. The longer the sequence,
      # the further back each component is pushed. To simplify this, we
      # always construct a 4 element list and take the last `count` elements.
      # Thanks to laziness the bogus values created by this are never evaluated.
      #
      # Based on table 3-6. from The Unicode Standard,
      # Version 13.0, section 3.9.
      bytes = lib.sublist (4 - count) count [
        # 11110uuu
        (uh + 240)
        # 10uuzzzz or 1110zzzz
        (z + (if count > 3 then 128 + int.bitShiftL ul 4 else 224))
        # 10yyyyyy or 110yyyyy
        (y + (if count > 2 then 128 else 192))
        # 10xxxxxx or 0xxxxxxx
        (x + (if count > 1 then 128 else 0))
      ];

    in string.fromBytes bytes;

  /* Encode a list of Unicode codepoints into an UTF-8 string.

     Type: [ integer ] -> string
  */
  encode = lib.concatMapStrings encodeCodepoint;

in {
  inherit
    encode
    decode
    step
    ;
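As a sanity check on the bit layout documented in the comments above, the 3-byte case can be worked through by hand. The following standalone sketch is not part of the commit and uses plain integer division in place of the depot's int.bitShiftR/int.bitAnd helpers; it reproduces the UTF-8 bytes of ❄ (U+2744):

    let
      cp = 10052;                        # U+2744, in int.inRange 2048 65535, so count = 3
      x = cp - (cp / 64) * 64;           # low 6 bits  -> 4
      y = (cp / 64) - (cp / 4096) * 64;  # next 6 bits -> 29
      z = cp / 4096;                     # top 4 bits  -> 2
    in [ (z + 224) (y + 128) (x + 128) ] # => [ 226 157 132 ], i.e. 0xe2 0x9d 0x84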
@@ -113,9 +113,19 @@ let
      randomUnicode
    ]));

  testDecodingEncoding = it "checks that decoding and then encoding forms an identity"
    (builtins.map
      (s: assertEq "Decoding and then encoding “${s}” yields itself"
        (utf8.encode (utf8.decode s)) s)
      (lib.flatten [
        glassSentences
        randomUnicode
      ]));

in
  runTestsuite "nix.utf8" [
    testFailures
    testAscii
    testDecoding
    testDecodingEncoding
  ]
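The round-trip test above only exercises valid input, so the caveat from the commit message still stands: passing an invalid codepoint (a surrogate or a value above U+10FFFF) yields a string that is not valid UTF-8. A hypothetical follow-up guard, not part of this change and merely reusing the encodeCodepoint name from the diff, could look roughly like:

    # Hypothetical sketch only.
    encodeCodepointChecked = cp:
      if (cp >= 55296 && cp <= 57343) || cp < 0 || cp > 1114111  # surrogates and out-of-range values
      then builtins.throw "${toString cp} is not a valid Unicode codepoint"
      else encodeCodepoint cp;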