feat(sterni/nix/utf8): check if codepoint valid/encodeable

* Enforce the U+0000 to U+10FFFF range in `count` and throw an error if
  the given codepoint lies outside that range (encoding U+0000 still
  won't work, of course, since Nix strings can't contain NUL bytes, but
  this is Nix's fault…).

* Check if the produced bytes are well formed and throw an error if
  not. Ill-formed bytes indicate that the codepoint can't be encoded as
  UTF-8, such as U+D800, a surrogate which is reserved for UTF-16.

Change-Id: I18336e527484580f28cbfe784d51718ee15c5477
This commit is contained in:
sterni 2021-11-23 19:58:15 +01:00
parent 8dc54f89cd
commit 750ef6c693
2 changed files with 42 additions and 4 deletions

View file

@ -204,6 +204,15 @@ let
) iterResult
);
/* Render a Unicode codepoint as a string in the U+<HEX> notation.

   The hexadecimal digits are produced by `int.toHex` and then passed
   through `string.fit` with a width of 4 and "0" as the fill character
   before the "U+" prefix is prepended.

   Type: integer -> string
*/
formatCodepoint = cp:
  let
    hexDigits = int.toHex cp;
    fitToFour = string.fit { char = "0"; width = 4; };
  in
    "U+" + fitToFour hexDigits;
encodeCodepoint = cp:
let
# Find the amount of bytes needed to encode the given codepoint.
@ -213,9 +222,14 @@ let
[ (int.inRange 0 127) 1 ] # 00000000 0xxxxxxx
[ (int.inRange 128 2047) 2 ] # 00000yyy yyxxxxxx
[ (int.inRange 2048 65535) 3 ] # zzzzyyyy yyxxxxxx
[ (int.inRange 65536 2097151) 4 ] # 000uuuuu zzzzyyyy yyxxxxxx
[ (int.inRange 65536 1114111) 4 ] # 000uuuuu zzzzyyyy yyxxxxxx,
# capped at U+10FFFF
[ (fun.const true) (builtins.throw invalidCodepointMsg) ]
];
invalidCodepointMsg = "${formatCodepoint cp} is not a Unicode codepoint";
# Extract the bit ranges x, y, z and u from the given codepoint
# according to Table 3-6. from The Unicode Standard, Version 13.0,
# section 3.9. u is split into uh and ul since they are used in
@ -268,7 +282,20 @@ let
(x + (if count > 1 then 128 else 0))
];
in string.fromBytes bytes;
firstByte = builtins.head bytes;
unableToEncodeMessage = "Can't encode ${formatCodepoint cp} as UTF-8";
in string.fromBytes (
builtins.genList (i:
let
byte = builtins.elemAt bytes i;
in
if wellFormedByte firstByte i byte
then byte
else builtins.throw unableToEncodeMessage
) count
);
/* Encode a list of Unicode codepoints into a UTF-8 string.
@ -281,5 +308,6 @@ in {
encode
decode
step
formatCodepoint
;
}