feat(sterni/nix/utf8): implement UTF-8 encoding
This implementation is still a bit rough: it does not check whether the produced string is valid UTF-8, which can happen if an invalid Unicode codepoint is passed.

Change-Id: Ibaa91dafa8937142ef704a175efe967b62e3ee7b
parent 9370ea5e33
commit 87a0aaa77d
2 changed files with 83 additions and 2 deletions
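For context, `encode` takes a list of Unicode codepoints (as integers) and produces the corresponding UTF-8 string. A rough usage sketch, assuming the library ends up exposed as depot.users.sterni.nix.utf8 (the attribute path suggested by the imports and the test file below):

    # Hypothetical REPL-style example, not part of the commit:
    # "Nix ❄" as codepoints: N = 78, i = 105, x = 120, space = 32, ❄ (U+2744) = 10052
    utf8.encode [ 78 105 120 32 10052 ]
    # => "Nix ❄" (bytes 0x4e 0x69 0x78 0x20 0xe2 0x9d 0x84)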
@@ -2,8 +2,6 @@
let

  # TODO(sterni): encode

  inherit (depot.users.sterni.nix)
    char
    flow
@@ -209,8 +207,81 @@ let
      ) iterResult
  );

  encodeCodepoint = cp:
    let
      # Find the amount of bytes needed to encode the given codepoint.
      # Note that this doesn't check if the Unicode codepoint is allowed,
      # but rather allows all theoretically UTF-8-encodeable ones.
      count = flow.switch cp [
        [ (int.inRange 0 127) 1 ]          # 00000000 0xxxxxxx
        [ (int.inRange 128 2047) 2 ]       # 00000yyy yyxxxxxx
        [ (int.inRange 2048 65535) 3 ]     # zzzzyyyy yyxxxxxx
        [ (int.inRange 65536 2097151) 4 ]  # 000uuuuu zzzzyyyy yyxxxxxx
      ];

      # Extract the bit ranges x, y, z and u from the given codepoint
      # according to Table 3-6. from The Unicode Standard, Version 13.0,
      # section 3.9. u is split into uh and ul since they are used in
      # different bytes in the end.
      components = lib.mapAttrs (_: { mask, offset }:
        int.bitAnd (int.bitShiftR cp offset) mask
      ) {
        x = {
          mask = if count > 1 then 63 else 127;
          offset = 0;
        };
        y = {
          mask = if count > 2 then 63 else 31;
          offset = 6;
        };
        z = {
          mask = 15;
          offset = 12;
        };
        # u which belongs into the second byte
        ul = {
          mask = 3;
          offset = 16;
        };
        # u which belongs into the first byte
        uh = {
          mask = 7;
          offset = 18;
        };
      };
      inherit (components) x y z ul uh;

      # Finally construct the byte sequence for the given codepoint. This is
      # usually done by using the component and adding a few bits as a prefix
      # which depends on the length of the sequence. The longer the sequence,
      # the further back each component is pushed. To simplify this, we
      # always construct a 4 element list and take the last `count` elements.
      # Thanks to laziness the bogus values created by this are never evaluated.
      #
      # Based on table 3-6. from The Unicode Standard,
      # Version 13.0, section 3.9.
      bytes = lib.sublist (4 - count) count [
        # 11110uuu
        (uh + 240)
        # 10uuzzzz or 1110zzzz
        (z + (if count > 3 then 128 + int.bitShiftL ul 4 else 224))
        # 10yyyyyy or 110yyyyy
        (y + (if count > 2 then 128 else 192))
        # 10xxxxxx or 0xxxxxxx
        (x + (if count > 1 then 128 else 0))
      ];

    in string.fromBytes bytes;

  /* Encode a list of Unicode codepoints into an UTF-8 string.

     Type: [ integer ] -> string
  */
  encode = lib.concatMapStrings encodeCodepoint;

in {
  inherit
    encode
    decode
    step
    ;
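As a sanity check on the bit layout documented in the comments above, the 3-byte case can be worked through by hand. The following standalone sketch is not part of the commit and uses plain integer division in place of the depot's int.bitShiftR/int.bitAnd helpers; it reproduces the UTF-8 bytes of ❄ (U+2744):

    let
      cp = 10052;                        # U+2744, in int.inRange 2048 65535, so count = 3
      x = cp - (cp / 64) * 64;           # low 6 bits  -> 4
      y = (cp / 64) - (cp / 4096) * 64;  # next 6 bits -> 29
      z = cp / 4096;                     # top 4 bits  -> 2
    in [ (z + 224) (y + 128) (x + 128) ] # => [ 226 157 132 ], i.e. 0xe2 0x9d 0x84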
@@ -113,9 +113,19 @@ let
      randomUnicode
    ]));

  testDecodingEncoding = it "checks that decoding and then encoding forms an identity"
    (builtins.map
      (s: assertEq "Decoding and then encoding “${s}” yields itself"
        (utf8.encode (utf8.decode s)) s)
      (lib.flatten [
        glassSentences
        randomUnicode
      ]));

in
  runTestsuite "nix.utf8" [
    testFailures
    testAscii
    testDecoding
    testDecodingEncoding
  ]
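The round-trip test above only exercises valid input, so the caveat from the commit message still stands: passing an invalid codepoint (a surrogate or a value above U+10FFFF) yields a string that is not valid UTF-8. A hypothetical follow-up guard, not part of this change and merely reusing the encodeCodepoint name from the diff, could look roughly like:

    # Hypothetical sketch only.
    encodeCodepointChecked = cp:
      if (cp >= 55296 && cp <= 57343) || cp < 0 || cp > 1114111  # surrogates and out-of-range values
      then builtins.throw "${toString cp} is not a valid Unicode codepoint"
      else encodeCodepoint cp;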