87a0aaa77d
This implementation is still a bit rough as it doesn't check if the produced string is valid UTF-8 which may happen if an invalid Unicode codepoint is passed. Change-Id: Ibaa91dafa8937142ef704a175efe967b62e3ee7b
288 lines
9.6 KiB
Nix
288 lines
9.6 KiB
Nix
{ depot, lib, ... }:
|
|
|
|
let
|
|
|
|
inherit (depot.users.sterni.nix)
|
|
char
|
|
flow
|
|
fun
|
|
int
|
|
string
|
|
util
|
|
;
|
|
|
|
/* (Internal) function to determine the amount
|
|
bytes left in a UTF-8 byte sequence from the
|
|
first byte.
|
|
|
|
This function will throw if the given first
|
|
byte is ill-formed, but will not detect all
|
|
cases of ill-formed-ness.
|
|
|
|
Based on table 3-6. from The Unicode Standard,
|
|
Version 13.0, section 3.9.
|
|
|
|
Type: integer -> integer
|
|
*/
|
|
byteCount = i: flow.cond [
|
|
[ (int.bitAnd i 128 == 0) 1 ]
|
|
[ (int.bitAnd i 224 == 192) 2 ]
|
|
[ (int.bitAnd i 240 == 224) 3 ]
|
|
[ (int.bitAnd i 248 == 240) 4 ]
|
|
[ true (builtins.throw "Ill-formed first byte ${int.toHex i}") ]
|
|
];
|
|
|
|
/* (Internal) function to check if a given byte in
|
|
an UTF-8 byte sequence is well-formed.
|
|
|
|
Based on table 3-7. from The Unicode Standard,
|
|
Version 13.0, section 3.9.
|
|
|
|
Throws if the first byte is invalid.
|
|
|
|
Type: integer -> integer -> (integer -> bool)
|
|
*/
|
|
wellFormedByte =
|
|
# first byte's integer value
|
|
first:
|
|
# byte position as an index starting with 0
|
|
pos:
|
|
let
|
|
defaultRange = int.inRange 128 191;
|
|
in
|
|
# The first byte is either ASCII which requires no checks
|
|
# or we automatically check it when we check the subsequent
|
|
# bytes. The downside is that this may generate bad error
|
|
# messages in very rare cases.
|
|
if pos == 0
|
|
then lib.const true
|
|
else if pos > 1 # 3rd and 4th byte have only one validity rule
|
|
then defaultRange
|
|
else assert pos == 1; flow.switch first [
|
|
[ (int.inRange 194 223) defaultRange ] # C2..DF
|
|
[ 224 (int.inRange 160 191) ] # E0
|
|
[ (int.inRange 225 236) defaultRange ] # E1..EC
|
|
[ 237 (int.inRange 128 159) ] # ED
|
|
[ (int.inRange 238 239) defaultRange ] # EE..EF
|
|
[ 240 (int.inRange 144 191) ] # F0
|
|
[ (int.inRange 241 243) defaultRange ] # F1..F3
|
|
[ 244 (int.inRange 128 143) ] # F4
|
|
[
|
|
(fun.const true)
|
|
(builtins.throw "Invalid first byte ${int.toHex first}")
|
|
]
|
|
];
|
|
|
|
/* Iteration step for decoding an UTF-8 byte sequence.
|
|
It decodes incrementally, i. e. it has to be fed
|
|
one byte at a time and then returns either a
|
|
new state or a final result.
|
|
|
|
If the resulting attribute set contains the attribute
|
|
result, it is finished and the decoded codepoint is
|
|
contained in that attribute. In all other cases,
|
|
pass the returned set to step again along with
|
|
a new byte. The initial state to pass is the empty
|
|
set.
|
|
|
|
Extra attributes are always passed through, so you
|
|
can pass extra state. Be sure not to use result,
|
|
pos, code, first or count.
|
|
|
|
This function will throw with a fairly detailed
|
|
message if it encounters ill-formed bytes.
|
|
|
|
The implementation is based on The Unicode Standard,
|
|
Version 13.0, section 3.9, especially table 3-6.
|
|
|
|
Type: { ... } -> string -> ({ result :: integer, ... } | { ... })
|
|
|
|
Example: utf8.step {} "f"
|
|
=> { result = 102; }
|
|
*/
|
|
step = { pos ? 0, code ? 0, ... }@args: byte:
|
|
let
|
|
value = char.ord byte;
|
|
# first byte is context for well-formed-ness
|
|
first = args.first or value;
|
|
count = args.count or (byteCount first);
|
|
newCode =
|
|
if count == 1
|
|
then int.bitAnd 127 first # ascii character
|
|
else # multi byte UTF-8 sequence
|
|
let
|
|
# Calculate the bitmask for extracting the
|
|
# codepoint data in the current byte.
|
|
# If the codepoint is not ASCII, the bits
|
|
# used for codepoint data differ depending
|
|
# on the byte position and overall byte
|
|
# count. The first byte always ignores
|
|
# the (count + 1) most significant bits.
|
|
# For all subsequent bytes, the 2 most
|
|
# significant bits need to be ignored.
|
|
# See also table 3-6.
|
|
mask =
|
|
if pos == 0
|
|
then int.exp 2 (8 - (count + 1)) - 1
|
|
else 63;
|
|
# UTF-8 uses the 6 least significant bits in all
|
|
# subsequent bytes after the first one. Therefore
|
|
# We can determine the amount we need to shift
|
|
# the current value by the amount of bytes left.
|
|
offset = (count - (pos + 1)) * 6;
|
|
in
|
|
code + (int.bitShiftL (int.bitAnd mask value) offset);
|
|
illFormedMsg =
|
|
"Ill-formed byte ${int.toHex value} at position ${toString pos} in ${toString count} byte UTF-8 sequence";
|
|
in
|
|
if !(wellFormedByte first pos value) then builtins.throw illFormedMsg
|
|
else if pos + 1 == count
|
|
then (builtins.removeAttrs args [ # allow extra state being passed through
|
|
"count"
|
|
"code"
|
|
"pos"
|
|
"first"
|
|
]) // { result = newCode; }
|
|
else (builtins.removeAttrs args [ "result" ]) // {
|
|
inherit count first;
|
|
code = newCode;
|
|
pos = pos + 1;
|
|
};
|
|
|
|
/* Decode an UTF-8 string into a list of codepoints.
|
|
|
|
Throws if the string is ill-formed UTF-8.
|
|
|
|
Type: string -> [ integer ]
|
|
*/
|
|
# TODO(sterni): option to fallback to replacement char instead of failure
|
|
decode = s:
|
|
let
|
|
stringLength = builtins.stringLength s;
|
|
iterResult = builtins.genericClosure {
|
|
startSet = [
|
|
{
|
|
key = "start";
|
|
stringIndex = -1;
|
|
state = {};
|
|
codepoint = null;
|
|
}
|
|
];
|
|
operator = { state, stringIndex, ... }:
|
|
let
|
|
# updated values for current iteration step
|
|
newIndex = stringIndex + 1;
|
|
newState = step state (builtins.substring newIndex 1 s);
|
|
in lib.optional (newIndex < stringLength) {
|
|
# unique keys to make genericClosure happy
|
|
key = toString newIndex;
|
|
# carryover state for the next step
|
|
stringIndex = newIndex;
|
|
state = newState;
|
|
# actual payload for later, steps with value null are filtered out
|
|
codepoint = newState.result or null;
|
|
};
|
|
};
|
|
in
|
|
# extract all steps that yield a code point into a list
|
|
builtins.map (v: v.codepoint) (
|
|
builtins.filter (
|
|
{ codepoint, stringIndex, state, ... }:
|
|
|
|
let
|
|
# error message in case we are missing bytes at the end of input
|
|
earlyEndMsg =
|
|
if state ? count && state ? pos
|
|
then "Missing ${toString (with state; count - pos)} bytes at end of input"
|
|
else "Unexpected end of input";
|
|
in
|
|
|
|
# filter out all iteration steps without a codepoint value
|
|
codepoint != null
|
|
# if we are at the iteration step of a non-empty input string, throw
|
|
# an error if no codepoint was returned, as it indicates an incomplete
|
|
# UTF-8 sequence.
|
|
|| (stringLength > 0 && stringIndex == stringLength - 1 && throw earlyEndMsg)
|
|
|
|
) iterResult
|
|
);
|
|
|
|
encodeCodepoint = cp:
|
|
let
|
|
# Find the amount of bytes needed to encode the given codepoint.
|
|
# Note that this doesn't check if the Unicode codepoint is allowed,
|
|
# but rather allows all theoretically UTF-8-encodeable ones.
|
|
count = flow.switch cp [
|
|
[ (int.inRange 0 127) 1 ] # 00000000 0xxxxxxx
|
|
[ (int.inRange 128 2047) 2 ] # 00000yyy yyxxxxxx
|
|
[ (int.inRange 2048 65535) 3 ] # zzzzyyyy yyxxxxxx
|
|
[ (int.inRange 65536 2097151) 4 ] # 000uuuuu zzzzyyyy yyxxxxxx
|
|
];
|
|
|
|
# Extract the bit ranges x, y, z and u from the given codepoint
|
|
# according to Table 3-6. from The Unicode Standard, Version 13.0,
|
|
# section 3.9. u is split into uh and ul since they are used in
|
|
# different bytes in the end.
|
|
components = lib.mapAttrs (_: { mask, offset }:
|
|
int.bitAnd (int.bitShiftR cp offset) mask
|
|
) {
|
|
x = {
|
|
mask = if count > 1 then 63 else 127;
|
|
offset = 0;
|
|
};
|
|
y = {
|
|
mask = if count > 2 then 63 else 31;
|
|
offset = 6;
|
|
};
|
|
z = {
|
|
mask = 15;
|
|
offset = 12;
|
|
};
|
|
# u which belongs into the second byte
|
|
ul = {
|
|
mask = 3;
|
|
offset = 16;
|
|
};
|
|
# u which belongs into the first byte
|
|
uh = {
|
|
mask = 7;
|
|
offset = 18;
|
|
};
|
|
};
|
|
inherit (components) x y z ul uh;
|
|
|
|
# Finally construct the byte sequence for the given codepoint. This is
|
|
# usually done by using the component and adding a few bits as a prefix
|
|
# which depends on the length of the sequence. The longer the sequence,
|
|
# the further back each component is pushed. To simplify this, we
|
|
# always construct a 4 element list and take the last `count` elements.
|
|
# Thanks to laziness the bogus values created by this are never evaluated.
|
|
#
|
|
# Based on table 3-6. from The Unicode Standard,
|
|
# Version 13.0, section 3.9.
|
|
bytes = lib.sublist (4 - count) count [
|
|
# 11110uuu
|
|
(uh + 240)
|
|
# 10uuzzzz or 1110zzzz
|
|
(z + (if count > 3 then 128 + int.bitShiftL ul 4 else 224))
|
|
# 10yyyyyy or 110yyyyy
|
|
(y + (if count > 2 then 128 else 192))
|
|
# 10xxxxxx or 0xxxxxxx
|
|
(x + (if count > 1 then 128 else 0))
|
|
];
|
|
|
|
in string.fromBytes bytes;
|
|
|
|
/* Encode a list of Unicode codepoints into an UTF-8 string.
|
|
|
|
Type: [ integer ] -> string
|
|
*/
|
|
encode = lib.concatMapStrings encodeCodepoint;
|
|
|
|
in {
|
|
inherit
|
|
encode
|
|
decode
|
|
step
|
|
;
|
|
}
|