refactor(sterni/nix/utf8): let wellFormedByte check first byte

Previously we would check the first byte only when trying to figure out
the predicate for the second byte. If the first byte was invalid, we'd
then throw with a helpful error message. However, this made
wellFormedByte a very weird function.

At the expense of doing the same check twice, we now check the first
byte when it is first passed and always return a boolean.

Change-Id: I32ab6051c844711849e5b4a115e2511b53682baa
This commit is contained in:
sterni 2021-11-23 19:35:16 +01:00
parent 87a0aaa77d
commit 0e9c770972

View file

@ -38,9 +38,7 @@ let
Based on table 3-7. from The Unicode Standard, Based on table 3-7. from The Unicode Standard,
Version 13.0, section 3.9. Version 13.0, section 3.9.
Throws if the first byte is invalid. Type: integer -> integer -> integer -> bool
Type: integer -> integer -> (integer -> bool)
*/ */
wellFormedByte = wellFormedByte =
# first byte's integer value # first byte's integer value
@ -49,16 +47,8 @@ let
pos: pos:
let let
defaultRange = int.inRange 128 191; defaultRange = int.inRange 128 191;
in
# The first byte is either ASCII which requires no checks secondBytePredicate = flow.switch first [
# or we automatically check it when we check the subsequent
# bytes. The downside is that this may generate bad error
# messages in very rare cases.
if pos == 0
then lib.const true
else if pos > 1 # 3rd and 4th byte have only one validity rule
then defaultRange
else assert pos == 1; flow.switch first [
[ (int.inRange 194 223) defaultRange ] # C2..DF [ (int.inRange 194 223) defaultRange ] # C2..DF
[ 224 (int.inRange 160 191) ] # E0 [ 224 (int.inRange 160 191) ] # E0
[ (int.inRange 225 236) defaultRange ] # E1..EC [ (int.inRange 225 236) defaultRange ] # E1..EC
@ -67,12 +57,19 @@ let
[ 240 (int.inRange 144 191) ] # F0 [ 240 (int.inRange 144 191) ] # F0
[ (int.inRange 241 243) defaultRange ] # F1..F3 [ (int.inRange 241 243) defaultRange ] # F1..F3
[ 244 (int.inRange 128 143) ] # F4 [ 244 (int.inRange 128 143) ] # F4
[ [ (fun.const true) null ]
(fun.const true)
(builtins.throw "Invalid first byte ${int.toHex first}")
]
]; ];
firstBytePredicate = byte: assert first == byte;
first < 128 || secondBytePredicate != null;
in
# Either ASCII or in one of the byte ranges of Table 3-7.
if pos == 0 then firstBytePredicate
# return predicate according to Table 3-7.
else if pos == 1 then assert secondBytePredicate != null; secondBytePredicate
# 3rd and 4th byte have only one validity rule
else defaultRange;
/* Iteration step for decoding an UTF-8 byte sequence. /* Iteration step for decoding an UTF-8 byte sequence.
It decodes incrementally, i. e. it has to be fed It decodes incrementally, i. e. it has to be fed
one byte at a time and then returns either a one byte at a time and then returns either a