refactor(sterni/nix/utf8): let wellFormedByte check first byte

Previously we would check the first byte only when trying to figure out
the predicate for the second byte. If the first byte was invalid, we'd
then throw with a helpful error message. However, this made
wellFormedByte a very weird function.

At the expense of doing the same check twice, we now check the first
byte when it is first passed and always return a boolean.

Change-Id: I32ab6051c844711849e5b4a115e2511b53682baa
This commit is contained in:
sterni 2021-11-23 19:35:16 +01:00
parent 87a0aaa77d
commit 0e9c770972

View file

@ -38,9 +38,7 @@ let
Based on table 3-7. from The Unicode Standard,
Version 13.0, section 3.9.
Throws if the first byte is invalid.
Type: integer -> integer -> (integer -> bool)
Type: integer -> integer -> integer -> bool
*/
wellFormedByte =
# first byte's integer value
@ -49,16 +47,8 @@ let
pos:
let
defaultRange = int.inRange 128 191;
in
# The first byte is either ASCII which requires no checks
# or we automatically check it when we check the subsequent
# bytes. The downside is that this may generate bad error
# messages in very rare cases.
if pos == 0
then lib.const true
else if pos > 1 # 3rd and 4th byte have only one validity rule
then defaultRange
else assert pos == 1; flow.switch first [
secondBytePredicate = flow.switch first [
[ (int.inRange 194 223) defaultRange ] # C2..DF
[ 224 (int.inRange 160 191) ] # E0
[ (int.inRange 225 236) defaultRange ] # E1..EC
@ -67,12 +57,19 @@ let
[ 240 (int.inRange 144 191) ] # F0
[ (int.inRange 241 243) defaultRange ] # F1..F3
[ 244 (int.inRange 128 143) ] # F4
[
(fun.const true)
(builtins.throw "Invalid first byte ${int.toHex first}")
]
[ (fun.const true) null ]
];
firstBytePredicate = byte: assert first == byte;
first < 128 || secondBytePredicate != null;
in
# Either ASCII or in one of the byte ranges of Table 3-7.
if pos == 0 then firstBytePredicate
# return predicate according to Table 3-7.
else if pos == 1 then assert secondBytePredicate != null; secondBytePredicate
# 3rd and 4th byte have only one validity rule
else defaultRange;
/* Iteration step for decoding an UTF-8 byte sequence.
It decodes incrementally, i. e. it has to be fed
one byte at a time and then returns either a