tvl-depot/users/sterni/nix/utf8/tests/default.nix
sterni b810c46a45 feat(users/sterni/nix/utf8): pure nix utf-8 decoder
users.sterni.nix.utf8 implements UTF-8 decoding in pure nix. We
implement the decoding as a simple state machine which is fed one byte
at a time. Decoding whole strings is possible by repeatedly calling
step. This is done in decode, which uses builtins.foldl' to get around
recursion restrictions, together with a neat trick using
builtins.deepSeq (which puck showed me) that limits the size of the
thunks built up inside a foldl' (which can otherwise also cause a
stack overflow).

This makes decoding arbitrarily large UTF-8 files into codepoints using
nix theoretically possible, but it is not really practical: Decoding a
36KB LaTeX file I had lying around takes ~160s on my laptop.

Change-Id: Iab8c973dac89074ec280b4880a7408e0b3d19bc7
Reviewed-on: https://cl.tvl.fyi/c/depot/+/2590
Tested-by: BuildkiteCI
Reviewed-by: sterni <sternenseemann@systemli.org>
2021-03-05 11:07:41 +00:00

121 lines
4.1 KiB
Nix
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{ depot, lib, ... }:
let
inherit (depot.third_party)
runCommandLocal
;
inherit (depot.nix.runTestsuite)
runTestsuite
it
assertEq
assertThrows
assertDoesNotThrow
;
inherit (depot.users.Profpatsch.writers)
rustSimple
;
inherit (depot.users.sterni.nix)
int
utf8
string
char
;
# Reference decoder used as the oracle for the pure-nix implementation.
# The embedded Rust program reads all of stdin into a `String`
# (`read_to_string` returns an error for input that is not valid UTF-8,
# which `?` propagates out of `main`), then prints "[ ", every `char`
# converted to its `u32` value followed by a space, and finally "]".
# That output has the shape of a Nix list of integers, which rustDecode
# relies on when it imports the result.
# NOTE(review): presumably `rustSimple` compiles the source into an
# executable named "utf8-decode" -- confirm against Profpatsch's writers.
rustDecoder = rustSimple {
name = "utf8-decode";
} ''
use std::io::{self, Read};
fn main() -> std::io::Result<()> {
let mut buffer = String::new();
io::stdin().read_to_string(&mut buffer)?;
print!("[ ");
for c in buffer.chars() {
print!("{} ", u32::from(c));
}
print!("]");
Ok(())
}
'';
# Decode the string `s` with the Rust reference decoder.
#
# A local derivation pipes `s` (shell-escaped, written with printf so no
# trailing newline is added) through rustDecoder and stores the printed
# list in $out. Importing that output file yields the list as a Nix value.
# The derivation name is built from `s` itself.
rustDecode = s:
  import (runCommandLocal "${s}-decoded" { } ''
    printf '%s' ${lib.escapeShellArg s} | ${rustDecoder} > $out
  '');
# Run utf8.decode on a byte sequence given as a list of hex strings,
# e.g. [ "F0" "9F" ]. Each element is converted with int.fromHex, the
# resulting list is packed into a string via string.fromBytes, and that
# string is handed to the decoder.
hexDecode = hexBytes:
  let
    rawBytes = builtins.map int.fromHex hexBytes;
  in
  utf8.decode (string.fromBytes rawBytes);
# Inputs that utf8.decode is expected to reject, followed by one sequence
# near the upper end of the code space that it must still accept.
testFailures = it "checks UTF-8 decoding failures" [
  # Zero-length input is expected to throw.
  (assertThrows "empty bytestring throws" (utf8.decode ""))
  # F0 announces a four byte sequence, but the input ends after one
  # continuation byte.
  (assertThrows "truncated UTF-8 string throws" (hexDecode [ "F0" "9F" ]))
  # examples from The Unicode Standard
  # C0 can never start a well-formed sequence (overlong two byte form).
  (assertThrows "ill-formed: C0 AF" (hexDecode [ "C0" "AF" ]))
  # After E0 the second byte must be in A0..BF; 9F would be an overlong form.
  (assertThrows "ill-formed: E0 9F 80" (hexDecode [ "E0" "9F" "80" ]))
  # A valid four byte sequence; 1048786 is 0x1000D2.
  (assertEq "well-formed: F4 80 83 92" (hexDecode [ "F4" "80" "83" "92" ]) [ 1048786 ])
];
# For plain ASCII input, decoding must give the same list as taking the
# string's bytes directly, so each sample compares string.toBytes with
# utf8.decode.
testAscii =
  let
    asciiSamples = [
      "foo bar"
      "hello\nworld"
      "carriage\r\nreturn"
      "1238398494829304 []<><>({})[]!!)"
      # First 127 characters of char.allChars.
      # NOTE(review): presumably exactly the non-NUL ASCII range -- confirm
      # against the definition of char.allChars.
      (string.take 127 char.allChars)
    ];

    bytesMatchDecoding = sample:
      assertEq
        "ASCII decoding is equal to UTF-8 decoding for \"${sample}\""
        (string.toBytes sample)
        (utf8.decode sample);
  in
  it "checks decoding of ascii strings"
    (builtins.map bytesMatchDecoding asciiSamples);
# Assorted non-ASCII samples that testDecoding checks against the Rust
# reference decoder.
#
# Every entry must be non-empty: testFailures asserts that utf8.decode
# throws on "", so an empty sample would abort testDecoding. Entries that
# had lost their non-Latin letters (leaving an empty string, a lone space,
# or bare combining vowel signs) were re-filled with the matching verse
# lines from the page cited below.
# NOTE(review): the restored lines were re-typed from that page; confirm the
# exact wording against it. Any well-formed text exercises the decoder
# equally, since the expected value is computed by Rust.
randomUnicode = [
  "🥰👨👨👧👦🐈👩🏽🦰"
  # https://kermitproject.org/utf8.html
  "ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ"
  "An preost wes on leoden, Laȝamon was ihoten"
  "Sîne klâwen durh die wolken sint geslagen,"
  "Τὴ γλῶσσα μοῦ ἔδωσαν ἑλληνικὴ"
  "На берегу пустынных волн"
  "ვეპხის ტყაოსანი შოთა რუსთაველი"
  "யாமறிந்த மொழிகளிலே தமிழ்மொழி போல் இனிதாவது எங்கும் காணோம்,"
  "ಬಾ ಇಲ್ಲಿ ಸಂಭವಿಸು ಇಂದೆನ್ನ ಹೃದಯದಲಿ"
];
# "I can eat glass" sentences, one per language/script, each prefixed with
# an ASCII label. testDecoding checks them against the Rust reference
# decoder.
#
# The Euro, Georgian, Hindi, Japanese and Thai entries had been reduced to
# their label and punctuation, so they no longer covered the scripts they
# name; their text was re-filled from the page cited below.
# NOTE(review): the restored sentences were re-typed from that page; confirm
# the exact wording against it.
# https://kermitproject.org/utf8.html
glassSentences = [
  "Euro Symbol: €."
  "Greek: Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα."
  "Íslenska / Icelandic: Ég get etið gler án þess að meiða mig."
  "Polish: Mogę jeść szkło, i mi nie szkodzi."
  "Romanian: Pot să mănânc sticlă și ea nu mă rănește."
  "Ukrainian: Я можу їсти шкло, й воно мені не пошкодить."
  "Armenian: Կրնամ ապակի ուտել և ինծի անհանգիստ չըներ։"
  "Georgian: მინას ვჭამ და არა მტკივა."
  "Hindi: मैं काँच खा सकता हूँ, मुझे उस से कोई पीडा नहीं होती."
  "Hebrew(2): אני יכול לאכול זכוכית וזה לא מזיק לי."
  "Yiddish(2): איך קען עסן גלאָז און עס טוט מיר נישט װײ."
  "Arabic(2): أنا قادر على أكل الزجاج و هذا لا يؤلمني."
  "Japanese: 私はガラスを食べられます。それは私を傷つけません。"
  "Thai: ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ"
];
# Cross-check: for every sample string, the pure-nix decoder must produce
# exactly the list that the Rust reference decoder prints for it.
testDecoding =
  let
    # Both inputs are flat lists of strings, so plain concatenation gives
    # the same list, in the same order, as flattening the pair.
    samples = glassSentences ++ randomUnicode;

    agreesWithRust = sample:
      assertEq "Decoding of ${sample} is correct"
        (utf8.decode sample)
        (rustDecode sample);
  in
  it "checks decoding of UTF-8 strings against Rust's String"
    (builtins.map agreesWithRust samples);
in
# Result of this file: the three test groups defined above, handed to
# runTestsuite under the suite name "nix.utf8".
runTestsuite "nix.utf8" [
testFailures
testAscii
testDecoding
]