tvl-depot/users/sterni/nix/utf8/tests/default.nix
sterni 87a0aaa77d feat(sterni/nix/utf8): implement UTF-8 encoding
This implementation is still a bit rough: it doesn't check whether the
produced string is valid UTF-8, so invalid output may be produced if an
invalid Unicode codepoint is passed.

Change-Id: Ibaa91dafa8937142ef704a175efe967b62e3ee7b
2021-11-25 12:15:35 +01:00

131 lines
4.3 KiB
Nix
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{ depot, pkgs, lib, ... }:
let
inherit (pkgs)
runCommandLocal
;
inherit (depot.nix.runTestsuite)
runTestsuite
it
assertEq
assertThrows
assertDoesNotThrow
;
inherit (depot.nix.writers)
rustSimple
;
inherit (depot.users.sterni.nix)
int
utf8
string
char
;
# Reference decoder, built from the Rust source below via rustSimple.
#
# The program reads all of stdin into a Rust `String`, then prints "[ ",
# the numeric (u32) value of every `char` followed by a space, and finally
# "]". The output is consumed by rustDecode, which `import`s it as a Nix
# expression. An error from reading stdin is propagated out of `main` via `?`.
rustDecoder = rustSimple {
name = "utf8-decode";
} ''
use std::io::{self, Read};
fn main() -> std::io::Result<()> {
let mut buffer = String::new();
io::stdin().read_to_string(&mut buffer)?;
print!("[ ");
for c in buffer.chars() {
print!("{} ", u32::from(c));
}
print!("]");
Ok(())
}
'';
# Decode the string `s` with the reference Rust decoder (rustDecoder) and
# return the result as a Nix value.
#
# `s` is shell-escaped and piped unchanged (printf '%s') into rustDecoder
# inside a local build. The build output is then `import`ed, so calling this
# function performs an import-from-derivation.
rustDecode = s:
  let
    # Store path names only admit a restricted set of ASCII characters and
    # are limited in length, whereas the inputs passed here contain spaces,
    # punctuation and non-ASCII text. Name the derivation after a hash of
    # the input instead of the raw input: the name stays valid for every
    # `s` and is still unique per input.
    name = "utf8-${builtins.hashString "sha256" s}-decoded";

    expr = runCommandLocal name { } ''
      printf '%s' ${lib.escapeShellArg s} | ${rustDecoder} > $out
    '';
  in
  import expr;
# Decode a UTF-8 sequence given as a list of hex strings, one per byte
# (e.g. [ "F0" "9F" ]): each entry goes through int.fromHex, the resulting
# list through string.fromBytes, and that string through utf8.decode.
hexDecode = l:
  let
    byteValues = builtins.map int.fromHex l;
    rawString = string.fromBytes byteValues;
  in
  utf8.decode rawString;
# Decoder behaviour on raw byte sequences: three inputs that must make
# decoding throw, plus one well-formed sequence with a known code point.
testFailures =
  let
    # Inputs are bound lazily, so nothing is decoded (or thrown) until the
    # respective assertion forces its argument.
    truncated = hexDecode [ "F0" "9F" ];
    illFormedC0AF = hexDecode [ "C0" "AF" ];
    illFormedE09F80 = hexDecode [ "E0" "9F" "80" ];
    wellFormedF4808392 = hexDecode [ "F4" "80" "83" "92" ];

    cases = [
      (assertThrows "truncated UTF-8 string throws" truncated)
      # examples from The Unicode Standard
      (assertThrows "ill-formed: C0 AF" illFormedC0AF)
      (assertThrows "ill-formed: E0 9F 80" illFormedE09F80)
      (assertEq "well-formed: F4 80 83 92" wellFormedF4808392 [ 1048786 ])
    ];
  in
  it "checks UTF-8 decoding failures" cases;
# For each sample string, utf8.decode must return the same list as
# string.toBytes.
testAscii =
  let
    samples = [
      "foo bar"
      "hello\nworld"
      "carriage\r\nreturn"
      "1238398494829304 []<><>({})[]!!)"
      # NOTE(review): presumably the leading 127 characters of the char
      # table; confirm against char.allChars and string.take.
      (string.take 127 char.allChars)
    ];

    bytesMatchDecode = s:
      assertEq "ASCII decoding is equal to UTF-8 decoding for \"${s}\""
        (string.toBytes s)
        (utf8.decode s);
  in
  it "checks decoding of ascii strings" (builtins.map bytesMatchDecode samples);
# Assorted sample strings fed to testDecoding and testDecodingEncoding.
#
# NOTE(review): some entries look as if characters were lost in transit
# (e.g. the empty string directly below the kermitproject link, and entries
# that consist mostly of spaces or lone combining marks). Verify these
# literals against the upstream file before relying on them.
randomUnicode = [
"" # empty string should yield empty list
"🥰👨👨👧👦🐈👩🏽🦰"
# https://kermitproject.org/utf8.html
""
"An preost wes on leoden, Laȝamon was ihoten"
"Sîne klâwen durh die wolken sint geslagen,"
"Τ γλσσα μο δωσαν λληνικ"
"На берегу пустынных волн"
" "
"ி ிி ிி ி , "
" ಿ ಿ "
];
# Sample sentences, each prefixed with a label naming its language or
# symbol; used as input by testDecoding and testDecodingEncoding.
#
# NOTE(review): several entries carry only the label with no visible payload
# (Euro Symbol, Georgian, Hindi, Japanese, Thai). Presumably the payload
# characters were lost in transit; verify against the upstream file.
# https://kermitproject.org/utf8.html
glassSentences = [
"Euro Symbol: ."
"Greek: Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα."
"Íslenska / Icelandic: Ég get etið gler án þess að meiða mig."
"Polish: Mogę jeść szkło, i mi nie szkodzi."
"Romanian: Pot să mănânc sticlă și ea nu mă rănește."
"Ukrainian: Я можу їсти шкло, й воно мені не пошкодить."
"Armenian: Կրնամ ապակի ուտել և ինծի անհանգիստ չըներ։"
"Georgian: ."
"Hindi: , ."
"Hebrew(2): אני יכול לאכול זכוכית וזה לא מזיק לי."
"Yiddish(2): איך קען עסן גלאָז און עס טוט מיר נישט װײ."
"Arabic(2): أنا قادر على أكل الزجاج و هذا لا يؤلمني."
"Japanese: "
"Thai: "
];
# Cross-check: for every sample string, utf8.decode must agree with the
# result obtained from the Rust reference decoder (rustDecode).
testDecoding =
  let
    # Both lists are flat lists of strings, so plain concatenation yields
    # the same sequence as flattening a list of the two.
    samples = glassSentences ++ randomUnicode;

    agreesWithRust = s:
      assertEq "Decoding of ${s} is correct" (utf8.decode s) (rustDecode s);
  in
  it "checks decoding of UTF-8 strings against Rust's String"
    (builtins.map agreesWithRust samples);
# Round trip: decoding a sample string and encoding the result again must
# give back the original string.
testDecodingEncoding =
  let
    # Both lists are flat lists of strings, so plain concatenation yields
    # the same sequence as flattening a list of the two.
    samples = glassSentences ++ randomUnicode;

    roundTrips = s:
      let
        reencoded = utf8.encode (utf8.decode s);
      in
      assertEq "Decoding and then encoding ${s} yields itself" reencoded s;
  in
  it "checks that decoding and then encoding forms an identity"
    (builtins.map roundTrips samples);
in
# Value of this file: the "nix.utf8" test suite made up of the four groups
# defined above, in this order.
runTestsuite "nix.utf8" [ testFailures testAscii testDecoding testDecodingEncoding ]