750ef6c693
* Enforce the U+0000 to U+10FFFF range in `count` and throw an error if the given codepoint exceeds the range (encoding U+0000 won't work of course, but this is Nix's fault…). * Check if the produced bytes are well formed and output an error if not. This indicates that the codepoint can't be encoded as UTF-8, like U+D800 which is reserved for UTF-16. Change-Id: I18336e527484580f28cbfe784d51718ee15c5477
141 lines
4.8 KiB
Nix
141 lines
4.8 KiB
Nix
{ depot, pkgs, lib, ... }:
|
||
|
||
let
|
||
|
||
inherit (pkgs)
|
||
runCommandLocal
|
||
;
|
||
|
||
inherit (depot.nix.runTestsuite)
|
||
runTestsuite
|
||
it
|
||
assertEq
|
||
assertThrows
|
||
assertDoesNotThrow
|
||
;
|
||
|
||
inherit (depot.nix.writers)
|
||
rustSimple
|
||
;
|
||
|
||
inherit (depot.users.sterni.nix)
|
||
int
|
||
utf8
|
||
string
|
||
char
|
||
;
|
||
|
||
# Reference decoder, built from the inline Rust program below.
#
# The program reads all of stdin into a Rust `String`, then prints the
# numeric value of every `char` separated by spaces and wrapped in `[ … ]`,
# i.e. text shaped like a Nix list of integers.
rustDecoder = rustSimple { name = "utf8-decode"; } ''
  use std::io::{self, Read};
  fn main() -> std::io::Result<()> {
    let mut buffer = String::new();
    io::stdin().read_to_string(&mut buffer)?;

    print!("[ ");

    for c in buffer.chars() {
      print!("{} ", u32::from(c));
    }

    print!("]");

    Ok(())
  }
'';
|
||
|
||
# Decode the string `s` by piping it through `rustDecoder` inside a local
# build and importing the file that build produces (the decoder prints a
# Nix list of integers).
#
# The derivation name is built from a SHA-256 hash of `s` rather than from
# `s` itself: store path names only admit a restricted set of ASCII
# characters, whereas the inputs used by the tests contain spaces,
# punctuation, non-ASCII text and the empty string.
rustDecode = s:
  let
    # Always a valid store name, and still distinct per input.
    drvName = "utf8-decoded-${builtins.hashString "sha256" s}";
    expr = runCommandLocal drvName { } ''
      printf '%s' ${lib.escapeShellArg s} | ${rustDecoder} > $out
    '';
  in
  import expr;
|
||
|
||
# Takes a list of hex strings, maps `int.fromHex` over it, turns the
# resulting list into a string with `string.fromBytes` and hands that
# string to `utf8.decode`.
hexDecode = l:
  let
    rawBytes = builtins.map int.fromHex l;
    encoded = string.fromBytes rawBytes;
  in
  utf8.decode encoded;
|
||
|
||
# Takes a list of hex strings, maps `int.fromHex` over it and passes the
# resulting list to `utf8.encode`.
hexEncode = l:
  let
    codepoints = builtins.map int.fromHex l;
  in
  utf8.encode codepoints;
|
||
|
||
# Error-path tests: inputs that decoding or encoding must reject, plus one
# well-formed control sequence that must still decode.
testFailures = it "checks UTF-8 decoding failures" ([
  (assertThrows "truncated UTF-8 string throws" (hexDecode [ "F0" "9F" ]))
  # examples from The Unicode Standard
  (assertThrows "ill-formed: C0 AF" (hexDecode [ "C0" "AF" ]))
  (assertThrows "ill-formed: E0 9F 80" (hexDecode [ "E0" "9F" "80" ]))
  (assertEq "well-formed: F4 80 83 92" (hexDecode [ "F4" "80" "83" "92" ]) [ 1048786 ])
  (assertThrows "Codepoint out of range: 0xFFFFFF" (hexEncode [ "FFFFFF" ]))
  (assertThrows "Codepoint out of range: -0x02" (hexEncode [ "-02" ]))
] ++ builtins.genList
  (i:
    let
      cp = i + int.fromHex "D800";
    in
    assertThrows "Can't encode UTF-16 reserved characters: ${utf8.formatCodepoint cp}"
      (utf8.encode [ cp ])
  )
  # One test per surrogate code point. genList produces the offsets
  # 0 .. n - 1, so covering U+D800 through U+DFFF inclusive needs
  # n = 0x800; a count of 0x7FF would stop at U+DFFE.
  (int.fromHex "0800"));
|
||
|
||
# For every sample string, asserts that `string.toBytes` and `utf8.decode`
# yield the same list.
testAscii =
  let
    asciiSamples = [
      "foo bar"
      "hello\nworld"
      "carriage\r\nreturn"
      "1238398494829304 []<><>({})[]!!)"
      (string.take 127 char.allChars)
    ];

    # Builds the assertion for a single sample.
    sameAsBytes = s:
      assertEq "ASCII decoding is equal to UTF-8 decoding for \"${s}\""
        (string.toBytes s)
        (utf8.decode s);
  in
  it "checks decoding of ascii strings" (builtins.map sameAsBytes asciiSamples);
|
||
|
||
# Assorted UTF-8 sample strings (emoji and several scripts) used as input
# for the decoding tests.
randomUnicode = [
  # edge case: the empty string is expected to produce an empty list
  ""
  "🥰👨👨👧👦🐈⬛👩🏽🦰"
  # the remaining samples are from https://kermitproject.org/utf8.html
  "ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ"
  "An preost wes on leoden, Laȝamon was ihoten"
  "Sîne klâwen durh die wolken sint geslagen,"
  "Τὴ γλῶσσα μοῦ ἔδωσαν ἑλληνικὴ"
  "На берегу пустынных волн"
  "ვეპხის ტყაოსანი შოთა რუსთაველი"
  "யாமறிந்த மொழிகளிலே தமிழ்மொழி போல் இனிதாவது எங்கும் காணோம், "
  "ಬಾ ಇಲ್ಲಿ ಸಂಭವಿಸು "
];
|
||
|
||
# Sample sentences in various languages and scripts, each prefixed with a
# label, used as input for the decoding tests.
# Source: https://kermitproject.org/utf8.html
glassSentences = [
  "Euro Symbol: €."
  "Greek: Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα."
  "Íslenska / Icelandic: Ég get etið gler án þess að meiða mig."
  "Polish: Mogę jeść szkło, i mi nie szkodzi."
  "Romanian: Pot să mănânc sticlă și ea nu mă rănește."
  "Ukrainian: Я можу їсти шкло, й воно мені не пошкодить."
  "Armenian: Կրնամ ապակի ուտել և ինծի անհանգիստ չըներ։"
  "Georgian: მინას ვჭამ და არა მტკივა."
  "Hindi: मैं काँच खा सकता हूँ, मुझे उस से कोई पीडा नहीं होती."
  "Hebrew(2): אני יכול לאכול זכוכית וזה לא מזיק לי."
  "Yiddish(2): איך קען עסן גלאָז און עס טוט מיר נישט װײ."
  "Arabic(2): أنا قادر على أكل الزجاج و هذا لا يؤلمني."
  "Japanese: 私はガラスを食べられます。それは私を傷つけません。"
  "Thai: ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ "
];
|
||
|
||
# Cross-checks `utf8.decode` against the Rust-based `rustDecode` for every
# sample string in `glassSentences` and `randomUnicode`.
testDecoding =
  let
    # Both inputs are flat lists of strings, so plain concatenation gives
    # the same list as flattening a list of the two.
    samples = glassSentences ++ randomUnicode;

    matchesRust = s:
      assertEq "Decoding of “${s}” is correct" (utf8.decode s) (rustDecode s);
  in
  it "checks decoding of UTF-8 strings against Rust's String"
    (builtins.map matchesRust samples);
|
||
|
||
# Round-trip check: for every sample string in `glassSentences` and
# `randomUnicode`, encoding the decoded value must give back the input.
testDecodingEncoding =
  let
    # Both inputs are flat lists of strings, so plain concatenation gives
    # the same list as flattening a list of the two.
    samples = glassSentences ++ randomUnicode;

    roundTrips = s:
      assertEq "Decoding and then encoding “${s}” yields itself"
        (utf8.encode (utf8.decode s))
        s;
  in
  it "checks that decoding and then encoding forms an identity"
    (builtins.map roundTrips samples);
|
||
|
||
in

# Result of this file: the "nix.utf8" test suite made up of the four test
# groups defined in the let-block above.
runTestsuite "nix.utf8" [
  testFailures
  testAscii
  testDecoding
  testDecodingEncoding
]
|