2021-04-10 11:27:01 +02:00
|
|
|
|
{ depot, pkgs, lib, ... }:
|
2021-03-04 22:22:50 +01:00
|
|
|
|
|
|
|
|
|
let
|
|
|
|
|
|
2021-04-10 11:27:01 +02:00
|
|
|
|
inherit (pkgs)
|
2021-03-04 22:22:50 +01:00
|
|
|
|
runCommandLocal
|
|
|
|
|
;
|
|
|
|
|
|
|
|
|
|
inherit (depot.nix.runTestsuite)
|
|
|
|
|
runTestsuite
|
|
|
|
|
it
|
|
|
|
|
assertEq
|
|
|
|
|
assertThrows
|
|
|
|
|
assertDoesNotThrow
|
|
|
|
|
;
|
|
|
|
|
|
2021-04-23 22:04:42 +02:00
|
|
|
|
inherit (depot.nix.writers)
|
2021-03-04 22:22:50 +01:00
|
|
|
|
rustSimple
|
|
|
|
|
;
|
|
|
|
|
|
|
|
|
|
inherit (depot.users.sterni.nix)
|
|
|
|
|
int
|
|
|
|
|
utf8
|
|
|
|
|
string
|
|
|
|
|
char
|
|
|
|
|
;
|
|
|
|
|
|
2022-01-30 17:06:58 +01:00
|
|
|
|
# Reference decoder used as ground truth by the tests below.
#
# The Rust source is handed to rustSimple under the name "utf8-decode".
# The program reads all of stdin into a String (which fails on input that
# is not valid UTF-8) and prints the code point of every char as a
# space-separated sequence wrapped in `[ ` … `]`, so that the output can be
# imported as a Nix list of integers.
rustDecoder =
  rustSimple { name = "utf8-decode"; } ''
    use std::io::{self, Read};
    fn main() -> std::io::Result<()> {
      let mut buffer = String::new();
      io::stdin().read_to_string(&mut buffer)?;

      print!("[ ");

      for c in buffer.chars() {
        print!("{} ", u32::from(c));
      }

      print!("]");

      Ok(())
    }
  '';
|
|
|
|
|
|
|
|
|
|
# Decode the string `s` with the Rust reference decoder.
#
# A local derivation pipes `s` (shell-escaped, emitted without a trailing
# newline via printf '%s') into rustDecoder and stores the program's output
# in $out; that file is then imported, so the result is whatever Nix value
# the decoder printed.
rustDecode = s:
  import (
    runCommandLocal "${s}-decoded" { } ''
      printf '%s' ${lib.escapeShellArg s} | ${rustDecoder} > $out
    ''
  );
|
2021-03-04 22:22:50 +01:00
|
|
|
|
|
|
|
|
|
# Decode a byte sequence given as a list of hex strings, e.g. [ "F0" "9F" ]:
# every element is converted with int.fromHex, the resulting bytes are turned
# into a string and that string is passed to utf8.decode.
hexDecode = l:
  let
    bytes = builtins.map int.fromHex l;
  in
  utf8.decode (string.fromBytes bytes);
|
|
|
|
|
|
2021-11-23 19:58:15 +01:00
|
|
|
|
# Encode a list of code points given as hex strings, e.g. [ "FFFFFF" ]:
# every element is converted with int.fromHex before calling utf8.encode.
hexEncode = l:
  let
    codepoints = builtins.map int.fromHex l;
  in
  utf8.encode codepoints;
|
|
|
|
|
|
|
|
|
|
# Failure cases for the UTF-8 library: byte sequences that utf8.decode must
# reject, code points that utf8.encode must reject, plus one well-formed
# four byte sequence from the same Unicode Standard example table as a
# sanity check.
testFailures = it "checks UTF-8 decoding failures" ([
  (assertThrows "truncated UTF-8 string throws" (hexDecode [ "F0" "9F" ]))
  # examples from The Unicode Standard
  (assertThrows "ill-formed: C0 AF" (hexDecode [ "C0" "AF" ]))
  (assertThrows "ill-formed: E0 9F 80" (hexDecode [ "E0" "9F" "80" ]))
  (assertEq "well-formed: F4 80 83 92" (hexDecode [ "F4" "80" "83" "92" ]) [ 1048786 ])
  (assertThrows "Codepoint out of range: 0xFFFFFF" (hexEncode [ "FFFFFF" ]))
  (assertThrows "Codepoint out of range: -0x02" (hexEncode [ "-02" ]))
] ++ builtins.genList
  # One assertion per UTF-16 surrogate code point, starting at U+D800.
  (i:
    let
      cp = i + int.fromHex "D800";
    in
    assertThrows "Can't encode UTF-16 reserved characters: ${utf8.formatCodepoint cp}"
      (utf8.encode [ cp ])
  )
  # The surrogate block U+D800..U+DFFF holds 0x800 code points. genList
  # produces the indices 0 .. n - 1, so n has to be 0x800 (not 0x7FF) for the
  # last surrogate, U+DFFF, to be covered as well.
  (int.fromHex "0800"));
|
2021-03-04 22:22:50 +01:00
|
|
|
|
|
|
|
|
|
# For pure ASCII input the decoded code points must equal the raw bytes of
# the string, so utf8.decode is compared against string.toBytes.
testAscii =
  let
    asciiSamples = [
      "foo bar"
      "hello\nworld"
      "carriage\r\nreturn"
      "1238398494829304 []<><>({})[]!!)"
      # first 127 characters of char.allChars
      (string.take 127 char.allChars)
    ];

    bytesMatchDecoding = sample:
      assertEq "ASCII decoding is equal to UTF-8 decoding for \"${sample}\""
        (string.toBytes sample)
        (utf8.decode sample);
  in
  it "checks decoding of ascii strings"
    (builtins.map bytesMatchDecoding asciiSamples);
|
2021-03-04 22:22:50 +01:00
|
|
|
|
|
|
|
|
|
# Assorted non-ASCII sample strings (emoji and several scripts) used as
# input for the decoding and round-trip tests.
randomUnicode = [
  # the empty string must decode to the empty list
  ""
  "🥰👨👨👧👦🐈⬛👩🏽🦰"
  # the remaining samples are taken from https://kermitproject.org/utf8.html
  "ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ"
  "An preost wes on leoden, Laȝamon was ihoten"
  "Sîne klâwen durh die wolken sint geslagen,"
  "Τὴ γλῶσσα μοῦ ἔδωσαν ἑλληνικὴ"
  "На берегу пустынных волн"
  "ვეპხის ტყაოსანი შოთა რუსთაველი"
  "யாமறிந்த மொழிகளிலே தமிழ்மொழி போல் இனிதாவது எங்கும் காணோம், "
  "ಬಾ ಇಲ್ಲಿ ಸಂಭವಿಸು "
];
|
|
|
|
|
|
|
|
|
|
# Sample sentences in a variety of languages and scripts, used as input for
# the decoding and round-trip tests.
# Source: https://kermitproject.org/utf8.html
glassSentences = [
  "Euro Symbol: €."
  "Greek: Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα."
  "Íslenska / Icelandic: Ég get etið gler án þess að meiða mig."
  "Polish: Mogę jeść szkło, i mi nie szkodzi."
  "Romanian: Pot să mănânc sticlă și ea nu mă rănește."
  "Ukrainian: Я можу їсти шкло, й воно мені не пошкодить."
  "Armenian: Կրնամ ապակի ուտել և ինծի անհանգիստ չըներ։"
  "Georgian: მინას ვჭამ და არა მტკივა."
  "Hindi: मैं काँच खा सकता हूँ, मुझे उस से कोई पीडा नहीं होती."
  "Hebrew(2): אני יכול לאכול זכוכית וזה לא מזיק לי."
  "Yiddish(2): איך קען עסן גלאָז און עס טוט מיר נישט װײ."
  "Arabic(2): أنا قادر على أكل الزجاج و هذا لا يؤلمني."
  "Japanese: 私はガラスを食べられます。それは私を傷つけません。"
  "Thai: ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ "
];
|
|
|
|
|
|
|
|
|
|
# Cross-check utf8.decode against the Rust reference decoder: for every
# sample string both must yield the same list of code points.
testDecoding =
  let
    # both lists are flat lists of strings, so concatenation joins them
    samples = glassSentences ++ randomUnicode;

    agreesWithRust = sample:
      assertEq "Decoding of “${sample}” is correct"
        (utf8.decode sample)
        (rustDecode sample);
  in
  it "checks decoding of UTF-8 strings against Rust's String"
    (builtins.map agreesWithRust samples);
|
|
|
|
|
|
2021-11-23 19:23:54 +01:00
|
|
|
|
# Round-trip property: decoding a sample string and encoding the resulting
# code points again must reproduce the original string.
testDecodingEncoding =
  let
    # both lists are flat lists of strings, so concatenation joins them
    samples = glassSentences ++ randomUnicode;

    roundTrips = sample:
      assertEq "Decoding and then encoding “${sample}” yields itself"
        (utf8.encode (utf8.decode sample))
        sample;
  in
  it "checks that decoding and then encoding forms an identity"
    (builtins.map roundTrips samples);
|
|
|
|
|
|
2021-03-04 22:22:50 +01:00
|
|
|
|
in
|
2022-01-30 17:06:58 +01:00
|
|
|
|
# Result of this file: all test groups defined above, run as the suite
# named "nix.utf8".
runTestsuite "nix.utf8" [
  testFailures
  testAscii
  testDecoding
  testDecodingEncoding
]
|