tvl-depot/users/sterni/nix/utf8/tests/default.nix
Vincent Ambo aa122cbae7 style: format entire depot with nixpkgs-fmt
This CL can be used to compare the style of nixpkgs-fmt against other
formatters (nixpkgs, alejandra).

Change-Id: I87c6abff6bcb546b02ead15ad0405f81e01b6d9e
Reviewed-on: https://cl.tvl.fyi/c/depot/+/4397
Tested-by: BuildkiteCI
Reviewed-by: sterni <sternenseemann@systemli.org>
Reviewed-by: lukegb <lukegb@tvl.fyi>
Reviewed-by: wpcarro <wpcarro@gmail.com>
Reviewed-by: Profpatsch <mail@profpatsch.de>
Reviewed-by: kanepyork <rikingcoding@gmail.com>
Reviewed-by: tazjin <tazjin@tvl.su>
Reviewed-by: cynthia <cynthia@tvl.fyi>
Reviewed-by: edef <edef@edef.eu>
Reviewed-by: eta <tvl@eta.st>
Reviewed-by: grfn <grfn@gws.fyi>
2022-01-31 16:11:53 +00:00

148 lines
4.8 KiB
Nix
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{ depot, pkgs, lib, ... }:
let
inherit (pkgs)
runCommandLocal
;
inherit (depot.nix.runTestsuite)
runTestsuite
it
assertEq
assertThrows
assertDoesNotThrow
;
inherit (depot.nix.writers)
rustSimple
;
inherit (depot.users.sterni.nix)
int
utf8
string
char
;
# Reference decoder built with rustSimple under the name "utf8-decode".
# The program reads all of stdin into a Rust String and prints, for every
# char, its u32 value followed by a space, wrapped in "[ " and "]".
# rustDecode imports that output as a Nix expression.
rustDecoder =
  let
    # Rust source of the decoder, kept as one literal so the call below
    # stays short.
    decoderSource = ''
      use std::io::{self, Read};
      fn main() -> std::io::Result<()> {
      let mut buffer = String::new();
      io::stdin().read_to_string(&mut buffer)?;
      print!("[ ");
      for c in buffer.chars() {
      print!("{} ", u32::from(c));
      }
      print!("]");
      Ok(())
      }
    '';
  in
  rustSimple { name = "utf8-decode"; } decoderSource;
# Decode the string `s` with the Rust reference decoder.
#
# A local derivation pipes `s` (shell-escaped, emitted via printf '%s' so
# that no trailing newline is added) into rustDecoder and stores the
# program's stdout as $out; that file is then imported as a Nix expression.
#
# NOTE(review): the derivation name is built from `s` itself; presumably
# the runCommandLocal machinery copes with arbitrary characters in the
# name - confirm before feeding it new kinds of input.
rustDecode = s:
  import (runCommandLocal "${s}-decoded" { } ''
    printf '%s' ${lib.escapeShellArg s} | ${rustDecoder} > $out
  '');
# Run utf8.decode on a byte sequence given as a list of hex strings,
# e.g. [ "F0" "9F" ]. Each element is parsed with int.fromHex and the
# resulting list is turned into a string via string.fromBytes first.
hexDecode = hexBytes:
  let
    bytes = builtins.map int.fromHex hexBytes;
  in
  utf8.decode (string.fromBytes bytes);
# Run utf8.encode on a list of hex strings. Each element is parsed with
# int.fromHex and the resulting integers are handed to utf8.encode as-is
# (the tests in this file use them as codepoints).
hexEncode = hexCodepoints:
  let
    codepoints = builtins.map int.fromHex hexCodepoints;
  in
  utf8.encode codepoints;
# Test group for inputs the decoder or encoder must reject, plus one
# well-formed four-byte sequence as a positive control.
#
# The first part is a fixed list of hand-written cases. The second part is
# generated: one assertThrows per code point of the UTF-16 surrogate block,
# each checking that utf8.encode refuses to encode it.
testFailures = it "checks UTF-8 decoding failures" ([
  (assertThrows "truncated UTF-8 string throws" (hexDecode [ "F0" "9F" ]))
  # examples from The Unicode Standard
  (assertThrows "ill-formed: C0 AF" (hexDecode [ "C0" "AF" ]))
  (assertThrows "ill-formed: E0 9F 80" (hexDecode [ "E0" "9F" "80" ]))
  (assertEq "well-formed: F4 80 83 92" (hexDecode [ "F4" "80" "83" "92" ]) [ 1048786 ])
  (assertThrows "Codepoint out of range: 0xFFFFFF" (hexEncode [ "FFFFFF" ]))
  (assertThrows "Codepoint out of range: -0x02" (hexEncode [ "-02" ]))
] ++ builtins.genList
  (i:
    let
      # i counts from 0, so cp walks upwards starting at U+D800.
      cp = i + int.fromHex "D800";
    in
    assertThrows "Can't encode UTF-16 reserved characters: ${utf8.formatCodepoint cp}"
      (utf8.encode [ cp ])
  )
  # The surrogate block U+D800..U+DFFF contains 0x800 code points. A count
  # of 0x7FF would stop at U+DFFE and leave U+DFFF untested.
  (int.fromHex "0800"));
# Test group asserting that, for each sample string, string.toBytes and
# utf8.decode produce the same list.
testAscii =
  let
    asciiSamples = [
      "foo bar"
      "hello\nworld"
      "carriage\r\nreturn"
      "1238398494829304 []<><>({})[]!!)"
      # First 127 entries of char.allChars - presumably the 7-bit ASCII
      # range; confirm against the definition of char.allChars.
      (string.take 127 char.allChars)
    ];

    # One assertion per sample; the sample itself is quoted in the label.
    bytesMatchCodepoints = s:
      assertEq "ASCII decoding is equal to UTF-8 decoding for \"${s}\""
        (string.toBytes s)
        (utf8.decode s);
  in
  it "checks decoding of ascii strings"
    (builtins.map bytesMatchCodepoints asciiSamples);
# Assorted UTF-8 sample strings (emoji plus text in several scripts) that
# testDecoding and testDecodingEncoding use as inputs.
#
# NOTE(review): some literals below appear empty or mostly blank in this
# view; they may contain characters that are invisible here or that were
# lost on the way. Do not retype them - verify against the upstream file
# before editing.
randomUnicode = [
"" # empty string should yield empty list
"🥰👨👨👧👦🐈👩🏽🦰"
# https://kermitproject.org/utf8.html
""
"An preost wes on leoden, Laȝamon was ihoten"
"Sîne klâwen durh die wolken sint geslagen,"
"Τ γλσσα μο δωσαν λληνικ"
"На берегу пустынных волн"
" "
"ி ிி ிி ி , "
" ಿ ಿ "
];
# Sample sentences in various languages and scripts, each prefixed with a
# label naming the language; used as inputs by testDecoding and
# testDecodingEncoding.
#
# NOTE(review): a few entries show only the label and punctuation in this
# view (e.g. "Euro Symbol", "Georgian", "Japanese"); the non-Latin text
# may have been lost or be invisible here. Verify against the upstream
# file before editing these literals.
# https://kermitproject.org/utf8.html
glassSentences = [
"Euro Symbol: ."
"Greek: Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα."
"Íslenska / Icelandic: Ég get etið gler án þess að meiða mig."
"Polish: Mogę jeść szkło, i mi nie szkodzi."
"Romanian: Pot să mănânc sticlă și ea nu mă rănește."
"Ukrainian: Я можу їсти шкло, й воно мені не пошкодить."
"Armenian: Կրնամ ապակի ուտել և ինծի անհանգիստ չըներ։"
"Georgian: ."
"Hindi: , ."
"Hebrew(2): אני יכול לאכול זכוכית וזה לא מזיק לי."
"Yiddish(2): איך קען עסן גלאָז און עס טוט מיר נישט װײ."
"Arabic(2): أنا قادر على أكل الزجاج و هذا لا يؤلمني."
"Japanese: "
"Thai: "
];
# Test group comparing utf8.decode with the Rust reference decoder
# (rustDecode) on every string from glassSentences and randomUnicode.
testDecoding =
  let
    # Both inputs are flat lists of strings, so plain concatenation gives
    # the same list as flattening the pair.
    samples = glassSentences ++ randomUnicode;

    matchesRust = s:
      assertEq "Decoding of ${s} is correct" (utf8.decode s) (rustDecode s);
  in
  it "checks decoding of UTF-8 strings against Rust's String"
    (builtins.map matchesRust samples);
# Test group asserting the round trip utf8.encode (utf8.decode s) == s for
# every string from glassSentences and randomUnicode.
testDecodingEncoding =
  let
    # Both inputs are flat lists of strings, so plain concatenation gives
    # the same list as flattening the pair.
    samples = glassSentences ++ randomUnicode;

    roundTrips = s:
      assertEq "Decoding and then encoding ${s} yields itself"
        (utf8.encode (utf8.decode s))
        s;
  in
  it "checks that decoding and then encoding forms an identity"
    (builtins.map roundTrips samples);
in
# Result of the file: the four test groups defined above, bundled into a
# single suite named "nix.utf8".
runTestsuite "nix.utf8" [ testFailures testAscii testDecoding testDecodingEncoding ]