refactor(sterni/nix/utf8): use genericClosure for decoding iteration

builtins.genericClosure is a quite powerful (and undocumented) Nix
primop: It repeatedly applies a function to values it produces and
collects them into a list. Additionally individual results can be
identified via a key attribute.

Since genericClosure only ever creates a single list value internally,
we can eliminate a huge performance bottleneck when building a list in a
recursive algorithm: list concatenation. Because Nix needs to copy the
entire chunk of memory used internally to represent the list, building
big lists one element at a time grinds Nix to a halt.

After rewriting decode using genericClosure decoding the LaTeX source
of my 20 page term paper now takes 2s instead of 14min.

Change-Id: I33847e4e7dd95d7f4d78ac83eb0d74a9867bfe80
This commit is contained in:
sterni 2021-11-22 22:06:33 +01:00
parent a2be05faa4
commit 8615322bc8

View file

@ -160,31 +160,54 @@ let
# TODO(sterni): option to fallback to replacement char instead of failure
decode = s:
let
iter = { codes ? [], ... }@args: byte:
let
res = step args byte;
in
# foldl' forceValues the calculate value only at the end
# this makes the thunk grow large enough to cause a stack
# overflow with sufficiently large strings. To avoid this
# we always deepSeq the result which also keeps memory
# usage of decode reasonable.
builtins.deepSeq res
(if res ? result
then res // {
codes = codes ++ [ res.result ];
stringLength = builtins.stringLength s;
iterResult = builtins.genericClosure {
startSet = [
{
key = "start";
stringIndex = -1;
state = {};
codepoint = null;
}
else res);
iterResult =
builtins.foldl' iter {} (string.toChars s);
];
operator = { state, stringIndex, ... }:
let
# updated values for current iteration step
newIndex = stringIndex + 1;
newState = step state (builtins.substring newIndex 1 s);
in lib.optional (newIndex < stringLength) {
# unique keys to make genericClosure happy
key = toString newIndex;
# carryover state for the next step
stringIndex = newIndex;
state = newState;
# actual payload for later, steps with value null are filtered out
codepoint = newState.result or null;
};
};
in
# extract all steps that yield a code point into a list
builtins.map (v: v.codepoint) (
builtins.filter (
{ codepoint, stringIndex, state, ... }:
let
# error message in case we are missing bytes at the end of input
earlyEndMsg =
if iterResult ? count && iterResult ? pos
then "Missing ${toString (with iterResult; count - pos)} bytes at end of input"
if state ? count && state ? pos
then "Missing ${toString (with state; count - pos)} bytes at end of input"
else "Unexpected end of input";
in
if iterResult ? result
then iterResult.codes
else builtins.throw earlyEndMsg;
# filter out all iteration steps without a codepoint value
codepoint != null
# if we are at the iteration step of the input string, throw
# an error if no codepoint was returned, as it indicates an incomplete
# UTF-8 sequence.
|| (stringIndex == stringLength - 1 && throw earlyEndMsg)
) iterResult
);
/* Decodes an UTF-8 string, but doesn't throw on error.
Instead it returns null.