refactor(sterni/nix/utf8): use genericClosure for decoding iteration
builtins.genericClosure is a quite powerful (and undocumented) Nix primop: It repeatedly applies a function to the values it produces and collects them into a list. Additionally, individual results can be identified via a key attribute. Since genericClosure only ever creates a single list value internally, we can eliminate a huge performance bottleneck when building a list in a recursive algorithm: list concatenation. Because Nix needs to copy the entire chunk of memory used internally to represent the list, building big lists one element at a time grinds Nix to a halt. After rewriting decode using genericClosure, decoding the LaTeX source of my 20-page term paper now takes 2s instead of 14min. Change-Id: I33847e4e7dd95d7f4d78ac83eb0d74a9867bfe80
This commit is contained in:
parent
a2be05faa4
commit
8615322bc8
1 changed files with 47 additions and 24 deletions
|
@ -160,31 +160,54 @@ let
|
|||
# TODO(sterni): option to fallback to replacement char instead of failure
|
||||
decode = s:
|
||||
let
|
||||
iter = { codes ? [], ... }@args: byte:
|
||||
let
|
||||
res = step args byte;
|
||||
in
|
||||
# foldl' forces the calculated value only at the end;
|
||||
# this makes the thunk grow large enough to cause a stack
|
||||
# overflow with sufficiently large strings. To avoid this
|
||||
# we always deepSeq the result which also keeps memory
|
||||
# usage of decode reasonable.
|
||||
builtins.deepSeq res
|
||||
(if res ? result
|
||||
then res // {
|
||||
codes = codes ++ [ res.result ];
|
||||
stringLength = builtins.stringLength s;
|
||||
iterResult = builtins.genericClosure {
|
||||
startSet = [
|
||||
{
|
||||
key = "start";
|
||||
stringIndex = -1;
|
||||
state = {};
|
||||
codepoint = null;
|
||||
}
|
||||
else res);
|
||||
iterResult =
|
||||
builtins.foldl' iter {} (string.toChars s);
|
||||
];
|
||||
operator = { state, stringIndex, ... }:
|
||||
let
|
||||
# updated values for current iteration step
|
||||
newIndex = stringIndex + 1;
|
||||
newState = step state (builtins.substring newIndex 1 s);
|
||||
in lib.optional (newIndex < stringLength) {
|
||||
# unique keys to make genericClosure happy
|
||||
key = toString newIndex;
|
||||
# carryover state for the next step
|
||||
stringIndex = newIndex;
|
||||
state = newState;
|
||||
# actual payload for later, steps with value null are filtered out
|
||||
codepoint = newState.result or null;
|
||||
};
|
||||
};
|
||||
in
|
||||
# extract all steps that yield a code point into a list
|
||||
builtins.map (v: v.codepoint) (
|
||||
builtins.filter (
|
||||
{ codepoint, stringIndex, state, ... }:
|
||||
|
||||
let
|
||||
# error message in case we are missing bytes at the end of input
|
||||
earlyEndMsg =
|
||||
if iterResult ? count && iterResult ? pos
|
||||
then "Missing ${toString (with iterResult; count - pos)} bytes at end of input"
|
||||
if state ? count && state ? pos
|
||||
then "Missing ${toString (with state; count - pos)} bytes at end of input"
|
||||
else "Unexpected end of input";
|
||||
in
|
||||
if iterResult ? result
|
||||
then iterResult.codes
|
||||
else builtins.throw earlyEndMsg;
|
||||
|
||||
# filter out all iteration steps without a codepoint value
|
||||
codepoint != null
|
||||
# if we are at the last iteration step of the input string, throw
|
||||
# an error if no codepoint was returned, as it indicates an incomplete
|
||||
# UTF-8 sequence.
|
||||
|| (stringIndex == stringLength - 1 && throw earlyEndMsg)
|
||||
|
||||
) iterResult
|
||||
);
|
||||
|
||||
/* Decodes a UTF-8 string, but doesn't throw on error.
|
||||
Instead it returns null.
|
||||
|
|
Loading…
Reference in a new issue