From 8615322bc81dfe106aa158b95567300372ea469d Mon Sep 17 00:00:00 2001 From: sterni Date: Mon, 22 Nov 2021 22:06:33 +0100 Subject: [PATCH] refactor(sterni/nix/utf8): use genericClosure for decoding iteration builtins.genericClosure is a quite powerful (and undocumented) Nix primop: It repeatedly applies a function to values it produces and collects them into a list. Additionally individual results can be identified via a key attribute. Since genericClosure only ever creates a single list value internally, we can eliminate a huge performance bottleneck when building a list in a recursive algorithm: list concatenation. Because Nix needs to copy the entire chunk of memory used internally to represent the list, building big lists one element at a time grinds Nix to a halt. After rewriting decode using genericClosure decoding the LaTeX source of my 20 page term paper now takes 2s instead of 14min. Change-Id: I33847e4e7dd95d7f4d78ac83eb0d74a9867bfe80 --- users/sterni/nix/utf8/default.nix | 71 ++++++++++++++++++++----------- 1 file changed, 47 insertions(+), 24 deletions(-) diff --git a/users/sterni/nix/utf8/default.nix b/users/sterni/nix/utf8/default.nix index 713f1f57c..c89263cd8 100644 --- a/users/sterni/nix/utf8/default.nix +++ b/users/sterni/nix/utf8/default.nix @@ -160,31 +160,54 @@ let # TODO(sterni): option to fallback to replacement char instead of failure decode = s: let - iter = { codes ? [], ... }@args: byte: - let - res = step args byte; - in - # foldl' forceValues the calculate value only at the end - # this makes the thunk grow large enough to cause a stack - # overflow with sufficiently large strings. To avoid this - # we always deepSeq the result which also keeps memory - # usage of decode reasonable. - builtins.deepSeq res - (if res ? result - then res // { - codes = codes ++ [ res.result ]; - } - else res); - iterResult = - builtins.foldl' iter {} (string.toChars s); - earlyEndMsg = - if iterResult ? count && iterResult ? pos - then "Missing ${toString (with iterResult; count - pos)} bytes at end of input" - else "Unexpected end of input"; + stringLength = builtins.stringLength s; + iterResult = builtins.genericClosure { + startSet = [ + { + key = "start"; + stringIndex = -1; + state = {}; + codepoint = null; + } + ]; + operator = { state, stringIndex, ... }: + let + # updated values for current iteration step + newIndex = stringIndex + 1; + newState = step state (builtins.substring newIndex 1 s); + in lib.optional (newIndex < stringLength) { + # unique keys to make genericClosure happy + key = toString newIndex; + # carryover state for the next step + stringIndex = newIndex; + state = newState; + # actual payload for later, steps with value null are filtered out + codepoint = newState.result or null; + }; + }; in - if iterResult ? result - then iterResult.codes - else builtins.throw earlyEndMsg; + # extract all steps that yield a code point into a list + builtins.map (v: v.codepoint) ( + builtins.filter ( + { codepoint, stringIndex, state, ... }: + + let + # error message in case we are missing bytes at the end of input + earlyEndMsg = + if state ? count && state ? pos + then "Missing ${toString (with state; count - pos)} bytes at end of input" + else "Unexpected end of input"; + in + + # filter out all iteration steps without a codepoint value + codepoint != null + # if we are at the iteration step of the input string, throw + # an error if no codepoint was returned, as it indicates an incomplete + # UTF-8 sequence. + || (stringIndex == stringLength - 1 && throw earlyEndMsg) + + ) iterResult + ); /* Decodes an UTF-8 string, but doesn't throw on error. Instead it returns null.