tvl-depot/users/sterni/nix/utf8/default.nix

{ depot, lib, ... }:

let

  # TODO(sterni): encode

  inherit (depot.users.sterni.nix)
    char
    flow
    fun
    int
    string
    util
    ;

  /* (Internal) function to determine the amount
     bytes left in a UTF-8 byte sequence from the
     first byte.

     This function will throw if the given first
     byte is ill-formed, but will not detect all
     cases of ill-formed-ness.

     Based on table 3-6. from The Unicode Standard,
     Version 13.0, section 3.9.

     Type: integer -> integer
  */
  byteCount = i: flow.cond [
    [ (int.bitAnd i 128 == 0)   1 ]
    [ (int.bitAnd i 224 == 192) 2 ]
    [ (int.bitAnd i 240 == 224) 3 ]
    [ (int.bitAnd i 248 == 240) 4 ]
    [ true (builtins.throw "Ill-formed first byte ${int.toHex i}") ]
  ];

  /* (Internal) function to check if a given byte in
     an UTF-8 byte sequence is well-formed.

     Based on table 3-7. from The Unicode Standard,
     Version 13.0, section 3.9.

     Throws if the first byte is invalid.

     Type: integer -> integer -> (integer -> bool)
  */
  wellFormedByte =
    # first byte's integer value
    first:
    # byte position as an index starting with 0
    pos:
      let
        defaultRange = int.inRange 128 191;
      in
        # The first byte is either ASCII which requires no checks
        # or we automatically check it when we check the subsequent
        # bytes. The downside is that this may generate bad error
        # messages in very rare cases.
        if pos == 0
        then lib.const true
        else if pos > 1 # 3rd and 4th byte have only one validity rule
        then defaultRange
        else assert pos == 1; flow.switch first [
          [ (int.inRange 194 223) defaultRange          ] # C2..DF
          [ 224                   (int.inRange 160 191) ] # E0
          [ (int.inRange 225 236) defaultRange          ] # E1..EC
          [ 237                   (int.inRange 128 159) ] # ED
          [ (int.inRange 238 239) defaultRange          ] # EE..EF
          [ 240                   (int.inRange 144 191) ] # F0
          [ (int.inRange 241 243) defaultRange          ] # F1..F3
          [ 244                   (int.inRange 128 143) ] # F4
          [
            (fun.const true)
            (builtins.throw "Invalid first byte ${int.toHex first}")
          ]
        ];

  /* Iteration step for decoding an UTF-8 byte sequence.
     It decodes incrementally, i. e. it has to be fed
     one byte at a time and then returns either a
     new state or a final result.

     If the resulting attribute set contains the attribute
     result, it is finished and the decoded codepoint is
     contained in that attribute. In all other cases,
     pass the returned set to step again along with
     a new byte. The initial state to pass is the empty
     set.

     Extra attributes are always passed through, so you
     can pass extra state. Be sure not to use result,
     pos, code, first or count.

     This function will throw with a fairly detailed
     message if it encounters ill-formed bytes.

     The implementation is based on The Unicode Standard,
     Version 13.0, section 3.9, especially table 3-6.

     Type: { ... } -> string -> ({ result :: integer, ... } | { ... })

     Example: utf8.step {} "f"
     => { result = 102; }
  */
  step = { pos ? 0, code ? 0, ... }@args: byte:
    let
      value = char.ord byte;
      # first byte is context for well-formed-ness
      first = args.first or value;
      count = args.count or (byteCount first);
      newCode =
        if count == 1
        then int.bitAnd 127 first # ascii character
        else # multi byte UTF-8 sequence
          let
            # Calculate the bitmask for extracting the
            # codepoint data in the current byte.
            # If the codepoint is not ASCII, the bits
            # used for codepoint data differ depending
            # on the byte position and overall byte
            # count. The first byte always ignores
            # the (count + 1) most significant bits.
            # For all subsequent bytes, the 2 most
            # significant bits need to be ignored.
            # See also table 3-6.
            mask =
              if pos == 0
              then int.exp 2 (8 - (count + 1)) - 1
              else 63;
            # UTF-8 uses the 6 least significant bits in all
            # subsequent bytes after the first one. Therefore
            # We can determine the amount we need to shift
            # the current value by the amount of bytes left.
            offset = (count - (pos + 1)) * 6;
          in
            code + (int.bitShiftL (int.bitAnd mask value) offset);
      illFormedMsg =
        "Ill-formed byte ${int.toHex value} at position ${toString pos} in ${toString count} byte UTF-8 sequence";
    in
      if !(wellFormedByte first pos value) then builtins.throw illFormedMsg
      else if pos + 1 == count
      then (builtins.removeAttrs args [ # allow extra state being passed through
        "count"
        "code"
        "pos"
        "first"
      ]) // { result = newCode; }
      else (builtins.removeAttrs args [ "result" ]) // {
        inherit count first;
        code = newCode;
        pos  = pos + 1;
      };

  /* Decode an UTF-8 string into a list of codepoints.

     Throws if the string is ill-formed UTF-8.

     Type: string -> [ integer ]
  */
  # TODO(sterni): option to fallback to replacement char instead of failure
  decode = s:
    let
      iter = { codes ? [], ... }@args: byte:
        let
          res = step args byte;
        in
          # foldl' forceValues the calculate value only at the end
          # this makes the thunk grow large enough to cause a stack
          # overflow with sufficiently large strings. To avoid this
          # we always deepSeq the result which also keeps memory
          # usage of decode reasonable.
          builtins.deepSeq res
            (if res ? result
            then res // {
              codes = codes ++ [ res.result ];
            }
            else res);
      iterResult =
        builtins.foldl' iter {} (string.toChars s);
      earlyEndMsg =
        if iterResult ? count && iterResult ? pos
        then "Missing ${toString (with iterResult; count - pos)} bytes at end of input"
        else "Unexpected end of input";
    in
      if iterResult ? result
      then iterResult.codes
      else builtins.throw earlyEndMsg;

  /* Decodes an UTF-8 string, but doesn't throw on error.
     Instead it returns null.

     Type: string -> ( [ integer ] | null)
  */
  decodeSafe = s:
    let
      res = builtins.tryEval (decode s);
    in
      if res.success
      then res.value
      else null;

in {
  inherit
    decode
    decodeSafe
    step
    ;
}
feat(users/sterni/nix/utf8): pure nix utf-8 decoder users.sterni.nix.utf8 implements UTF-8 decoding in pure nix. We implement the decoding as a simple state machine which is fed one byte at a time. Decoding whole strings is possible by subsequently calling step. This is done in decode which uses builtins.foldl' to get around recursion restrictions and a neat trick using builtins.deepSeq puck showed me limiting the size of the thunks in a foldl' (which can also cause a stack overflow). This makes decoding arbitrarily large UTF-8 files into codepoints using nix theoretically possible, but it is not really practical: Decoding a 36KB LaTeX file I had lying around takes ~160s on my laptop. Change-Id: Iab8c973dac89074ec280b4880a7408e0b3d19bc7 Reviewed-on: https://cl.tvl.fyi/c/depot/+/2590 Tested-by: BuildkiteCI Reviewed-by: sterni <sternenseemann@systemli.org> 2021-03-04 22:22:50 +01:00			`{ depot, lib, ... }:`

			`let`

			`# TODO(sterni): encode`

			`inherit (depot.users.sterni.nix)`
			`char`
			`flow`
			`fun`
			`int`
			`string`
			`util`
			`;`

			`/* (Internal) function to determine the amount`
			`bytes left in a UTF-8 byte sequence from the`
			`first byte.`

			`This function will throw if the given first`
			`byte is ill-formed, but will not detect all`
			`cases of ill-formed-ness.`

			`Based on table 3-6. from The Unicode Standard,`
			`Version 13.0, section 3.9.`

			`Type: integer -> integer`
			`*/`
			`byteCount = i: flow.cond [`
			`[ (int.bitAnd i 128 == 0) 1 ]`
			`[ (int.bitAnd i 224 == 192) 2 ]`
			`[ (int.bitAnd i 240 == 224) 3 ]`
			`[ (int.bitAnd i 248 == 240) 4 ]`
			`[ true (builtins.throw "Ill-formed first byte ${int.toHex i}") ]`
			`];`

			`/* (Internal) function to check if a given byte in`
			`an UTF-8 byte sequence is well-formed.`

			`Based on table 3-7. from The Unicode Standard,`
			`Version 13.0, section 3.9.`

			`Throws if the first byte is invalid.`

			`Type: integer -> integer -> (integer -> bool)`
			`*/`
			`wellFormedByte =`
			`# first byte's integer value`
			`first:`
			`# byte position as an index starting with 0`
			`pos:`
			`let`
			`defaultRange = int.inRange 128 191;`
			`in`
			`# The first byte is either ASCII which requires no checks`
			`# or we automatically check it when we check the subsequent`
			`# bytes. The downside is that this may generate bad error`
			`# messages in very rare cases.`
			`if pos == 0`
			`then lib.const true`
			`else if pos > 1 # 3rd and 4th byte have only one validity rule`
			`then defaultRange`
			`else assert pos == 1; flow.switch first [`
			`[ (int.inRange 194 223) defaultRange ] # C2..DF`
			`[ 224 (int.inRange 160 191) ] # E0`
			`[ (int.inRange 225 236) defaultRange ] # E1..EC`
			`[ 237 (int.inRange 128 159) ] # ED`
			`[ (int.inRange 238 239) defaultRange ] # EE..EF`
			`[ 240 (int.inRange 144 191) ] # F0`
			`[ (int.inRange 241 243) defaultRange ] # F1..F3`
			`[ 244 (int.inRange 128 143) ] # F4`
			`[`
			`(fun.const true)`
			`(builtins.throw "Invalid first byte ${int.toHex first}")`
			`]`
			`];`

			`/* Iteration step for decoding an UTF-8 byte sequence.`
			`It decodes incrementally, i. e. it has to be fed`
			`one byte at a time and then returns either a`
			`new state or a final result.`

			`If the resulting attribute set contains the attribute`
			`result, it is finished and the decoded codepoint is`
			`contained in that attribute. In all other cases,`
			`pass the returned set to step again along with`
			`a new byte. The initial state to pass is the empty`
			`set.`

			`Extra attributes are always passed through, so you`
			`can pass extra state. Be sure not to use result,`
			`pos, code, first or count.`

			`This function will throw with a fairly detailed`
			`message if it encounters ill-formed bytes.`

			`The implementation is based on The Unicode Standard,`
			`Version 13.0, section 3.9, especially table 3-6.`

			`Type: { ... } -> string -> ({ result :: integer, ... } \| { ... })`

			`Example: utf8.step {} "f"`
			`=> { result = 102; }`
			`*/`
			`step = { pos ? 0, code ? 0, ... }@args: byte:`
			`let`
			`value = char.ord byte;`
			`# first byte is context for well-formed-ness`
			`first = args.first or value;`
			`count = args.count or (byteCount first);`
			`newCode =`
			`if count == 1`
			`then int.bitAnd 127 first # ascii character`
			`else # multi byte UTF-8 sequence`
			`let`
			`# Calculate the bitmask for extracting the`
			`# codepoint data in the current byte.`
			`# If the codepoint is not ASCII, the bits`
			`# used for codepoint data differ depending`
			`# on the byte position and overall byte`
			`# count. The first byte always ignores`
			`# the (count + 1) most significant bits.`
			`# For all subsequent bytes, the 2 most`
			`# significant bits need to be ignored.`
			`# See also table 3-6.`
			`mask =`
			`if pos == 0`
			`then int.exp 2 (8 - (count + 1)) - 1`
			`else 63;`
			`# UTF-8 uses the 6 least significant bits in all`
			`# subsequent bytes after the first one. Therefore`
			`# We can determine the amount we need to shift`
			`# the current value by the amount of bytes left.`
			`offset = (count - (pos + 1)) * 6;`
			`in`
			`code + (int.bitShiftL (int.bitAnd mask value) offset);`
			`illFormedMsg =`
			`"Ill-formed byte ${int.toHex value} at position ${toString pos} in ${toString count} byte UTF-8 sequence";`
			`in`
			`if !(wellFormedByte first pos value) then builtins.throw illFormedMsg`
			`else if pos + 1 == count`
			`then (builtins.removeAttrs args [ # allow extra state being passed through`
			`"count"`
			`"code"`
			`"pos"`
			`"first"`
			`]) // { result = newCode; }`
			`else (builtins.removeAttrs args [ "result" ]) // {`
			`inherit count first;`
			`code = newCode;`
			`pos = pos + 1;`
			`};`

			`/* Decode an UTF-8 string into a list of codepoints.`

			`Throws if the string is ill-formed UTF-8.`

			`Type: string -> [ integer ]`
			`*/`
			`# TODO(sterni): option to fallback to replacement char instead of failure`
			`decode = s:`
			`let`
			`iter = { codes ? [], ... }@args: byte:`
			`let`
			`res = step args byte;`
			`in`
			`# foldl' forceValues the calculate value only at the end`
			`# this makes the thunk grow large enough to cause a stack`
			`# overflow with sufficiently large strings. To avoid this`
			`# we always deepSeq the result which also keeps memory`
			`# usage of decode reasonable.`
			`builtins.deepSeq res`
			`(if res ? result`
			`then res // {`
			`codes = codes ++ [ res.result ];`
			`}`
			`else res);`
			`iterResult =`
			`builtins.foldl' iter {} (string.toChars s);`
			`earlyEndMsg =`
			`if iterResult ? count && iterResult ? pos`
			`then "Missing ${toString (with iterResult; count - pos)} bytes at end of input"`
			`else "Unexpected end of input";`
			`in`
			`if iterResult ? result`
			`then iterResult.codes`
			`else builtins.throw earlyEndMsg;`

			`/* Decodes an UTF-8 string, but doesn't throw on error.`
			`Instead it returns null.`

			`Type: string -> ( [ integer ] \| null)`
			`*/`
			`decodeSafe = s:`
			`let`
			`res = builtins.tryEval (decode s);`
			`in`
			`if res.success`
			`then res.value`
			`else null;`

			`in {`
			`inherit`
			`decode`
			`decodeSafe`
			`step`
			`;`
			`}`