feat(users/edef/refscan): high-performance Nix reference scanner

Research-grade code, treat with care.

Change-Id: I99804df93e64101ef24928238ef0a8a02b59c2aa
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7686
Reviewed-by: edef <edef@edef.eu>
Tested-by: BuildkiteCI
This commit is contained in:
edef 2023-01-09 20:12:26 +00:00
parent 681800b438
commit 0b3c0725a2
7 changed files with 154 additions and 0 deletions

2
users/edef/refscan/.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
/target
**/*.rs.bk

25
users/edef/refscan/Cargo.lock generated Normal file
View file

@ -0,0 +1,25 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
[[package]]
name = "cfg-if"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "packed_simd"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "refscan"
version = "0.1.0"
dependencies = [
"packed_simd 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[metadata]
"checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
"checksum packed_simd 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a85ea9fc0d4ac0deb6fe7911d38786b32fc11119afd9e9d38b84ff691ce64220"

View file

@ -0,0 +1,10 @@
[package]
name = "refscan"
version = "0.1.0"
authors = ["edef <edef@edef.eu>"]
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
packed_simd = "0.3.3"

View file

@ -0,0 +1,53 @@
use packed_simd::{m8x32, u8x32};
fn prefilter(haystack: u8x32) -> m8x32 {
let alp = haystack.gt(u8x32::splat(b'a' - 1)) & haystack.lt(u8x32::splat(b'z' + 1));
let num = haystack.gt(u8x32::splat(b'0' - 1)) & haystack.lt(u8x32::splat(b'9' + 1));
alp | num
}
/// scan_clean returns `Err(&buffer[..n])` of known pointer-free data,
/// or `Ok(buffer)` if the entire buffer is pointer-free.
pub fn scan_clean(buffer: &[u8]) -> Result<&[u8], &[u8]> {
let buffer = {
let n = buffer.len() & !31;
&buffer[..n]
};
let mut masks = buffer
.chunks_exact(32)
.map(|chunk| prefilter(u8x32::from_slice_unaligned(chunk)).bitmask())
.enumerate()
.map(|e| (e.0 * 32, e.1))
.peekable();
while let Some((offset, mask)) = masks.next() {
let peek = masks.peek().map(|x| x.1).unwrap_or(!0 >> 1);
let n = (!mask).leading_zeros() + (!peek).trailing_zeros();
if n >= 32 {
let offset = offset + mask.trailing_zeros() as usize;
return Err(&buffer[..offset]);
}
}
Ok(buffer)
}
#[cfg(test)]
mod test {
#[test]
fn scan_tail() {
let buffer = b"_xfbmj7sl2ikicym9x3yq7cms5qx1w39k";
assert_eq!(crate::scan_clean(buffer), Err(&buffer[..1]));
}
#[test]
fn scan_straddle() {
let buffer = b"________________xfbmj7sl2ikicym9x3yq7cms5qx1w39k________________";
assert_eq!(crate::scan_clean(buffer), Err(&buffer[..16]));
}
#[test]
fn scan_clean() {
let buffer = b"x_______________xfbmj7sl2ikicym9x3yq-cms5qx1w3-k________________";
assert_eq!(crate::scan_clean(buffer), Ok(&buffer[..]));
}
}

View file

@ -0,0 +1,55 @@
use std::{
collections::BTreeSet as Set,
convert::TryInto,
io::{self, Read},
str,
};
fn main() {
let max_refs: Set<[u8; 32]> = include_str!("../testdata/maxrefs")
.lines()
.map(|l| l.as_bytes().try_into().unwrap())
.collect();
let input = {
let stdin = io::stdin();
let mut buffer = Vec::new();
stdin.lock().read_to_end(&mut buffer).unwrap();
buffer
};
let base = input.as_ptr() as usize;
let mut input: &[u8] = &input;
while input.len() >= 32 {
match refscan::scan_clean(&input) {
Ok(buffer) | Err(buffer) => {
let n = buffer.len();
input = &input[n..];
}
}
let buffer = {
let idx = input.iter().position(|x| match x {
b'a'..=b'z' | b'0'..=b'9' => false,
_ => true,
});
idx.map(|idx| &input[..idx]).unwrap_or(input)
};
for chunk in buffer.windows(32) {
let offset = (chunk.as_ptr() as usize) - base;
let chunk = {
let mut fixed = [0u8; 32];
fixed.copy_from_slice(chunk);
fixed
};
if max_refs.contains(&chunk) {
let seen = unsafe { str::from_utf8_unchecked(&chunk) };
println!("{} {}", seen, offset);
}
}
let n = buffer.len();
input = &input[n..];
}
}

View file

@ -0,0 +1,3 @@
/maxrefs
/nar
/result

6
users/edef/refscan/testdata/generate.sh vendored Executable file
View file

@ -0,0 +1,6 @@
#! /usr/bin/env bash
set -euo pipefail
drv=$(nix-instantiate '<nixpkgs>' -A ghc)
nix --extra-experimental-features nix-command show-derivation -r "$drv" | jq -r '.[] | .outputs[].path, .inputSrcs[]' | sort -u | cut -d/ -f4 | cut -d- -f1 > maxrefs
nix-store --dump "$(nix-build "$drv")" > nar