feat(users/edef/refscan): high-performance Nix reference scanner
Research-grade code, treat with care. Change-Id: I99804df93e64101ef24928238ef0a8a02b59c2aa Reviewed-on: https://cl.tvl.fyi/c/depot/+/7686 Reviewed-by: edef <edef@edef.eu> Tested-by: BuildkiteCI
This commit is contained in:
parent
681800b438
commit
0b3c0725a2
7 changed files with 154 additions and 0 deletions
2
users/edef/refscan/.gitignore
vendored
Normal file
2
users/edef/refscan/.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
|||
/target
|
||||
**/*.rs.bk
|
25
users/edef/refscan/Cargo.lock
generated
Normal file
25
users/edef/refscan/Cargo.lock
generated
Normal file
|
@ -0,0 +1,25 @@
|
|||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "0.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "packed_simd"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "refscan"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"packed_simd 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[metadata]
|
||||
"checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
|
||||
"checksum packed_simd 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a85ea9fc0d4ac0deb6fe7911d38786b32fc11119afd9e9d38b84ff691ce64220"
|
10
users/edef/refscan/Cargo.toml
Normal file
10
users/edef/refscan/Cargo.toml
Normal file
|
@ -0,0 +1,10 @@
|
|||
[package]
|
||||
name = "refscan"
|
||||
version = "0.1.0"
|
||||
authors = ["edef <edef@edef.eu>"]
|
||||
edition = "2018"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
packed_simd = "0.3.3"
|
53
users/edef/refscan/src/lib.rs
Normal file
53
users/edef/refscan/src/lib.rs
Normal file
|
@ -0,0 +1,53 @@
|
|||
use packed_simd::{m8x32, u8x32};
|
||||
|
||||
fn prefilter(haystack: u8x32) -> m8x32 {
|
||||
let alp = haystack.gt(u8x32::splat(b'a' - 1)) & haystack.lt(u8x32::splat(b'z' + 1));
|
||||
let num = haystack.gt(u8x32::splat(b'0' - 1)) & haystack.lt(u8x32::splat(b'9' + 1));
|
||||
alp | num
|
||||
}
|
||||
|
||||
/// scan_clean returns `Err(&buffer[..n])` of known pointer-free data,
|
||||
/// or `Ok(buffer)` if the entire buffer is pointer-free.
|
||||
pub fn scan_clean(buffer: &[u8]) -> Result<&[u8], &[u8]> {
|
||||
let buffer = {
|
||||
let n = buffer.len() & !31;
|
||||
&buffer[..n]
|
||||
};
|
||||
|
||||
let mut masks = buffer
|
||||
.chunks_exact(32)
|
||||
.map(|chunk| prefilter(u8x32::from_slice_unaligned(chunk)).bitmask())
|
||||
.enumerate()
|
||||
.map(|e| (e.0 * 32, e.1))
|
||||
.peekable();
|
||||
|
||||
while let Some((offset, mask)) = masks.next() {
|
||||
let peek = masks.peek().map(|x| x.1).unwrap_or(!0 >> 1);
|
||||
let n = (!mask).leading_zeros() + (!peek).trailing_zeros();
|
||||
if n >= 32 {
|
||||
let offset = offset + mask.trailing_zeros() as usize;
|
||||
return Err(&buffer[..offset]);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(buffer)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod test {
    // NOTE: `crate::scan_clean` is spelled out in full below because a test in
    // this module is itself named `scan_clean`.

    /// A candidate run that reaches the end of the scanned chunk is reported,
    /// with only the leading non-candidate byte considered clean.
    #[test]
    fn scan_tail() {
        let input = b"_xfbmj7sl2ikicym9x3yq7cms5qx1w39k";
        let clean_prefix = &input[..1];
        assert_eq!(crate::scan_clean(input), Err(clean_prefix));
    }

    /// A 32-byte candidate run split evenly across two chunks is detected.
    #[test]
    fn scan_straddle() {
        let input = b"________________xfbmj7sl2ikicym9x3yq7cms5qx1w39k________________";
        let clean_prefix = &input[..16];
        assert_eq!(crate::scan_clean(input), Err(clean_prefix));
    }

    /// Runs interrupted by non-candidate bytes never reach 32 bytes, so the
    /// whole buffer is reported clean.
    #[test]
    fn scan_clean() {
        let input = b"x_______________xfbmj7sl2ikicym9x3yq-cms5qx1w3-k________________";
        let everything = &input[..];
        assert_eq!(crate::scan_clean(input), Ok(everything));
    }
}
|
55
users/edef/refscan/src/main.rs
Normal file
55
users/edef/refscan/src/main.rs
Normal file
|
@ -0,0 +1,55 @@
|
|||
use std::{
|
||||
collections::BTreeSet as Set,
|
||||
convert::TryInto,
|
||||
io::{self, Read},
|
||||
str,
|
||||
};
|
||||
|
||||
fn main() {
|
||||
let max_refs: Set<[u8; 32]> = include_str!("../testdata/maxrefs")
|
||||
.lines()
|
||||
.map(|l| l.as_bytes().try_into().unwrap())
|
||||
.collect();
|
||||
|
||||
let input = {
|
||||
let stdin = io::stdin();
|
||||
let mut buffer = Vec::new();
|
||||
stdin.lock().read_to_end(&mut buffer).unwrap();
|
||||
buffer
|
||||
};
|
||||
|
||||
let base = input.as_ptr() as usize;
|
||||
let mut input: &[u8] = &input;
|
||||
while input.len() >= 32 {
|
||||
match refscan::scan_clean(&input) {
|
||||
Ok(buffer) | Err(buffer) => {
|
||||
let n = buffer.len();
|
||||
input = &input[n..];
|
||||
}
|
||||
}
|
||||
|
||||
let buffer = {
|
||||
let idx = input.iter().position(|x| match x {
|
||||
b'a'..=b'z' | b'0'..=b'9' => false,
|
||||
_ => true,
|
||||
});
|
||||
idx.map(|idx| &input[..idx]).unwrap_or(input)
|
||||
};
|
||||
|
||||
for chunk in buffer.windows(32) {
|
||||
let offset = (chunk.as_ptr() as usize) - base;
|
||||
let chunk = {
|
||||
let mut fixed = [0u8; 32];
|
||||
fixed.copy_from_slice(chunk);
|
||||
fixed
|
||||
};
|
||||
if max_refs.contains(&chunk) {
|
||||
let seen = unsafe { str::from_utf8_unchecked(&chunk) };
|
||||
println!("{} {}", seen, offset);
|
||||
}
|
||||
}
|
||||
|
||||
let n = buffer.len();
|
||||
input = &input[n..];
|
||||
}
|
||||
}
|
3
users/edef/refscan/testdata/.gitignore
vendored
Normal file
3
users/edef/refscan/testdata/.gitignore
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
/maxrefs
|
||||
/nar
|
||||
/result
|
6
users/edef/refscan/testdata/generate.sh
vendored
Executable file
6
users/edef/refscan/testdata/generate.sh
vendored
Executable file
|
@ -0,0 +1,6 @@
|
|||
#! /usr/bin/env bash
# Regenerates the refscan test fixtures in the current directory:
#   maxrefs - the hash part of every output path and input source in the
#             recursive derivation closure of nixpkgs' ghc, one per line
#   nar     - a NAR dump of the built ghc store path
set -euo pipefail

ghc_drv=$(nix-instantiate '<nixpkgs>' -A ghc)

# Store paths look like /nix/store/<hash>-<name>: field 4 when split on "/",
# then everything before the first "-".
nix --extra-experimental-features nix-command show-derivation -r "$ghc_drv" \
  | jq -r '.[] | .outputs[].path, .inputSrcs[]' \
  | sort -u \
  | cut -d/ -f4 \
  | cut -d- -f1 \
  > maxrefs

nix-store --dump "$(nix-build "$ghc_drv")" > nar
|
Loading…
Reference in a new issue