feat(tvix/refscan): implement reference scanning over data streams
Using more of the machinery provided by the comprehensive aho_corasick crate, this makes it possible to pass anything implementing `io::Read` to the `ReferenceScanner` to accumulate matches.

Change-Id: I5b0e28eb44ea4df24010f40831e29f2cbb8c1f80
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7810
Autosubmit: tazjin <tazjin@tvl.su>
Reviewed-by: flokli <flokli@flokli.de>
Tested-by: BuildkiteCI
This commit is contained in:
parent
3045645df0
commit
e63bff5545
1 changed files with 40 additions and 0 deletions
|
@ -9,6 +9,7 @@
|
||||||
|
|
||||||
use aho_corasick::AhoCorasick;
|
use aho_corasick::AhoCorasick;
|
||||||
use std::collections::BTreeSet;
|
use std::collections::BTreeSet;
|
||||||
|
use std::io;
|
||||||
|
|
||||||
/// Represents a "primed" reference scanner with an automaton that knows the set
|
/// Represents a "primed" reference scanner with an automaton that knows the set
|
||||||
/// of store paths to scan for.
|
/// of store paths to scan for.
|
||||||
|
@ -40,6 +41,22 @@ impl<'c, 's> ReferenceScanner<'c, 's> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Scan the given reader for all non-overlapping matches, and collect them
|
||||||
|
/// in the scanner. On read failures, this method aborts and returns an
|
||||||
|
/// error to the caller.
|
||||||
|
///
|
||||||
|
/// Please note that the internal machinery has its own buffering mechanism,
|
||||||
|
/// and where possible the given reader should be unbuffered. See
|
||||||
|
/// [`AhoCorasick::stream_find_iter`] for details on this.
|
||||||
|
pub fn scan_stream<R: io::Read>(&mut self, stream: R) -> io::Result<()> {
|
||||||
|
for m in self.searcher.stream_find_iter(stream) {
|
||||||
|
let needle = self.candidates[m?.pattern()];
|
||||||
|
self.matches.insert(needle);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
/// Finalise the reference scanner and return the resulting matches.
|
/// Finalise the reference scanner and return the resulting matches.
|
||||||
pub fn finalise(self) -> BTreeSet<&'s str> {
|
pub fn finalise(self) -> BTreeSet<&'s str> {
|
||||||
self.matches
|
self.matches
|
||||||
|
@ -87,7 +104,30 @@ mod tests {
|
||||||
scanner.scan_str(HELLO_DRV);
|
scanner.scan_str(HELLO_DRV);
|
||||||
|
|
||||||
let result = scanner.finalise();
|
let result = scanner.finalise();
|
||||||
|
assert_eq!(result.len(), 3);
|
||||||
|
|
||||||
|
for c in candidates[..3].iter() {
|
||||||
|
assert!(result.contains(c));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_multiple_stream() {
|
||||||
|
let candidates = &[
|
||||||
|
// these exist in the drv:
|
||||||
|
"/nix/store/33l4p0pn0mybmqzaxfkpppyh7vx1c74p-hello-2.12.1",
|
||||||
|
"/nix/store/pf80kikyxr63wrw56k00i1kw6ba76qik-hello-2.12.1.tar.gz.drv",
|
||||||
|
"/nix/store/cp65c8nk29qq5cl1wyy5qyw103cwmax7-stdenv-linux",
|
||||||
|
// this doesn't:
|
||||||
|
"/nix/store/fn7zvafq26f0c8b17brs7s95s10ibfzs-emacs-28.2.drv",
|
||||||
|
];
|
||||||
|
|
||||||
|
let mut scanner = ReferenceScanner::new(candidates);
|
||||||
|
scanner
|
||||||
|
.scan_stream(HELLO_DRV.as_bytes())
|
||||||
|
.expect("scanning should succeed");
|
||||||
|
|
||||||
|
let result = scanner.finalise();
|
||||||
assert_eq!(result.len(), 3);
|
assert_eq!(result.len(), 3);
|
||||||
|
|
||||||
for c in candidates[..3].iter() {
|
for c in candidates[..3].iter() {
|
||||||
|
|
Loading…
Reference in a new issue