feat(users/edef/weave): ingest roots in Parquet format

Parsing of store-paths.xz is now handled by //users/edef/fetchroots.

Change-Id: I78be5aada0c0a321ed79d80c9b615e5f997ac3e0
Reviewed-on: https://cl.tvl.fyi/c/depot/+/12670
Tested-by: BuildkiteCI
Reviewed-by: flokli <flokli@flokli.de>
This commit is contained in:
edef 2024-10-17 13:26:01 +00:00
parent 313899c291
commit 06d2536eec

View file

@@ -1,4 +1,4 @@
//! Weave resolves a list of roots from `nixpkgs.roots` against `narinfo.parquet`,
//! Weave resolves a list of roots from `releases.parquet` against `narinfo.parquet`,
//! and then uses the reference graph from the accompanying `narinfo-references.parquet`
//! produced by `swizzle` to collect the closure of the roots.
//!
@@ -7,11 +7,10 @@
use anyhow::Result;
use hashbrown::{hash_table, HashTable};
use nix_compat::nixbase32;
use rayon::prelude::*;
use std::{
collections::{BTreeMap, HashSet},
fs::{self, File},
fs::File,
ops::Index,
sync::atomic::{AtomicU32, Ordering},
};
@@ -19,22 +18,24 @@ use std::{
use polars::{
datatypes::StaticArray,
export::arrow::{array::UInt32Array, offset::OffsetsBuffer},
lazy::dsl::col,
prelude::*,
};
use weave::{hash64, DONE, INDEX_NULL};
use weave::{as_fixed_binary, hash64, DONE, INDEX_NULL};
fn main() -> Result<()> {
eprint!("… parse roots\r");
let roots: PathSet32 = {
let mut roots = Vec::new();
fs::read("nixpkgs.roots")?
.par_chunks_exact(32 + 1)
.map(|e| nixbase32::decode_fixed::<20>(&e[0..32]).unwrap())
.collect_into_vec(&mut roots);
roots.iter().collect()
};
let roots: PathSet32 = as_fixed_binary::<20>(
LazyFrame::scan_parquet("releases.parquet", ScanArgsParquet::default())?
.explode([col("store_path_hash")])
.select([col("store_path_hash")])
.collect()?
.column("store_path_hash")?
.binary()?,
)
.flatten()
.collect();
eprintln!("{DONE}");
{
@@ -182,6 +183,7 @@ impl<'a> FromIterator<&'a [u8; 20]> for PathSet32 {
this.insert(item);
}
this.table.shrink_to_fit(|(x, _)| hash64(x));
this
}
}