feat(users/edef/weave): ingest roots in Parquet format

Parsing of store-paths.xz is now handled by //users/edef/fetchroots.

Change-Id: I78be5aada0c0a321ed79d80c9b615e5f997ac3e0
Reviewed-on: https://cl.tvl.fyi/c/depot/+/12670
Tested-by: BuildkiteCI
Reviewed-by: flokli <flokli@flokli.de>
This commit is contained in:
edef 2024-10-17 13:26:01 +00:00
parent 313899c291
commit 06d2536eec

View file

@ -1,4 +1,4 @@
//! Weave resolves a list of roots from `nixpkgs.roots` against `narinfo.parquet`, //! Weave resolves a list of roots from `releases.parquet` against `narinfo.parquet`,
//! and then uses the reference graph from the accompanying `narinfo-references.parquet` //! and then uses the reference graph from the accompanying `narinfo-references.parquet`
//! produced by `swizzle` to collect the closure of the roots. //! produced by `swizzle` to collect the closure of the roots.
//! //!
@ -7,11 +7,10 @@
use anyhow::Result; use anyhow::Result;
use hashbrown::{hash_table, HashTable}; use hashbrown::{hash_table, HashTable};
use nix_compat::nixbase32;
use rayon::prelude::*; use rayon::prelude::*;
use std::{ use std::{
collections::{BTreeMap, HashSet}, collections::{BTreeMap, HashSet},
fs::{self, File}, fs::File,
ops::Index, ops::Index,
sync::atomic::{AtomicU32, Ordering}, sync::atomic::{AtomicU32, Ordering},
}; };
@ -19,22 +18,24 @@ use std::{
use polars::{ use polars::{
datatypes::StaticArray, datatypes::StaticArray,
export::arrow::{array::UInt32Array, offset::OffsetsBuffer}, export::arrow::{array::UInt32Array, offset::OffsetsBuffer},
lazy::dsl::col,
prelude::*, prelude::*,
}; };
use weave::{hash64, DONE, INDEX_NULL}; use weave::{as_fixed_binary, hash64, DONE, INDEX_NULL};
fn main() -> Result<()> { fn main() -> Result<()> {
eprint!("… parse roots\r"); eprint!("… parse roots\r");
let roots: PathSet32 = { let roots: PathSet32 = as_fixed_binary::<20>(
let mut roots = Vec::new(); LazyFrame::scan_parquet("releases.parquet", ScanArgsParquet::default())?
fs::read("nixpkgs.roots")? .explode([col("store_path_hash")])
.par_chunks_exact(32 + 1) .select([col("store_path_hash")])
.map(|e| nixbase32::decode_fixed::<20>(&e[0..32]).unwrap()) .collect()?
.collect_into_vec(&mut roots); .column("store_path_hash")?
.binary()?,
roots.iter().collect() )
}; .flatten()
.collect();
eprintln!("{DONE}"); eprintln!("{DONE}");
{ {
@ -182,6 +183,7 @@ impl<'a> FromIterator<&'a [u8; 20]> for PathSet32 {
this.insert(item); this.insert(item);
} }
this.table.shrink_to_fit(|(x, _)| hash64(x));
this this
} }
} }