feat(tazjin/rlox): Add a simple string interner
This is based on this matklad post: https://matklad.github.io/2020/03/22/fast-simple-rust-interner.html

It is modified slightly to provide a safer interface and a slightly more readable implementation:

* interned string IDs are wrapped in a newtype that is not publicly constructible
* the unsafe block is reduced to only the small scope in which it is needed
* the lookup lifetime is pinned explicitly to make the intent clearer when reading this code

Change-Id: Ia3dae988f33f8e5e7d8dc0c1a9216914a945b036
Reviewed-on: https://cl.tvl.fyi/c/depot/+/2578
Tested-by: BuildkiteCI
Reviewed-by: tazjin <mail@tazj.in>
This commit is contained in:
parent
6f600c8300
commit
ef7a0da8cb
3 changed files with 112 additions and 0 deletions
87
users/tazjin/rlox/src/bytecode/interner/mod.rs
Normal file
87
users/tazjin/rlox/src/bytecode/interner/mod.rs
Normal file
|
@ -0,0 +1,87 @@
|
||||||
|
//! String-interning implementation for values that are likely to
|
||||||
|
//! benefit from fast comparisons and deduplication (e.g. instances of
|
||||||
|
//! variable names).
|
||||||
|
//!
|
||||||
|
//! This uses a trick from the typed-arena crate for guaranteeing
|
||||||
|
//! stable addresses by never resizing the existing String buffer, and
|
||||||
|
//! collecting full buffers in a vector.
|
||||||
|
|
||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests;
|
||||||
|
|
||||||
|
/// Opaque handle to a string stored in an `Interner`.
///
/// The wrapped index is private, so a handle can only be obtained by
/// interning a string; it cannot be constructed from outside this module.
///
/// `Eq` is derived alongside `PartialEq` and `Hash` so that handles can be
/// used directly as keys in `HashMap`/`HashSet`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct InternedStr {
    // Index of the string in the owning interner's lookup table.
    id: usize,
}
|
||||||
|
|
||||||
|
#[derive(Default)]
|
||||||
|
pub struct Interner {
|
||||||
|
map: HashMap<&'static str, InternedStr>,
|
||||||
|
vec: Vec<&'static str>,
|
||||||
|
buf: String,
|
||||||
|
full: Vec<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Interner {
|
||||||
|
pub fn with_capacity(cap: usize) -> Self {
|
||||||
|
Interner {
|
||||||
|
buf: String::with_capacity(cap),
|
||||||
|
..Default::default()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn intern<S: AsRef<str>>(&mut self, name: S) -> InternedStr {
|
||||||
|
let name = name.as_ref();
|
||||||
|
if let Some(&id) = self.map.get(name) {
|
||||||
|
return id;
|
||||||
|
}
|
||||||
|
|
||||||
|
let name = self.alloc(name);
|
||||||
|
let id = InternedStr {
|
||||||
|
id: self.vec.len() as usize,
|
||||||
|
};
|
||||||
|
|
||||||
|
self.map.insert(name, id);
|
||||||
|
self.vec.push(name);
|
||||||
|
|
||||||
|
debug_assert!(self.lookup(id) == name);
|
||||||
|
debug_assert!(self.intern(name) == id);
|
||||||
|
|
||||||
|
id
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn lookup<'a>(&'a self, id: InternedStr) -> &'a str {
|
||||||
|
self.vec[id.id]
|
||||||
|
}
|
||||||
|
|
||||||
|
fn alloc<'a>(&'a mut self, name: &str) -> &'static str {
|
||||||
|
let cap = self.buf.capacity();
|
||||||
|
if cap < self.buf.len() + name.len() {
|
||||||
|
let new_cap = (cap.max(name.len()) + 1).next_power_of_two();
|
||||||
|
let new_buf = String::with_capacity(new_cap);
|
||||||
|
let old_buf = std::mem::replace(&mut self.buf, new_buf);
|
||||||
|
self.full.push(old_buf);
|
||||||
|
}
|
||||||
|
|
||||||
|
let interned: &'a str = {
|
||||||
|
let start = self.buf.len();
|
||||||
|
self.buf.push_str(name);
|
||||||
|
&self.buf[start..]
|
||||||
|
};
|
||||||
|
|
||||||
|
unsafe {
|
||||||
|
// This is sound for two reasons:
|
||||||
|
//
|
||||||
|
// 1. This function (Interner::alloc) is private, which
|
||||||
|
// prevents users from allocating a supposedly static
|
||||||
|
// reference.
|
||||||
|
//
|
||||||
|
// 2. Interner::lookup explicitly shortens the lifetime of
|
||||||
|
// references that are handed out to that of the
|
||||||
|
// reference to self.
|
||||||
|
return &*(interned as *const str);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
24
users/tazjin/rlox/src/bytecode/interner/tests.rs
Normal file
24
users/tazjin/rlox/src/bytecode/interner/tests.rs
Normal file
|
@ -0,0 +1,24 @@
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
/// A freshly interned string must resolve back to its original contents.
#[test]
fn interns_strings() {
    let mut pool = Interner::with_capacity(128);
    let handle = pool.intern("hello world");
    let resolved = pool.lookup(handle);
    assert_eq!("hello world", resolved);
}
|
||||||
|
|
||||||
|
/// Interning the same contents twice must yield the same handle.
#[test]
fn deduplicates_strings() {
    let mut pool = Interner::with_capacity(128);
    let first = pool.intern("hello world");
    let second = pool.intern("hello world");
    assert_eq!(first, second);
}
|
||||||
|
|
||||||
|
/// A handle obtained before the interner switches to a new buffer must
/// still resolve afterwards.
#[test]
fn ids_survive_growing() {
    // Start with a buffer that is too small for the second string.
    let mut pool = Interner::with_capacity(16);
    let early = pool.intern("hello");

    // Does not fit into the remaining capacity of the first buffer.
    pool.intern("excessively large string that will cause eallocation");

    let resolved = pool.lookup(early);
    assert_eq!("hello", resolved);
}
|
|
@ -5,6 +5,7 @@
|
||||||
mod chunk;
|
mod chunk;
|
||||||
mod compiler;
|
mod compiler;
|
||||||
mod errors;
|
mod errors;
|
||||||
|
mod interner;
|
||||||
mod opcode;
|
mod opcode;
|
||||||
mod value;
|
mod value;
|
||||||
mod vm;
|
mod vm;
|
||||||
|
|
Loading…
Reference in a new issue