tvl-depot/tvix/eval/src/builtins/to_xml.rs

301 lines
10 KiB
Rust
Raw Normal View History

//! This module implements `builtins.toXML`, which is a serialisation
//! of value information as well as internal tvix state that several
//! things in nixpkgs rely on.
fix(tvix): Represent strings as byte arrays C++ nix uses C-style zero-terminated char pointers to represent strings internally - however, up to this point, tvix has used Rust `String` and `str` for string values. Since those are required to be valid utf-8, we haven't been able to properly represent all the string values that Nix supports. To fix that, this change converts the internal representation of the NixString struct from `Box<str>` to `BString`, from the `bstr` crate - this is a wrapper around a `Vec<u8>` with extra functions for treating that byte vector as a "morally string-like" value, which is basically exactly what we need. Since this changes a pretty fundamental assumption about a pretty core type, there are a *lot* of changes in a lot of places to make this work, but I've tried to keep the general philosophy and intent of most of the code in most places intact. Most notably, there's nothing that's been done to make the derivation stuff in //tvix/glue work with non-utf8 strings everywhere, instead opting to just convert to String/str when passing things into that - there *might* be something to be done there, but I don't know what the rules should be and I don't want to figure them out in this change. To deal with OS-native paths in a way that also works in WASM for tvixbolt, this also adds a dependency on the "os_str_bytes" crate. Fixes: b/189 Fixes: b/337 Change-Id: I5e6eb29c62f47dd91af954f5e12bfc3d186f5526 Reviewed-on: https://cl.tvl.fyi/c/depot/+/10200 Reviewed-by: tazjin <tazjin@tvl.su> Reviewed-by: flokli <flokli@flokli.de> Reviewed-by: sterni <sternenseemann@systemli.org> Autosubmit: aspen <root@gws.fyi> Tested-by: BuildkiteCI
2023-12-05 17:25:52 -05:00
use bstr::ByteSlice;
use std::borrow::Cow;
use std::{io::Write, rc::Rc};
use crate::{ErrorKind, Value};
/// Recursively serialise a value to XML. The value *must* have been
/// deep-forced before being passed to this function.
pub fn value_to_xml<W: Write>(mut writer: W, value: &Value) -> Result<(), ErrorKind> {
// Write a literal document declaration, using C++-Nix-style
// single quotes.
writeln!(writer, "<?xml version='1.0' encoding='utf-8'?>")?;
let mut emitter = XmlEmitter::new(writer);
emitter.write_open_tag("expr", &[])?;
value_variant_to_xml(&mut emitter, value)?;
emitter.write_closing_tag("expr")?;
Ok(())
}
fn write_typed_value<W: Write, V: ToString>(
w: &mut XmlEmitter<W>,
name_unescaped: &str,
value: V,
) -> Result<(), ErrorKind> {
w.write_self_closing_tag(name_unescaped, &[("value", &value.to_string())])?;
Ok(())
}
fn value_variant_to_xml<W: Write>(w: &mut XmlEmitter<W>, value: &Value) -> Result<(), ErrorKind> {
match value {
Value::Thunk(t) => return value_variant_to_xml(w, &t.value()),
Value::Null => {
w.write_open_tag("null", &[])?;
w.write_closing_tag("null")?;
}
Value::Bool(b) => return write_typed_value(w, "bool", b),
Value::Integer(i) => return write_typed_value(w, "int", i),
Value::Float(f) => return write_typed_value(w, "float", f),
fix(tvix): Represent strings as byte arrays C++ nix uses C-style zero-terminated char pointers to represent strings internally - however, up to this point, tvix has used Rust `String` and `str` for string values. Since those are required to be valid utf-8, we haven't been able to properly represent all the string values that Nix supports. To fix that, this change converts the internal representation of the NixString struct from `Box<str>` to `BString`, from the `bstr` crate - this is a wrapper around a `Vec<u8>` with extra functions for treating that byte vector as a "morally string-like" value, which is basically exactly what we need. Since this changes a pretty fundamental assumption about a pretty core type, there are a *lot* of changes in a lot of places to make this work, but I've tried to keep the general philosophy and intent of most of the code in most places intact. Most notably, there's nothing that's been done to make the derivation stuff in //tvix/glue work with non-utf8 strings everywhere, instead opting to just convert to String/str when passing things into that - there *might* be something to be done there, but I don't know what the rules should be and I don't want to figure them out in this change. To deal with OS-native paths in a way that also works in WASM for tvixbolt, this also adds a dependency on the "os_str_bytes" crate. Fixes: b/189 Fixes: b/337 Change-Id: I5e6eb29c62f47dd91af954f5e12bfc3d186f5526 Reviewed-on: https://cl.tvl.fyi/c/depot/+/10200 Reviewed-by: tazjin <tazjin@tvl.su> Reviewed-by: flokli <flokli@flokli.de> Reviewed-by: sterni <sternenseemann@systemli.org> Autosubmit: aspen <root@gws.fyi> Tested-by: BuildkiteCI
2023-12-05 17:25:52 -05:00
Value::String(s) => return write_typed_value(w, "string", s.to_str()?),
Value::Path(p) => return write_typed_value(w, "path", p.to_string_lossy()),
Value::List(list) => {
w.write_open_tag("list", &[])?;
for elem in list.into_iter() {
value_variant_to_xml(w, elem)?;
}
w.write_closing_tag("list")?;
}
Value::Attrs(attrs) => {
w.write_open_tag("attrs", &[])?;
for elem in attrs.iter() {
w.write_open_tag("attr", &[("name", &elem.0.to_str_lossy())])?;
value_variant_to_xml(w, elem.1)?;
w.write_closing_tag("attr")?;
}
w.write_closing_tag("attrs")?;
}
Value::Closure(c) => {
w.write_open_tag("function", &[])?;
match &c.lambda.formals {
Some(formals) => {
let mut attrs: Vec<(&str, &str)> = Vec::with_capacity(2);
if formals.ellipsis {
attrs.push(("ellipsis", "1"));
}
if let Some(ref name) = &formals.name {
attrs.push(("name", name.as_str()));
}
w.write_open_tag("attrspat", &attrs)?;
for arg in formals.arguments.iter() {
w.write_self_closing_tag("attr", &[("name", &arg.0.to_str_lossy())])?;
}
w.write_closing_tag("attrspat")?;
}
None => {
// TODO(tazjin): tvix does not currently persist function
// argument names anywhere (whereas we do for formals, as
// that is required for other runtime behaviour). Because of
// this the implementation here is fake, always returning
// the same argument name.
//
// If we don't want to persist the data, we can re-parse the
// AST from the spans of the lambda's bytecode and figure it
// out that way, but it needs some investigating.
w.write_self_closing_tag("varpat", &[("name", /* fake: */ "x")])?;
}
}
w.write_closing_tag("function")?;
}
Value::Builtin(_) => {
w.write_open_tag("unevaluated", &[])?;
w.write_closing_tag("unevaluated")?;
}
Value::AttrNotFound
| Value::Blueprint(_)
| Value::DeferredUpvalue(_)
| Value::UnresolvedPath(_)
| Value::Json(..)
fix(tvix/eval): only finalise formal arguments if defaulting When dealing with a formal argument in a function argument pattern that has a default expression, there are two different things that can happen at runtime: Either we select its value from the passed attribute successfully or we need to use the default expression. Both of these may be thunks and both of these may need finalisers. However, in the former case this is taken care of elsewhere, the value will always be finalised already if necessary. In the latter case we may need to finalise the thunk resulting from the default expression. However, the thunk corresponding to the expression may never end up in the local's stack slot. Since finalisation goes by stack slot (and not constants), we need to prevent a case where we don't fall back to the default expression, but finalise anyways. Previously, we worked around this by making `OpFinalise` ignore non-thunks. Since finalisation of already evaluated thunks still crashed, the faulty compilation of function pattern arguments could still cause a crash. As a new approach, we reinstate the old behavior of `OpFinalise` to crash whenever encountering something that is either not a thunk or doesn't need finalisation. This can also help catching (similar) miscompilations in the future. To then prevent the crash, we need to track whether we have fallen back or not at runtime. This is done using an additional phantom on the stack that holds a new `FinaliseRequest` value. When it comes to finalisation we check this value and conditionally execute `OpFinalise` based on its value. Resolves b/261 and b/265 (partially). Change-Id: Ic04fb80ec671a2ba11fa645090769c335fb7f58b Reviewed-on: https://cl.tvl.fyi/c/depot/+/8705 Reviewed-by: tazjin <tazjin@tvl.su> Tested-by: BuildkiteCI Autosubmit: sterni <sternenseemann@systemli.org>
2023-06-03 02:10:31 +02:00
| Value::FinaliseRequest(_) => {
return Err(ErrorKind::TvixBug {
msg: "internal value variant encountered in builtins.toXML",
metadata: Some(Rc::new(value.clone())),
})
}
Value::Catchable(_) => {
panic!("tvix bug: value_to_xml() called on a value which had not been deep-forced")
}
};
Ok(())
}
/// A simple-stupid XML emitter, which implements only the subset needed for byte-by-byte compat with C++ nix `builtins.toXML`.
struct XmlEmitter<W> {
/// The current indentation
cur_indent: usize,
writer: W,
}
impl<W: Write> XmlEmitter<W> {
pub fn new(writer: W) -> Self {
XmlEmitter {
cur_indent: 0,
writer,
}
}
/// Write an open tag with the given name (which is not escaped!)
/// and attributes (Keys are not escaped! Only attribute values are.)
pub fn write_open_tag(
&mut self,
name_unescaped: &str,
attrs: &[(&str, &str)],
) -> std::io::Result<()> {
self.add_indent()?;
self.writer.write_all(b"<")?;
self.writer.write_all(name_unescaped.as_bytes())?;
self.write_attrs_escape_vals(attrs)?;
self.writer.write_all(b">\n")?;
self.cur_indent += 2;
Ok(())
}
/// Write a self-closing open tag with the given name (which is not escaped!)
/// and attributes (Keys are not escaped! Only attribute values are.)
pub fn write_self_closing_tag(
&mut self,
name_unescaped: &str,
attrs: &[(&str, &str)],
) -> std::io::Result<()> {
self.add_indent()?;
self.writer.write_all(b"<")?;
self.writer.write_all(name_unescaped.as_bytes())?;
self.write_attrs_escape_vals(attrs)?;
self.writer.write_all(b" />\n")?;
Ok(())
}
/// Write a closing tag with the given name (which is not escaped!)
pub fn write_closing_tag(&mut self, name_unescaped: &str) -> std::io::Result<()> {
self.cur_indent -= 2;
self.add_indent()?;
self.writer.write_all(b"</")?;
self.writer.write_all(name_unescaped.as_bytes())?;
self.writer.write_all(b">\n")?;
Ok(())
}
#[inline]
fn add_indent(&mut self) -> std::io::Result<()> {
self.writer.write_all(&b" ".repeat(self.cur_indent))
}
/// Write an attribute list
fn write_attrs_escape_vals(&mut self, attrs: &[(&str, &str)]) -> std::io::Result<()> {
for (name, val) in attrs {
self.writer.write_all(b" ")?;
self.writer.write_all(name.as_bytes())?;
self.writer.write_all(br#"=""#)?;
self.writer
.write_all(Self::escape_attr_value(val).as_bytes())?;
self.writer.write_all(b"\"")?;
}
Ok(())
}
/// Escape the given attribute value, making sure we only actually clone the string if we needed to replace something.
fn escape_attr_value(s: &str) -> Cow<str> {
let mut last_escape: usize = 0;
let mut res: Cow<str> = Cow::Borrowed("");
// iterating via char_indices gives us the ability to index the original string slice at character boundaries
for (idx, c) in s.char_indices() {
match Self::should_escape_char(c) {
None => {}
Some(new) => {
// add characters since the last escape we did
res += &s[last_escape..idx];
// add the escaped value
res += new;
last_escape = idx + 1;
}
}
}
// we did not need to escape anything, so borrow original string
if last_escape == 0 {
Cow::Borrowed(s)
} else {
// add the remaining characters
res += &s[last_escape..];
res
}
}
fn should_escape_char(c: char) -> Option<&'static str> {
match c {
'<' => Some("&lt;"),
'>' => Some("&gt;"),
'"' => Some("&quot;"),
'\'' => Some("&apos;"),
'&' => Some("&amp;"),
'\n' => Some("&#xA;"),
'\r' => Some("&#xD;"),
_ => None,
}
}
}
#[cfg(test)]
mod tests {
use bytes::buf::Writer;
use pretty_assertions::assert_eq;
use crate::builtins::to_xml::XmlEmitter;
use std::borrow::Cow;
#[test]
fn xml_gen() {
let mut buf = Vec::new();
let mut x = XmlEmitter::new(&mut buf);
x.write_open_tag("hello", &[("hi", "its me"), ("no", "<escape>")])
.unwrap();
x.write_self_closing_tag("self-closing", &[("tag", "yay")])
.unwrap();
x.write_closing_tag("hello").unwrap();
assert_eq!(
std::str::from_utf8(&buf).unwrap(),
r##"<hello hi="its me" no="&lt;escape&gt;">
<self-closing tag="yay" />
</hello>
"##
);
}
#[test]
fn xml_escape() {
match XmlEmitter::<Writer<Vec<u8>>>::escape_attr_value("ab<>c&de") {
Cow::Owned(s) => assert_eq!(s, "ab&lt;&gt;c&amp;de".to_string(), "escape stuff"),
Cow::Borrowed(s) => panic!("s should be owned {}", s),
}
match XmlEmitter::<Writer<Vec<u8>>>::escape_attr_value("") {
Cow::Borrowed(s) => assert_eq!(s, "", "empty escape is borrowed"),
Cow::Owned(s) => panic!("s should be borrowed {}", s),
}
match XmlEmitter::<Writer<Vec<u8>>>::escape_attr_value("hi!ŷbla") {
Cow::Borrowed(s) => assert_eq!(s, "hi!ŷbla", "no escape is borrowed"),
Cow::Owned(s) => panic!("s should be borrowed {}", s),
}
match XmlEmitter::<Writer<Vec<u8>>>::escape_attr_value("hi!<ŷ>bla") {
Cow::Owned(s) => assert_eq!(
s,
"hi!&lt;ŷ&gt;bla".to_string(),
"multi-byte chars are correctly used"
),
Cow::Borrowed(s) => panic!("s should be owned {}", s),
}
}
}