refactor(tvix/eval): rewrite xml emitter to be simple-stupid

In order to be compatible with the nix XML generator, it’s easier to
generate the XML directly, instead of going through a library which we
have to bend to do what we need.

Removes dependency on `xml-rs`, which came with a full XML parser that
we didn’t use. Only takes a tiny bit of code for the XML escaping,
somewhat simplified.

I add a small escaping test case, to make sure we have the same
behaviour as nix proper.

Interestingly enough, we never need to escape XML attribute names,
because the `builtins.toXML` format only ever puts user-defined data
into attribute values; the attribute names themselves are fixed. So we
only escape attribute values.

Fixes: https://b.tvl.fyi/issues/399
Change-Id: If4d407d324864b3bb9aa3160e2ec6889f7727127
Reviewed-on: https://cl.tvl.fyi/c/depot/+/11697
Tested-by: BuildkiteCI
Reviewed-by: flokli <flokli@flokli.de>
Autosubmit: Profpatsch <mail@profpatsch.de>
This commit is contained in:
Profpatsch 2024-05-20 15:50:33 +02:00 committed by clbot
parent e7be342256
commit 5b2ba0efa1
9 changed files with 205 additions and 89 deletions

7
tvix/Cargo.lock generated
View file

@ -4167,7 +4167,6 @@ dependencies = [
"test-strategy",
"toml",
"tvix-eval-builtin-macros",
"xml-rs",
]
[[package]]
@ -4844,12 +4843,6 @@ dependencies = [
"rustix",
]
[[package]]
name = "xml-rs"
version = "0.8.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fcb9cbac069e033553e8bb871be2fbdffcab578eb25bd0f7c508cedc6dcd75a"
[[package]]
name = "xz2"
version = "0.1.7"

View file

@ -13222,10 +13222,6 @@ rec {
packageId = "tvix-eval-builtin-macros";
rename = "builtin-macros";
}
{
name = "xml-rs";
packageId = "xml-rs";
}
];
devDependencies = [
{
@ -16041,18 +16037,6 @@ rec {
};
resolvedDefaultFeatures = [ "default" "unsupported" ];
};
"xml-rs" = rec {
crateName = "xml-rs";
version = "0.8.19";
edition = "2021";
crateBin = [ ];
sha256 = "0nnpvk3fv32hgh7vs9gbg2swmzxx5yz73f4b7rak7q39q2x9rjqg";
libName = "xml";
authors = [
"Vladimir Matveev <vmatveev@citrine.cc>"
];
};
"xz2" = rec {
crateName = "xz2";
version = "0.1.7";

View file

@ -30,7 +30,6 @@ smol_str = "0.2.0"
tabwriter = "1.2"
test-strategy = { version = "0.2.1", optional = true }
toml = "0.6.0"
xml-rs = "0.8.4"
sha2 = "0.10.8"
sha1 = "0.10.6"
md-5 = "0.10.6"

View file

@ -3,59 +3,43 @@
//! things in nixpkgs rely on.
use bstr::ByteSlice;
use std::borrow::Cow;
use std::{io::Write, rc::Rc};
use xml::writer::events::XmlEvent;
use xml::writer::EmitterConfig;
use xml::writer::EventWriter;
use crate::{ErrorKind, Value};
/// Recursively serialise a value to XML. The value *must* have been
/// deep-forced before being passed to this function.
pub fn value_to_xml<W: Write>(mut writer: W, value: &Value) -> Result<(), ErrorKind> {
let config = EmitterConfig {
perform_indent: true,
pad_self_closing: true,
// Nix uses single-quotes *only* in the document declaration,
// so we need to write it manually.
write_document_declaration: false,
..Default::default()
};
// Write a literal document declaration, using C++-Nix-style
// single quotes.
writeln!(writer, "<?xml version='1.0' encoding='utf-8'?>")?;
let mut writer = EventWriter::new_with_config(writer, config);
let mut emitter = XmlEmitter::new(writer);
writer.write(XmlEvent::start_element("expr"))?;
value_variant_to_xml(&mut writer, value)?;
writer.write(XmlEvent::end_element())?;
// Unwrap the writer to add the final newline that C++ Nix adds.
writeln!(writer.into_inner())?;
emitter.write_open_tag("expr", &[])?;
value_variant_to_xml(&mut emitter, value)?;
emitter.write_closing_tag("expr")?;
Ok(())
}
fn write_typed_value<W: Write, V: ToString>(
w: &mut EventWriter<W>,
name: &str,
w: &mut XmlEmitter<W>,
name_unescaped: &str,
value: V,
) -> Result<(), ErrorKind> {
w.write(XmlEvent::start_element(name).attr("value", &value.to_string()))?;
w.write(XmlEvent::end_element())?;
w.write_self_closing_tag(name_unescaped, &[("value", &value.to_string())])?;
Ok(())
}
fn value_variant_to_xml<W: Write>(w: &mut EventWriter<W>, value: &Value) -> Result<(), ErrorKind> {
fn value_variant_to_xml<W: Write>(w: &mut XmlEmitter<W>, value: &Value) -> Result<(), ErrorKind> {
match value {
Value::Thunk(t) => return value_variant_to_xml(w, &t.value()),
Value::Null => {
w.write(XmlEvent::start_element("null"))?;
w.write(XmlEvent::end_element())
w.write_open_tag("null", &[])?;
w.write_closing_tag("null")?;
}
Value::Bool(b) => return write_typed_value(w, "bool", b),
@ -65,50 +49,46 @@ fn value_variant_to_xml<W: Write>(w: &mut EventWriter<W>, value: &Value) -> Resu
Value::Path(p) => return write_typed_value(w, "path", p.to_string_lossy()),
Value::List(list) => {
w.write(XmlEvent::start_element("list"))?;
w.write_open_tag("list", &[])?;
for elem in list.into_iter() {
value_variant_to_xml(w, elem)?;
}
w.write(XmlEvent::end_element())
w.write_closing_tag("list")?;
}
Value::Attrs(attrs) => {
w.write(XmlEvent::start_element("attrs"))?;
w.write_open_tag("attrs", &[])?;
for elem in attrs.iter() {
w.write(XmlEvent::start_element("attr").attr("name", &elem.0.to_str_lossy()))?;
w.write_open_tag("attr", &[("name", &elem.0.to_str_lossy())])?;
value_variant_to_xml(w, elem.1)?;
w.write(XmlEvent::end_element())?;
w.write_closing_tag("attr")?;
}
w.write(XmlEvent::end_element())
w.write_closing_tag("attrs")?;
}
Value::Closure(c) => {
w.write(XmlEvent::start_element("function"))?;
w.write_open_tag("function", &[])?;
match &c.lambda.formals {
Some(formals) => {
let mut attrspat = XmlEvent::start_element("attrspat");
let mut attrs: Vec<(&str, &str)> = Vec::with_capacity(2);
if formals.ellipsis {
attrspat = attrspat.attr("ellipsis", "1");
attrs.push(("ellipsis", "1"));
}
if let Some(ref name) = &formals.name {
attrspat = attrspat.attr("name", name.as_str());
attrs.push(("name", name.as_str()));
}
w.write(attrspat)?;
w.write_open_tag("attrspat", &attrs)?;
for arg in formals.arguments.iter() {
w.write(
XmlEvent::start_element("attr").attr("name", &arg.0.to_str_lossy()),
)?;
w.write(XmlEvent::end_element())?;
w.write_self_closing_tag("attr", &[("name", &arg.0.to_str_lossy())])?;
}
w.write(XmlEvent::end_element())?;
w.write_closing_tag("attrspat")?;
}
None => {
// TODO(tazjin): tvix does not currently persist function
@ -120,17 +100,16 @@ fn value_variant_to_xml<W: Write>(w: &mut EventWriter<W>, value: &Value) -> Resu
// If we don't want to persist the data, we can re-parse the
// AST from the spans of the lambda's bytecode and figure it
// out that way, but it needs some investigating.
w.write(XmlEvent::start_element("varpat").attr("name", /* fake: */ "x"))?;
w.write(XmlEvent::end_element())?;
w.write_self_closing_tag("varpat", &[("name", /* fake: */ "x")])?;
}
}
w.write(XmlEvent::end_element())
w.write_closing_tag("function")?;
}
Value::Builtin(_) => {
w.write(XmlEvent::start_element("unevaluated"))?;
w.write(XmlEvent::end_element())
w.write_open_tag("unevaluated", &[])?;
w.write_closing_tag("unevaluated")?;
}
Value::AttrNotFound
@ -148,7 +127,174 @@ fn value_variant_to_xml<W: Write>(w: &mut EventWriter<W>, value: &Value) -> Resu
Value::Catchable(_) => {
panic!("tvix bug: value_to_xml() called on a value which had not been deep-forced")
}
}?;
};
Ok(())
}
/// A simple-stupid XML emitter, which implements only the subset needed for
/// byte-by-byte compat with C++ nix `builtins.toXML`.
///
/// Every tag is written on its own line, indented by two spaces per nesting
/// level. Tag names and attribute names are written verbatim; only attribute
/// values are escaped.
struct XmlEmitter<W> {
    /// The current indentation, in spaces.
    cur_indent: usize,
    /// Sink receiving the generated XML bytes.
    writer: W,
}

impl<W: Write> XmlEmitter<W> {
    /// Number of spaces added per nesting level.
    const INDENT_STEP: usize = 2;

    /// Create an emitter that starts at indentation zero and writes to `writer`.
    pub fn new(writer: W) -> Self {
        XmlEmitter {
            cur_indent: 0,
            writer,
        }
    }

    /// Write an open tag with the given name (which is not escaped!)
    /// and attributes (Keys are not escaped! Only attribute values are.)
    ///
    /// Everything written afterwards is indented one level deeper until the
    /// matching [`Self::write_closing_tag`] call.
    pub fn write_open_tag(
        &mut self,
        name_unescaped: &str,
        attrs: &[(&str, &str)],
    ) -> std::io::Result<()> {
        self.write_tag_start(name_unescaped, attrs)?;
        self.writer.write_all(b">\n")?;
        // Only deepen the indentation once the tag was written successfully.
        self.cur_indent += Self::INDENT_STEP;
        Ok(())
    }

    /// Write a self-closing open tag with the given name (which is not escaped!)
    /// and attributes (Keys are not escaped! Only attribute values are.)
    pub fn write_self_closing_tag(
        &mut self,
        name_unescaped: &str,
        attrs: &[(&str, &str)],
    ) -> std::io::Result<()> {
        self.write_tag_start(name_unescaped, attrs)?;
        self.writer.write_all(b" />\n")
    }

    /// Write a closing tag with the given name (which is not escaped!)
    ///
    /// # Panics
    ///
    /// Panics if there is no open tag left to close. That is a bug in the
    /// caller; failing here with a clear message beats letting the indentation
    /// counter wrap around and blowing up later while writing the indentation.
    pub fn write_closing_tag(&mut self, name_unescaped: &str) -> std::io::Result<()> {
        self.cur_indent = self
            .cur_indent
            .checked_sub(Self::INDENT_STEP)
            .expect("tvix bug: XML closing tag written without a matching open tag");
        self.add_indent()?;
        self.writer.write_all(b"</")?;
        self.writer.write_all(name_unescaped.as_bytes())?;
        self.writer.write_all(b">\n")
    }

    /// Write the part shared by open and self-closing tags: the indentation,
    /// `<name`, and the attribute list. The caller terminates the tag.
    fn write_tag_start(
        &mut self,
        name_unescaped: &str,
        attrs: &[(&str, &str)],
    ) -> std::io::Result<()> {
        self.add_indent()?;
        self.writer.write_all(b"<")?;
        self.writer.write_all(name_unescaped.as_bytes())?;
        self.write_attrs_escape_vals(attrs)
    }

    /// Write `cur_indent` spaces without allocating a temporary buffer.
    fn add_indent(&mut self) -> std::io::Result<()> {
        const SPACES: [u8; 32] = [b' '; 32];

        let mut remaining = self.cur_indent;
        while remaining > 0 {
            let chunk = remaining.min(SPACES.len());
            self.writer.write_all(&SPACES[..chunk])?;
            remaining -= chunk;
        }
        Ok(())
    }

    /// Write an attribute list as ` key="value"` pairs, escaping only the values.
    fn write_attrs_escape_vals(&mut self, attrs: &[(&str, &str)]) -> std::io::Result<()> {
        for (name, val) in attrs {
            self.writer.write_all(b" ")?;
            self.writer.write_all(name.as_bytes())?;
            self.writer.write_all(b"=\"")?;
            self.writer
                .write_all(Self::escape_attr_value(val).as_bytes())?;
            self.writer.write_all(b"\"")?;
        }
        Ok(())
    }

    /// Escape the given attribute value.
    ///
    /// Returns the input borrowed when it contains nothing to escape, so a
    /// new string is only allocated if something actually had to be replaced.
    fn escape_attr_value(s: &str) -> Cow<'_, str> {
        // Lazily created on the first character that needs escaping.
        let mut escaped: Option<String> = None;
        // Byte offset up to which `s` has already been copied into `escaped`.
        let mut copied_up_to: usize = 0;

        // char_indices yields byte offsets that lie on character boundaries,
        // so slicing `s` with them is always valid.
        for (idx, c) in s.char_indices() {
            if let Some(replacement) = Self::should_escape_char(c) {
                let out = escaped.get_or_insert_with(|| String::with_capacity(s.len() + 8));
                // the untouched run since the previous escape ...
                out.push_str(&s[copied_up_to..idx]);
                // ... followed by the replacement for this character
                out.push_str(replacement);
                copied_up_to = idx + c.len_utf8();
            }
        }

        match escaped {
            // nothing needed escaping, hand back the original slice
            None => Cow::Borrowed(s),
            Some(mut out) => {
                // append whatever follows the last escaped character
                out.push_str(&s[copied_up_to..]);
                Cow::Owned(out)
            }
        }
    }

    /// Return the replacement for `c` if it must be escaped inside a
    /// double-quoted attribute value, or `None` if it can be written as-is.
    ///
    /// NOTE(review): the set below also escapes `'` and `\r`; confirm against
    /// the attribute writer of C++ Nix that it escapes exactly the same set,
    /// since the output is meant to match it byte for byte.
    fn should_escape_char(c: char) -> Option<&'static str> {
        match c {
            '<' => Some("&lt;"),
            '>' => Some("&gt;"),
            '"' => Some("&quot;"),
            '\'' => Some("&apos;"),
            '&' => Some("&amp;"),
            '\n' => Some("&#xA;"),
            '\r' => Some("&#xD;"),
            _ => None,
        }
    }
}
#[cfg(test)]
mod tests {
    use pretty_assertions::assert_eq;

    use crate::builtins::to_xml::XmlEmitter;
    use std::borrow::Cow;

    /// `escape_attr_value` never touches the writer, so any `Write`
    /// implementation will do to name the impl; a plain byte vector is enough.
    type Emitter = XmlEmitter<Vec<u8>>;

    /// Tags nest with two spaces of indentation and attribute values are escaped.
    #[test]
    fn xml_gen() {
        let mut buf = Vec::new();
        let mut x = XmlEmitter::new(&mut buf);
        x.write_open_tag("hello", &[("hi", "its me"), ("no", "<escape>")])
            .unwrap();
        x.write_self_closing_tag("self-closing", &[("tag", "yay")])
            .unwrap();
        x.write_closing_tag("hello").unwrap();

        assert_eq!(
            std::str::from_utf8(&buf).unwrap(),
            r##"<hello hi="its me" no="&lt;escape&gt;">
  <self-closing tag="yay" />
</hello>
"##
        );
    }

    /// Escaping replaces the special characters and only allocates when it has to.
    #[test]
    fn xml_escape() {
        match Emitter::escape_attr_value("ab<>c&de") {
            Cow::Owned(s) => assert_eq!(s, "ab&lt;&gt;c&amp;de".to_string(), "escape stuff"),
            Cow::Borrowed(s) => panic!("s should be owned {}", s),
        }
        match Emitter::escape_attr_value("") {
            Cow::Borrowed(s) => assert_eq!(s, "", "empty escape is borrowed"),
            Cow::Owned(s) => panic!("s should be borrowed {}", s),
        }
        match Emitter::escape_attr_value("hi!ŷbla") {
            Cow::Borrowed(s) => assert_eq!(s, "hi!ŷbla", "no escape is borrowed"),
            Cow::Owned(s) => panic!("s should be borrowed {}", s),
        }
        match Emitter::escape_attr_value("hi!<ŷ>bla") {
            Cow::Owned(s) => assert_eq!(
                s,
                "hi!&lt;ŷ&gt;bla".to_string(),
                "multi-byte chars are correctly used"
            ),
            Cow::Borrowed(s) => panic!("s should be owned {}", s),
        }
    }

    /// Double quotes and newlines must not survive verbatim inside a
    /// double-quoted attribute value.
    #[test]
    fn xml_escape_quote_and_newline() {
        assert_eq!(
            Emitter::escape_attr_value("a\"b\nc").as_ref(),
            "a&quot;b&#xA;c"
        );
    }
}

View file

@ -10,7 +10,6 @@ use std::{fmt::Debug, fmt::Display, num::ParseIntError};
use codemap::{File, Span};
use codemap_diagnostic::{ColorConfig, Diagnostic, Emitter, Level, SpanLabel, SpanStyle};
use smol_str::SmolStr;
use xml::writer::Error as XmlError;
use crate::spans::ToSpan;
use crate::value::{CoercionKind, NixString};
@ -194,9 +193,6 @@ pub enum ErrorKind {
/// Invalid UTF-8 was encountered somewhere
Utf8,
/// Errors while serialising to XML.
Xml(Rc<XmlError>),
/// Variant for errors that bubble up to eval from other Tvix
/// components.
TvixError(Rc<dyn error::Error>),
@ -248,7 +244,6 @@ impl error::Error for Error {
errors.first().map(|e| e as &dyn error::Error)
}
ErrorKind::IO { error, .. } => Some(error.as_ref()),
ErrorKind::Xml(error) => Some(error.as_ref()),
ErrorKind::TvixError(error) => Some(error.as_ref()),
_ => None,
}
@ -285,12 +280,6 @@ impl From<bstr::FromUtf8Error> for ErrorKind {
}
}
impl From<XmlError> for ErrorKind {
fn from(err: XmlError) -> Self {
Self::Xml(Rc::new(err))
}
}
impl From<io::Error> for ErrorKind {
fn from(e: io::Error) -> Self {
ErrorKind::IO {
@ -506,8 +495,6 @@ to a missing value in the attribute set(s) included via `with`."#,
write!(f, "Invalid UTF-8 in string")
}
ErrorKind::Xml(error) => write!(f, "failed to serialise to XML: {error}"),
ErrorKind::TvixError(inner_error) => {
write!(f, "{inner_error}")
}
@ -823,7 +810,6 @@ impl Error {
| ErrorKind::JsonError(_)
| ErrorKind::NotSerialisableToJson(_)
| ErrorKind::FromTomlError(_)
| ErrorKind::Xml(_)
| ErrorKind::Utf8
| ErrorKind::TvixError(_)
| ErrorKind::TvixBug { .. }
@ -870,7 +856,6 @@ impl Error {
ErrorKind::UnexpectedArgument { .. } => "E031",
ErrorKind::RelativePathResolution(_) => "E032",
ErrorKind::DivisionByZero => "E033",
ErrorKind::Xml(_) => "E034",
ErrorKind::FromTomlError(_) => "E035",
ErrorKind::NotSerialisableToJson(_) => "E036",
ErrorKind::UnexpectedContext => "E037",

View file

@ -0,0 +1,5 @@
<?xml version='1.0' encoding='utf-8'?>
<expr>
<attrs>
</attrs>
</expr>

View file

@ -0,0 +1 @@
{ }

View file

@ -0,0 +1 @@
"<?xml version='1.0' encoding='utf-8'?>\n<expr>\n <attrs>\n <attr name=\"&amp;-{\">\n <string value=\";&amp;&quot;\" />\n </attr>\n <attr name=\"a\">\n <string value=\"s\" />\n </attr>\n </attrs>\n</expr>\n"

View file

@ -0,0 +1,2 @@
# Check some corner cases regarding escaping.
builtins.toXML { a = "s"; "&-{" = ";&\""; }