feat(corp/data-import): parse lemmas from OpenCorpora dump

Change-Id: I1e4efcfc8e555f61578b563411d5e6ed9590d8e8
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7860
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
This commit is contained in:
Vincent Ambo 2023-01-18 03:22:53 +03:00 committed by tazjin
parent ee7616d956
commit 485c3cc912
2 changed files with 135 additions and 14 deletions

View file

@ -80,11 +80,11 @@ fn main() {
let mut out = BufWriter::new(std::io::stdout().lock());
while let Some(elem) = parser.next_element() {
match elem {
oc_parser::OcElement::Grammeme(g) => {
writeln!(out, "{:?}", g).ensure("writing element failed")
if let oc_parser::OcElement::Lemma(lemma) = elem {
if lemma.lemma.word == "тяжёлый" {
writeln!(out, "{:?}", lemma).ensure("writing output failed");
break;
}
oc_parser::OcElement::Lemma(_) => continue,
}
}

View file

@ -1,5 +1,6 @@
use super::{bail, Ensure};
use log::info;
use std::str::FromStr;
use xml::attribute::OwnedAttribute;
use xml::name::OwnedName;
use xml::reader::XmlEvent;
@ -7,14 +8,26 @@ use xml::EventReader;
#[derive(Default, Debug)]
pub struct Grammeme {
parent: Option<String>,
name: String,
alias: String,
description: String,
pub parent: Option<String>,
pub name: String,
pub alias: String,
pub description: String,
}
#[derive(Debug)]
pub struct Lemma {}
/// Single form of a word (either its lemma, or the variations).
#[derive(Debug, Default)]
pub struct Variation {
pub word: String,
pub grammemes: Vec<String>,
}
#[derive(Debug, Default)]
pub struct Lemma {
pub id: u64,
pub lemma: Variation,
pub grammemes: Vec<String>,
pub variations: Vec<Variation>,
}
#[derive(Debug)]
pub enum OcElement {
@ -34,6 +47,12 @@ enum ParserState {
/// Parser is parsing lemmata.
Lemmata,
/// Parser is inside a lemma's actual lemma.
Lemma,
/// Parser is parsing a morphological variation of a lemma.
Variation,
/// Parser has seen the end of the line and nothing more is
/// available.
Ended,
@ -133,7 +152,7 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
// actual beginning of an actual element, dispatch accordingly
event @ XmlEvent::StartElement {
name, attributes, ..
} => match self.state {
} => match &self.state {
ParserState::Grammemes => {
return Some(OcElement::Grammeme(self.parse_grammeme(name, attributes)))
}
@ -145,6 +164,11 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
"parser received an unexpected start element while in state {:?}: {:?}",
self.state, event
)),
other => bail(format!(
"next_element() called while parser was in state {:?}",
other
)),
},
// finally, events that indicate a bug if they're
@ -199,6 +223,7 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
}
}
/// Parse a single `<grammeme>` tag.
fn parse_grammeme(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> Grammeme {
if name.local_name != "grammeme" {
bail(format!(
@ -247,7 +272,7 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
grammeme
}
fn parse_lemma(&mut self, name: &OwnedName, _attributes: &[OwnedAttribute]) -> Lemma {
fn parse_lemma(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> Lemma {
if name.local_name != "lemma" {
bail(format!(
"expected to parse a lemma, but found <{}>",
@ -255,8 +280,104 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
));
}
self.skip_section("lemma");
self.state = ParserState::Lemma;
let mut lemma = Lemma::default();
Lemma {}
for attr in attributes {
if attr.name.local_name == "id" {
lemma.id = u64::from_str(&attr.value).ensure("failed to parse lemma ID");
}
}
loop {
match self.next() {
// <lemma> has ended
XmlEvent::EndElement { name } if name.local_name == "lemma" => {
self.state = ParserState::Lemmata;
return lemma;
}
// actual lemma content
XmlEvent::StartElement {
name, attributes, ..
} => {
match name.local_name.as_str() {
// beginning to parse the lemma itself
"l" => {
lemma.lemma.word = attributes
.into_iter()
.find(|attr| attr.name.local_name == "t")
.map(|attr| attr.value)
.ensure(format!("lemma {} had no actual word", lemma.id));
}
// parsing a lemma variation
"f" => {
self.state = ParserState::Variation;
let word = attributes
.into_iter()
.find(|attr| attr.name.local_name == "t")
.map(|attr| attr.value)
.ensure(format!(
"variation of lemma {} had no actual word",
lemma.id
));
lemma.variations.push(Variation {
word,
grammemes: vec![],
});
}
// parse a grammeme association
"g" => {
let grammeme = attributes
.into_iter()
.find(|attr| attr.name.local_name == "v")
.map(|attr| attr.value)
.ensure(format!(
"grammeme association in lemma {} missing ID",
lemma.id
));
match self.state {
ParserState::Lemma => {
lemma.grammemes.push(grammeme);
}
ParserState::Variation => {
lemma
.variations
.last_mut()
.ensure("variations should be non-empty")
.grammemes
.push(grammeme);
}
_ => bail(format!("invalid parser state: encountered grammeme association while in {:?}", self.state)),
}
}
other => bail(format!("unexpected element while parsing lemma: {other}")),
};
}
XmlEvent::EndElement { name } => match name.local_name.as_str() {
"l" if self.state == ParserState::Lemma => continue,
"f" if self.state == ParserState::Variation => {
self.state = ParserState::Lemma;
continue;
}
"g" => continue,
other => bail(format!(
"unexpected </{other}> while parsing lemma {}",
lemma.id
)),
},
_ => continue,
}
}
}
}