feat(corp/data-import): parse and import link types

Change-Id: Iae01d1dc6894117dc693b4690d8bc79861212ae6
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7863
Tested-by: BuildkiteCI
Reviewed-by: tazjin <tazjin@tvl.su>
This commit is contained in:
Vincent Ambo 2023-01-18 15:48:33 +03:00 committed by tazjin
parent 3f0b1d8e0b
commit dc55ea3201
2 changed files with 54 additions and 2 deletions

View file

@ -50,6 +50,12 @@ CREATE TABLE word_grammemes (
FOREIGN KEY(word) REFERENCES words(ROWID) FOREIGN KEY(word) REFERENCES words(ROWID)
) STRICT; ) STRICT;
-- table for link types
CREATE TABLE link_types (
id INTEGER PRIMARY KEY,
name TEXT
) STRICT;
"#, "#,
) )
.ensure("setting up initial table schema failed"); .ensure("setting up initial table schema failed");
@ -76,6 +82,16 @@ pub fn insert_oc_element(conn: &Connection, elem: OcElement) {
} }
OcElement::Lemma(lemma) => insert_lemma(conn, lemma), OcElement::Lemma(lemma) => insert_lemma(conn, lemma),
OcElement::LinkType(lt) => {
conn.execute(
"INSERT INTO link_types (id, name) VALUES (?1, ?2)",
(&lt.id, &lt.name),
)
.ensure("failed to insert link type");
info!("inserted link type {}", lt.name);
}
} }
} }

View file

@ -29,10 +29,17 @@ pub struct Lemma {
pub variations: Vec<Variation>, pub variations: Vec<Variation>,
} }
/// A link type parsed from a `<type>` element of the `link_types` section.
///
/// `Default` yields an `id` of `0` and an empty `name`. `Clone`, `PartialEq`
/// and `Eq` are derived so that values can be duplicated and compared
/// directly (e.g. with `assert_eq!`).
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct LinkType {
    /// Numeric identifier, read from the element's `id` attribute.
    pub id: u64,
    /// Name of the link type (presumably the element's text content; it is
    /// whatever the parser's `parse_string("type")` returns).
    pub name: String,
}
#[derive(Debug)] #[derive(Debug)]
pub enum OcElement { pub enum OcElement {
Grammeme(Grammeme), Grammeme(Grammeme),
Lemma(Lemma), Lemma(Lemma),
LinkType(LinkType),
} }
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq)]
@ -53,6 +60,9 @@ enum ParserState {
/// Parser is parsing a morphological variation of a lemma. /// Parser is parsing a morphological variation of a lemma.
Variation, Variation,
/// Parser is parsing link types.
LinkTypes,
/// Parser has seen the end of the line and nothing more is /// Parser has seen the end of the line and nothing more is
/// available. /// available.
Ended, Ended,
@ -77,8 +87,8 @@ enum SectionState {
fn section_state(section: &str) -> SectionState { fn section_state(section: &str) -> SectionState {
match section { match section {
"grammemes" | "lemmata" => SectionState::Active, "grammemes" | "lemmata" | "link_types" => SectionState::Active,
"restrictions" | "link_types" | "links" => SectionState::Inactive, "restrictions" | "links" => SectionState::Inactive,
_ => SectionState::Unknown, _ => SectionState::Unknown,
} }
} }
@ -137,6 +147,7 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
self.state = match name.local_name.as_str() { self.state = match name.local_name.as_str() {
"grammemes" => ParserState::Grammemes, "grammemes" => ParserState::Grammemes,
"lemmata" => ParserState::Lemmata, "lemmata" => ParserState::Lemmata,
"link_types" => ParserState::LinkTypes,
_ => unreachable!(), _ => unreachable!(),
}; };
} }
@ -156,10 +167,15 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
ParserState::Grammemes => { ParserState::Grammemes => {
return Some(OcElement::Grammeme(self.parse_grammeme(name, attributes))) return Some(OcElement::Grammeme(self.parse_grammeme(name, attributes)))
} }
ParserState::Lemmata => { ParserState::Lemmata => {
return Some(OcElement::Lemma(self.parse_lemma(name, attributes))) return Some(OcElement::Lemma(self.parse_lemma(name, attributes)))
} }
ParserState::LinkTypes => {
return Some(OcElement::LinkType(self.parse_link_type(name, attributes)))
}
ParserState::Init | ParserState::Ended => bail(format!( ParserState::Init | ParserState::Ended => bail(format!(
"parser received an unexpected start element while in state {:?}: {:?}", "parser received an unexpected start element while in state {:?}: {:?}",
self.state, event self.state, event
@ -380,4 +396,24 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
} }
} }
} }
fn parse_link_type(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> LinkType {
if name.local_name != "type" {
bail(format!(
"expected to parse a link type, but found <{}>",
name.local_name
));
}
let mut link_type = LinkType::default();
for attr in attributes {
if attr.name.local_name == "id" {
link_type.id = u64::from_str(&attr.value).ensure("failed to parse link type ID");
}
}
link_type.name = self.parse_string("type");
link_type
}
} }