feat(corp/data-import): parse and import links

Change-Id: Iebdbc8f884f28064d7b00b8f8808b5030fa3d05c
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7864
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
This commit is contained in:
Vincent Ambo 2023-01-18 18:23:16 +03:00 committed by tazjin
parent dc55ea3201
commit 476e312c06
2 changed files with 78 additions and 3 deletions

View file

@ -56,6 +56,17 @@ CREATE TABLE link_types (
name TEXT
) STRICT;
-- table for links between lemmata
CREATE TABLE links (
id INTEGER PRIMARY KEY,
link_type INTEGER NOT NULL,
from_lemma INTEGER NOT NULL,
to_lemma INTEGER NOT NULL,
FOREIGN KEY(link_type) REFERENCES link_types(id),
FOREIGN KEY(from_lemma) REFERENCES lemmas(id),
FOREIGN KEY(to_lemma) REFERENCES lemmas(id)
) STRICT;
"#,
)
.ensure("setting up initial table schema failed");
@ -92,6 +103,19 @@ pub fn insert_oc_element(conn: &Connection, elem: OcElement) {
info!("inserted link type {}", lt.name);
}
OcElement::Link(link) => {
let mut stmt = conn
.prepare_cached(
"INSERT INTO links (id, link_type, from_lemma, to_lemma) VALUES (?1, ?2, ?3, ?4)",
)
.ensure("failed to prepare link statement");
stmt.execute((&link.id, &link.link_type, &link.from, &link.to))
.ensure("failed to insert link");
debug!("inserted link {}", link.id);
}
}
}

View file

@ -1,5 +1,5 @@
use super::{bail, Ensure};
use log::info;
use log::{info, warn};
use std::str::FromStr;
use xml::attribute::OwnedAttribute;
use xml::name::OwnedName;
@ -35,11 +35,20 @@ pub struct LinkType {
pub name: String,
}
/// A single `<link>` element: a typed, directed connection from one
/// lemma to another, with every participant referenced by numeric ID.
///
/// `parse_link` starts from `Link::default()` and only overwrites the
/// fields whose attribute is present on the element, so an attribute
/// that is missing from the input leaves its field at `0`.
#[derive(Debug, Default)]
pub struct Link {
/// ID of the link itself (the `id` attribute).
pub id: u64,
/// ID of the lemma the link starts at (the `from` attribute).
pub from: u64,
/// ID of the lemma the link points to (the `to` attribute).
pub to: u64,
/// ID of the link type (the `type` attribute); stored in the
/// `link_type` column, which references `link_types(id)`.
pub link_type: u64,
}
/// One parsed element, tagged by the kind of entity it carries.
///
/// The parser hands these out one at a time and `insert_oc_element`
/// matches on the variant to decide which table the payload goes into.
#[derive(Debug)]
pub enum OcElement {
/// A parsed [`Grammeme`].
Grammeme(Grammeme),
/// A parsed [`Lemma`].
Lemma(Lemma),
/// A parsed [`LinkType`], i.e. the ID and name of a kind of link.
LinkType(LinkType),
/// A parsed [`Link`] between two lemmata.
Link(Link),
}
#[derive(Debug, PartialEq)]
@ -63,6 +72,9 @@ enum ParserState {
/// Parser is parsing link types.
LinkTypes,
/// Parser is parsing links.
Links,
/// Parser has seen the end of the line and nothing more is
/// available.
Ended,
@ -87,8 +99,8 @@ enum SectionState {
/// Classifies a section name by how the parser should treat it.
///
/// * `grammemes`, `lemmata`, `link_types` and `links` are
///   [`SectionState::Active`]; these are the names the parser has a
///   dedicated `ParserState` for.
/// * `restrictions` is [`SectionState::Inactive`]: a known section that
///   is not parsed (presumably skipped by the caller).
/// * Any other name is [`SectionState::Unknown`].
///
/// Each name appears in exactly one arm. A `match` takes the first arm
/// that fits, so listing `links` under both `Inactive` and `Active`
/// would silently keep it inactive.
fn section_state(section: &str) -> SectionState {
    match section {
        "grammemes" | "lemmata" | "link_types" | "links" => SectionState::Active,
        "restrictions" => SectionState::Inactive,
        _ => SectionState::Unknown,
    }
}
@ -148,6 +160,7 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
"grammemes" => ParserState::Grammemes,
"lemmata" => ParserState::Lemmata,
"link_types" => ParserState::LinkTypes,
"links" => ParserState::Links,
_ => unreachable!(),
};
}
@ -176,6 +189,10 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
return Some(OcElement::LinkType(self.parse_link_type(name, attributes)))
}
ParserState::Links if name.local_name == "link" => {
return Some(OcElement::Link(self.parse_link(attributes)))
}
ParserState::Init | ParserState::Ended => bail(format!(
"parser received an unexpected start element while in state {:?}: {:?}",
self.state, event
@ -416,4 +433,38 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
link_type.name = self.parse_string("type");
link_type
}
/// Builds a [`Link`] from the attributes of a `<link>` start tag.
///
/// The attributes `id`, `from`, `to` and `type` are parsed as `u64`
/// and stored in the matching field; a value that does not parse is
/// reported through `ensure`. Any other attribute is logged as a
/// warning and ignored. Fields whose attribute is absent keep the
/// value from `Link::default()`.
fn parse_link(&mut self, attributes: &[OwnedAttribute]) -> Link {
    let mut link = Link::default();

    for attr in attributes {
        // Pick the destination first, so the numeric parse is written
        // once and only runs for attributes we actually recognise.
        let field = match attr.name.local_name.as_str() {
            "id" => &mut link.id,
            "from" => &mut link.from,
            "to" => &mut link.to,
            "type" => &mut link.link_type,
            unknown => {
                warn!("unexpected attribute {} on <link>", unknown);
                continue;
            }
        };

        *field = u64::from_str(&attr.value).ensure("failed to parse link field");
    }

    // <link> elements carry no content, so the closing tag is expected
    // to follow immediately; consume it before handing the link back.
    self.skip_section("link");

    link
}
}