feat(corp/data-import): parse and import links
Change-Id: Iebdbc8f884f28064d7b00b8f8808b5030fa3d05c Reviewed-on: https://cl.tvl.fyi/c/depot/+/7864 Reviewed-by: tazjin <tazjin@tvl.su> Tested-by: BuildkiteCI
This commit is contained in:
parent
dc55ea3201
commit
476e312c06
2 changed files with 78 additions and 3 deletions
|
@ -56,6 +56,17 @@ CREATE TABLE link_types (
|
||||||
name TEXT
|
name TEXT
|
||||||
) STRICT;
|
) STRICT;
|
||||||
|
|
||||||
|
-- table for links between lemmata
|
||||||
|
CREATE TABLE links (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
link_type INTEGER NOT NULL,
|
||||||
|
from_lemma INTEGER NOT NULL,
|
||||||
|
to_lemma INTEGER NOT NULL,
|
||||||
|
FOREIGN KEY(link_type) REFERENCES link_types(id),
|
||||||
|
FOREIGN KEY(from_lemma) REFERENCES lemmas(id),
|
||||||
|
FOREIGN KEY(to_lemma) REFERENCES lemmas(id)
|
||||||
|
) STRICT;
|
||||||
|
|
||||||
"#,
|
"#,
|
||||||
)
|
)
|
||||||
.ensure("setting up initial table schema failed");
|
.ensure("setting up initial table schema failed");
|
||||||
|
@ -92,6 +103,19 @@ pub fn insert_oc_element(conn: &Connection, elem: OcElement) {
|
||||||
|
|
||||||
info!("inserted link type {}", lt.name);
|
info!("inserted link type {}", lt.name);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
OcElement::Link(link) => {
|
||||||
|
let mut stmt = conn
|
||||||
|
.prepare_cached(
|
||||||
|
"INSERT INTO links (id, link_type, from_lemma, to_lemma) VALUES (?1, ?2, ?3, ?4)",
|
||||||
|
)
|
||||||
|
.ensure("failed to prepare link statement");
|
||||||
|
|
||||||
|
stmt.execute((&link.id, &link.link_type, &link.from, &link.to))
|
||||||
|
.ensure("failed to insert link");
|
||||||
|
|
||||||
|
debug!("inserted link {}", link.id);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
use super::{bail, Ensure};
|
use super::{bail, Ensure};
|
||||||
use log::info;
|
use log::{info, warn};
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
use xml::attribute::OwnedAttribute;
|
use xml::attribute::OwnedAttribute;
|
||||||
use xml::name::OwnedName;
|
use xml::name::OwnedName;
|
||||||
|
@ -35,11 +35,20 @@ pub struct LinkType {
|
||||||
pub name: String,
|
pub name: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A link from one lemma to another, as described by a `<link>` element.
///
/// All fields are plain numeric identifiers. `Default` is derived so that
/// the parser can start from an all-zero value and fill in each field as
/// the corresponding attribute is encountered.
#[derive(Debug, Default)]
|
||||||
|
pub struct Link {
|
||||||
|
pub id: u64, // identifier of the link itself (`id` attribute)
|
||||||
|
pub from: u64, // identifier of the lemma the link starts at (`from` attribute)
|
||||||
|
pub to: u64, // identifier of the lemma the link points to (`to` attribute)
|
||||||
|
pub link_type: u64, // identifier of the link type (`type` attribute)
|
||||||
|
}
|
||||||
|
|
||||||
// One element produced by the parser; each variant wraps the struct that
// was parsed for that kind of element. Shown in both diff columns; the
// right-hand column additionally carries the `Link` variant.
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub enum OcElement {
|
pub enum OcElement {
|
||||||
Grammeme(Grammeme),
|
Grammeme(Grammeme),
|
||||||
Lemma(Lemma),
|
Lemma(Lemma),
|
||||||
LinkType(LinkType),
|
LinkType(LinkType),
|
||||||
|
// A link between two lemmata (only present in the right-hand column).
Link(Link),
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug, PartialEq)]
|
||||||
|
@ -63,6 +72,9 @@ enum ParserState {
|
||||||
/// Parser is parsing link types.
|
/// Parser is parsing link types.
|
||||||
LinkTypes,
|
LinkTypes,
|
||||||
|
|
||||||
|
/// Parser is parsing links.
|
||||||
|
Links,
|
||||||
|
|
||||||
/// Parser has seen the end of the line and nothing more is
|
/// Parser has seen the end of the line and nothing more is
|
||||||
/// available.
|
/// available.
|
||||||
Ended,
|
Ended,
|
||||||
|
@ -87,8 +99,8 @@ enum SectionState {
|
||||||
|
|
||||||
// Classifies a section name as `Active`, `Inactive` or `Unknown`.
//
// Both diff columns are shown below. They differ only in how "links" is
// classified: the left-hand column lists it as `Inactive`, the right-hand
// column lists it as `Active`. Any name not listed yields `Unknown`.
fn section_state(section: &str) -> SectionState {
|
fn section_state(section: &str) -> SectionState {
|
||||||
match section {
|
match section {
|
||||||
"grammemes" | "lemmata" | "link_types" => SectionState::Active,
|
"grammemes" | "lemmata" | "link_types" | "links" => SectionState::Active,
|
||||||
"restrictions" | "links" => SectionState::Inactive,
|
"restrictions" => SectionState::Inactive,
|
||||||
_ => SectionState::Unknown,
|
_ => SectionState::Unknown,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -148,6 +160,7 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
|
||||||
"grammemes" => ParserState::Grammemes,
|
"grammemes" => ParserState::Grammemes,
|
||||||
"lemmata" => ParserState::Lemmata,
|
"lemmata" => ParserState::Lemmata,
|
||||||
"link_types" => ParserState::LinkTypes,
|
"link_types" => ParserState::LinkTypes,
|
||||||
|
"links" => ParserState::Links,
|
||||||
_ => unreachable!(),
|
_ => unreachable!(),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -176,6 +189,10 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
|
||||||
return Some(OcElement::LinkType(self.parse_link_type(name, attributes)))
|
return Some(OcElement::LinkType(self.parse_link_type(name, attributes)))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ParserState::Links if name.local_name == "link" => {
|
||||||
|
return Some(OcElement::Link(self.parse_link(attributes)))
|
||||||
|
}
|
||||||
|
|
||||||
ParserState::Init | ParserState::Ended => bail(format!(
|
ParserState::Init | ParserState::Ended => bail(format!(
|
||||||
"parser received an unexpected start element while in state {:?}: {:?}",
|
"parser received an unexpected start element while in state {:?}: {:?}",
|
||||||
self.state, event
|
self.state, event
|
||||||
|
@ -416,4 +433,38 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
|
||||||
link_type.name = self.parse_string("type");
|
link_type.name = self.parse_string("type");
|
||||||
link_type
|
link_type
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Builds a [`Link`] from the attributes of a `<link>` start element.
///
/// The recognised attributes are `id`, `from`, `to` and `type`; each value
/// is parsed as a `u64` and stored in the matching field. Any other
/// attribute is logged with `warn!` and ignored. A field whose attribute is
/// absent keeps the value from `Link::default()`.
///
/// After the attributes are processed, `skip_section("link")` is called
/// before the link is returned.
///
/// NOTE(review): `.ensure(..)` presumably aborts with the given message when
/// a value is not a valid `u64` — confirm against the `Ensure` trait.
fn parse_link(&mut self, attributes: &[OwnedAttribute]) -> Link {
|
||||||
|
let mut link = Link::default();
|
||||||
|
|
||||||
|
for attr in attributes {
|
||||||
|
// Lazy on purpose: the value is only parsed when one of the known
// attribute arms below calls the closure, so values of unexpected
// attributes are never parsed.
let i_val = || u64::from_str(&attr.value).ensure("failed to parse link field");
|
||||||
|
|
||||||
|
match attr.name.local_name.as_str() {
|
||||||
|
"id" => {
|
||||||
|
link.id = i_val();
|
||||||
|
}
|
||||||
|
"from" => {
|
||||||
|
link.from = i_val();
|
||||||
|
}
|
||||||
|
"to" => {
|
||||||
|
link.to = i_val();
|
||||||
|
}
|
||||||
|
// The XML attribute is called `type`; the field is `link_type`.
"type" => {
|
||||||
|
link.link_type = i_val();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Unknown attributes are reported but do not stop the import.
other => {
|
||||||
|
warn!("unexpected attribute {} on <link>", other);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Expect the end of the <link> element; since these elements
|
||||||
|
// are empty, it should follow immediately.
|
||||||
|
self.skip_section("link");
|
||||||
|
|
||||||
|
link
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue