feat(corp/data-import): parse and import link types
Change-Id: Iae01d1dc6894117dc693b4690d8bc79861212ae6
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7863
Tested-by: BuildkiteCI
Reviewed-by: tazjin <tazjin@tvl.su>
This commit is contained in:
parent
3f0b1d8e0b
commit
dc55ea3201
2 changed files with 54 additions and 2 deletions
|
@ -50,6 +50,12 @@ CREATE TABLE word_grammemes (
|
||||||
FOREIGN KEY(word) REFERENCES words(ROWID)
|
FOREIGN KEY(word) REFERENCES words(ROWID)
|
||||||
) STRICT;
|
) STRICT;
|
||||||
|
|
||||||
|
-- table for link types
|
||||||
|
CREATE TABLE link_types (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
name TEXT
|
||||||
|
) STRICT;
|
||||||
|
|
||||||
"#,
|
"#,
|
||||||
)
|
)
|
||||||
.ensure("setting up initial table schema failed");
|
.ensure("setting up initial table schema failed");
|
||||||
|
@ -76,6 +82,16 @@ pub fn insert_oc_element(conn: &Connection, elem: OcElement) {
|
||||||
}
|
}
|
||||||
|
|
||||||
OcElement::Lemma(lemma) => insert_lemma(conn, lemma),
|
OcElement::Lemma(lemma) => insert_lemma(conn, lemma),
|
||||||
|
|
||||||
|
OcElement::LinkType(lt) => {
|
||||||
|
conn.execute(
|
||||||
|
"INSERT INTO link_types (id, name) VALUES (?1, ?2)",
|
||||||
|
(&lt.id, &lt.name),
|
||||||
|
)
|
||||||
|
.ensure("failed to insert link type");
|
||||||
|
|
||||||
|
info!("inserted link type {}", lt.name);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -29,10 +29,17 @@ pub struct Lemma {
|
||||||
pub variations: Vec<Variation>,
|
pub variations: Vec<Variation>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A link type parsed from a `<type>` element of the `link_types` section.
///
/// Values are produced by `OpenCorporaParser::parse_link_type` and handed
/// to consumers wrapped in `OcElement::LinkType`.
///
/// Besides `Debug` and `Default`, the type derives `Clone`, `PartialEq` and
/// `Eq` so that parsed values can be duplicated and compared directly.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct LinkType {
    /// Numeric identifier, read from the element's `id` attribute. Keeps the
    /// default value of `0` when the parser finds no such attribute.
    pub id: u64,

    /// Name of the link type, as returned by the parser's string helper for
    /// the `type` element.
    pub name: String,
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub enum OcElement {
|
pub enum OcElement {
|
||||||
Grammeme(Grammeme),
|
Grammeme(Grammeme),
|
||||||
Lemma(Lemma),
|
Lemma(Lemma),
|
||||||
|
LinkType(LinkType),
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug, PartialEq)]
|
||||||
|
@ -53,6 +60,9 @@ enum ParserState {
|
||||||
/// Parser is parsing a morphological variation of a lemma.
|
/// Parser is parsing a morphological variation of a lemma.
|
||||||
Variation,
|
Variation,
|
||||||
|
|
||||||
|
/// Parser is parsing link types.
|
||||||
|
LinkTypes,
|
||||||
|
|
||||||
/// Parser has seen the end of the line and nothing more is
|
/// Parser has seen the end of the line and nothing more is
|
||||||
/// available.
|
/// available.
|
||||||
Ended,
|
Ended,
|
||||||
|
@ -77,8 +87,8 @@ enum SectionState {
|
||||||
|
|
||||||
fn section_state(section: &str) -> SectionState {
|
fn section_state(section: &str) -> SectionState {
|
||||||
match section {
|
match section {
|
||||||
"grammemes" | "lemmata" => SectionState::Active,
|
"grammemes" | "lemmata" | "link_types" => SectionState::Active,
|
||||||
"restrictions" | "link_types" | "links" => SectionState::Inactive,
|
"restrictions" | "links" => SectionState::Inactive,
|
||||||
_ => SectionState::Unknown,
|
_ => SectionState::Unknown,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -137,6 +147,7 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
|
||||||
self.state = match name.local_name.as_str() {
|
self.state = match name.local_name.as_str() {
|
||||||
"grammemes" => ParserState::Grammemes,
|
"grammemes" => ParserState::Grammemes,
|
||||||
"lemmata" => ParserState::Lemmata,
|
"lemmata" => ParserState::Lemmata,
|
||||||
|
"link_types" => ParserState::LinkTypes,
|
||||||
_ => unreachable!(),
|
_ => unreachable!(),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -156,10 +167,15 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
|
||||||
ParserState::Grammemes => {
|
ParserState::Grammemes => {
|
||||||
return Some(OcElement::Grammeme(self.parse_grammeme(name, attributes)))
|
return Some(OcElement::Grammeme(self.parse_grammeme(name, attributes)))
|
||||||
}
|
}
|
||||||
|
|
||||||
ParserState::Lemmata => {
|
ParserState::Lemmata => {
|
||||||
return Some(OcElement::Lemma(self.parse_lemma(name, attributes)))
|
return Some(OcElement::Lemma(self.parse_lemma(name, attributes)))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ParserState::LinkTypes => {
|
||||||
|
return Some(OcElement::LinkType(self.parse_link_type(name, attributes)))
|
||||||
|
}
|
||||||
|
|
||||||
ParserState::Init | ParserState::Ended => bail(format!(
|
ParserState::Init | ParserState::Ended => bail(format!(
|
||||||
"parser received an unexpected start element while in state {:?}: {:?}",
|
"parser received an unexpected start element while in state {:?}: {:?}",
|
||||||
self.state, event
|
self.state, event
|
||||||
|
@ -380,4 +396,24 @@ impl<R: std::io::Read> OpenCorporaParser<R> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn parse_link_type(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> LinkType {
|
||||||
|
if name.local_name != "type" {
|
||||||
|
bail(format!(
|
||||||
|
"expected to parse a link type, but found <{}>",
|
||||||
|
name.local_name
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut link_type = LinkType::default();
|
||||||
|
|
||||||
|
for attr in attributes {
|
||||||
|
if attr.name.local_name == "id" {
|
||||||
|
link_type.id = u64::from_str(&attr.value).ensure("failed to parse link type ID");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
link_type.name = self.parse_string("type");
|
||||||
|
link_type
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue