feat(corp/data-import): add import of OR 'words_forms' table

This table contains the full set of morphological forms for all the
words from the lemmata table (although they don't call it that).

Change-Id: I6f5be673c5f59f11e36bd8c8c935844a7d4fd170
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7894
Tested-by: BuildkiteCI
Reviewed-by: tazjin <tazjin@tvl.su>
This commit is contained in:
Vincent Ambo 2023-01-21 18:00:16 +03:00 committed by tazjin
parent 429c0d00c4
commit 8eeb5d3bcc
3 changed files with 69 additions and 6 deletions

View file

@ -6,7 +6,7 @@
//! introduce things like foreign key constraints between tables that //! introduce things like foreign key constraints between tables that
//! represent relations. //! represent relations.
use super::{bail, Ensure}; use super::Ensure;
use crate::oc_parser::*; use crate::oc_parser::*;
use crate::or_parser; use crate::or_parser;
use log::{debug, info}; use log::{debug, info};
@ -181,6 +181,16 @@ CREATE TABLE or_words (
word_type TEXT, word_type TEXT,
level TEXT level TEXT
) STRICT; ) STRICT;
-- Morphological forms of the OpenRussian words; each row belongs to one or_words entry.
CREATE TABLE or_words_forms (
    id INTEGER PRIMARY KEY,
    word_id INTEGER NOT NULL,
    form_type TEXT,
    position TEXT,
    form TEXT,
    form_bare TEXT,
    -- The parent table is or_words (created above), not "words".
    FOREIGN KEY(word_id) REFERENCES or_words(id)
) STRICT;
"#, "#,
) )
.ensure("setting up OpenRussian table schema failed"); .ensure("setting up OpenRussian table schema failed");
@ -215,3 +225,30 @@ VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)
info!("inserted {} OpenRussian words", count); info!("inserted {} OpenRussian words", count);
} }
/// Writes every OpenRussian word form yielded by `forms` into the
/// `or_words_forms` table through `conn`.
///
/// One cached prepared statement is reused for all rows. A failure while
/// preparing the statement or while inserting a row is handed to
/// `Ensure::ensure` together with a descriptive message. Once the iterator
/// is exhausted, the number of processed forms is logged at info level.
pub fn insert_or_word_forms<I: Iterator<Item = or_parser::WordForm>>(conn: &Connection, forms: I) {
    let mut insert = conn
        .prepare_cached(
            "
INSERT INTO or_words_forms (id, word_id, form_type, position, form, form_bare)
VALUES (?1, ?2, ?3, ?4, ?5, ?6)
",
        )
        .ensure("failed to prepare OR word forms statement");

    // Fold over the rows so the running total doubles as the accumulator.
    let count = forms.fold(0, |inserted, row| {
        let params = (
            row.id,
            row.word_id,
            row.form_type,
            row.position,
            row.form,
            row.form_bare,
        );
        insert
            .execute(params)
            .ensure("failed to insert OR word form");
        inserted + 1
    });

    info!("inserted {} OpenRussian word forms", count);
}

View file

@ -210,12 +210,23 @@ fn open_russian(conn: &Connection, args: &Args) {
db_setup::initial_or_schema(conn); db_setup::initial_or_schema(conn);
let tx = conn {
.unchecked_transaction() let tx = conn
.ensure("failed to start transaction"); .unchecked_transaction()
.ensure("failed to start transaction");
db_setup::insert_or_words(&tx, parser.words()); db_setup::insert_or_words(&tx, parser.words());
tx.commit().ensure("OpenRussian words commit failed"); tx.commit().ensure("OpenRussian words commit failed");
}
{
let tx = conn
.unchecked_transaction()
.ensure("failed to start transaction");
db_setup::insert_or_word_forms(&tx, parser.words_forms());
tx.commit().ensure("OpenRussian word forms commit failed");
}
info!("finished OpenRussian import"); info!("finished OpenRussian import");
} }

View file

@ -33,6 +33,17 @@ pub struct Word {
pub created_at: String, // TODO: unknown pub created_at: String, // TODO: unknown
} }
/// A word form from the `words_forms` table.
///
/// Values of this type are produced by `OpenRussianParser::words_forms`,
/// which reads `words_forms.csv`, and are built through the derived
/// `Deserialize` implementation. `insert_or_word_forms` stores each field
/// in the `or_words_forms` column of the same name.
#[derive(Debug, Deserialize)]
pub struct WordForm {
/// Identifier of this form row (primary key of `or_words_forms`).
pub id: usize,
/// Identifier of the word this form belongs to.
pub word_id: usize,
/// NOTE(review): presumably the grammatical category of this form — confirm against the data.
pub form_type: String,
/// NOTE(review): kept as text; meaning not visible here — confirm against the data.
pub position: String,
/// The word form itself.
pub form: String,
/// NOTE(review): presumably `form` without stress/accent marks — confirm against the data.
pub form_bare: String,
}
pub struct OpenRussianParser { pub struct OpenRussianParser {
or_directory: PathBuf, or_directory: PathBuf,
} }
@ -50,6 +61,10 @@ impl OpenRussianParser {
self.parser_for("words.csv") self.parser_for("words.csv")
} }
/// Returns an iterator over the [`WordForm`] entries read from
/// `words_forms.csv`, built with the same `parser_for` helper that the
/// `words.csv` accessor uses.
pub fn words_forms(&self) -> DynIter<WordForm> {
    self.parser_for::<WordForm>("words_forms.csv")
}
fn parser_for<T: serde::de::DeserializeOwned + 'static>( fn parser_for<T: serde::de::DeserializeOwned + 'static>(
&self, &self,
file_name: &str, file_name: &str,