feat(corp/data-import): add import of OR 'words_forms' table
This table holds the full set of morphological forms for all the words in the lemmata table (which upstream does not actually call that). Change-Id: I6f5be673c5f59f11e36bd8c8c935844a7d4fd170 Reviewed-on: https://cl.tvl.fyi/c/depot/+/7894 Tested-by: BuildkiteCI Reviewed-by: tazjin <tazjin@tvl.su>
This commit is contained in:
parent
429c0d00c4
commit
8eeb5d3bcc
3 changed files with 69 additions and 6 deletions
|
@ -6,7 +6,7 @@
|
|||
//! introduce things like foreign key constraints between tables that
|
||||
//! represent relations.
|
||||
|
||||
use super::{bail, Ensure};
|
||||
use super::Ensure;
|
||||
use crate::oc_parser::*;
|
||||
use crate::or_parser;
|
||||
use log::{debug, info};
|
||||
|
@ -181,6 +181,16 @@ CREATE TABLE or_words (
|
|||
word_type TEXT,
|
||||
level TEXT
|
||||
) STRICT;
|
||||
|
||||
-- Inflected forms of the entries in or_words (OpenRussian's words_forms table).
-- The foreign key must name or_words: no table called plain "words" is created
-- by this schema, and SQLite rejects writes to a child table whose parent table
-- is missing once foreign key enforcement is switched on.
CREATE TABLE or_words_forms (
    id INTEGER PRIMARY KEY,
    word_id INTEGER NOT NULL,
    form_type TEXT,
    position TEXT,
    form TEXT,
    form_bare TEXT,
    FOREIGN KEY(word_id) REFERENCES or_words(id)
) STRICT;
|
||||
"#,
|
||||
)
|
||||
.ensure("setting up OpenRussian table schema failed");
|
||||
|
@ -215,3 +225,30 @@ VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)
|
|||
|
||||
info!("inserted {} OpenRussian words", count);
|
||||
}
|
||||
|
||||
pub fn insert_or_word_forms<I: Iterator<Item = or_parser::WordForm>>(conn: &Connection, forms: I) {
|
||||
let mut stmt = conn
|
||||
.prepare_cached(
|
||||
"
|
||||
INSERT INTO or_words_forms (id, word_id, form_type, position, form, form_bare)
|
||||
VALUES (?1, ?2, ?3, ?4, ?5, ?6)
|
||||
",
|
||||
)
|
||||
.ensure("failed to prepare OR word forms statement");
|
||||
let mut count = 0;
|
||||
|
||||
for form in forms {
|
||||
stmt.execute((
|
||||
form.id,
|
||||
form.word_id,
|
||||
form.form_type,
|
||||
form.position,
|
||||
form.form,
|
||||
form.form_bare,
|
||||
))
|
||||
.ensure("failed to insert OR word form");
|
||||
count += 1;
|
||||
}
|
||||
|
||||
info!("inserted {} OpenRussian word forms", count);
|
||||
}
|
||||
|
|
|
@ -210,12 +210,23 @@ fn open_russian(conn: &Connection, args: &Args) {
|
|||
|
||||
db_setup::initial_or_schema(conn);
|
||||
|
||||
{
|
||||
let tx = conn
|
||||
.unchecked_transaction()
|
||||
.ensure("failed to start transaction");
|
||||
|
||||
db_setup::insert_or_words(&tx, parser.words());
|
||||
tx.commit().ensure("OpenRussian words commit failed");
|
||||
}
|
||||
|
||||
{
|
||||
let tx = conn
|
||||
.unchecked_transaction()
|
||||
.ensure("failed to start transaction");
|
||||
|
||||
db_setup::insert_or_word_forms(&tx, parser.words_forms());
|
||||
tx.commit().ensure("OpenRussian word forms commit failed");
|
||||
}
|
||||
|
||||
info!("finished OpenRussian import");
|
||||
}
|
||||
|
|
|
@ -33,6 +33,17 @@ pub struct Word {
|
|||
pub created_at: String, // TODO: unknown
|
||||
}
|
||||
|
||||
/// A word form from the `words_forms` table.
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct WordForm {
|
||||
pub id: usize,
|
||||
pub word_id: usize,
|
||||
pub form_type: String,
|
||||
pub position: String,
|
||||
pub form: String,
|
||||
pub form_bare: String,
|
||||
}
|
||||
|
||||
pub struct OpenRussianParser {
|
||||
or_directory: PathBuf,
|
||||
}
|
||||
|
@ -50,6 +61,10 @@ impl OpenRussianParser {
|
|||
self.parser_for("words.csv")
|
||||
}
|
||||
|
||||
/// Returns an iterator over the [`WordForm`] records parsed from
/// `words_forms.csv`.
pub fn words_forms(&self) -> DynIter<WordForm> {
    self.parser_for::<WordForm>("words_forms.csv")
}
|
||||
|
||||
fn parser_for<T: serde::de::DeserializeOwned + 'static>(
|
||||
&self,
|
||||
file_name: &str,
|
||||
|
|
Loading…
Reference in a new issue