feat(corp/data-import): add import of OR 'words_forms' table
This table contains the full set of morphological forms for every word in the lemmata table (although upstream does not call it that). Change-Id: I6f5be673c5f59f11e36bd8c8c935844a7d4fd170 Reviewed-on: https://cl.tvl.fyi/c/depot/+/7894 Tested-by: BuildkiteCI Reviewed-by: tazjin <tazjin@tvl.su>
This commit is contained in:
parent
429c0d00c4
commit
8eeb5d3bcc
3 changed files with 69 additions and 6 deletions
|
@ -6,7 +6,7 @@
|
||||||
//! introduce things like foreign key constraints between tables that
|
//! introduce things like foreign key constraints between tables that
|
||||||
//! represent relations.
|
//! represent relations.
|
||||||
|
|
||||||
use super::{bail, Ensure};
|
use super::Ensure;
|
||||||
use crate::oc_parser::*;
|
use crate::oc_parser::*;
|
||||||
use crate::or_parser;
|
use crate::or_parser;
|
||||||
use log::{debug, info};
|
use log::{debug, info};
|
||||||
|
@ -181,6 +181,16 @@ CREATE TABLE or_words (
|
||||||
word_type TEXT,
|
word_type TEXT,
|
||||||
level TEXT
|
level TEXT
|
||||||
) STRICT;
|
) STRICT;
|
||||||
|
|
||||||
|
-- Word forms imported from the OpenRussian `words_forms` table; each row
-- belongs to one entry of the `or_words` table via `word_id`.
CREATE TABLE or_words_forms (
    id INTEGER PRIMARY KEY,
    word_id INTEGER NOT NULL,
    form_type TEXT,
    position TEXT,
    form TEXT,
    form_bare TEXT,
    -- The parent table in this schema is `or_words`; referencing a table
    -- that does not exist breaks inserts once foreign keys are enforced.
    FOREIGN KEY(word_id) REFERENCES or_words(id)
) STRICT;
|
||||||
"#,
|
"#,
|
||||||
)
|
)
|
||||||
.ensure("setting up OpenRussian table schema failed");
|
.ensure("setting up OpenRussian table schema failed");
|
||||||
|
@ -215,3 +225,30 @@ VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)
|
||||||
|
|
||||||
info!("inserted {} OpenRussian words", count);
|
info!("inserted {} OpenRussian words", count);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn insert_or_word_forms<I: Iterator<Item = or_parser::WordForm>>(conn: &Connection, forms: I) {
|
||||||
|
let mut stmt = conn
|
||||||
|
.prepare_cached(
|
||||||
|
"
|
||||||
|
INSERT INTO or_words_forms (id, word_id, form_type, position, form, form_bare)
|
||||||
|
VALUES (?1, ?2, ?3, ?4, ?5, ?6)
|
||||||
|
",
|
||||||
|
)
|
||||||
|
.ensure("failed to prepare OR word forms statement");
|
||||||
|
let mut count = 0;
|
||||||
|
|
||||||
|
for form in forms {
|
||||||
|
stmt.execute((
|
||||||
|
form.id,
|
||||||
|
form.word_id,
|
||||||
|
form.form_type,
|
||||||
|
form.position,
|
||||||
|
form.form,
|
||||||
|
form.form_bare,
|
||||||
|
))
|
||||||
|
.ensure("failed to insert OR word form");
|
||||||
|
count += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("inserted {} OpenRussian word forms", count);
|
||||||
|
}
|
||||||
|
|
|
@ -210,12 +210,23 @@ fn open_russian(conn: &Connection, args: &Args) {
|
||||||
|
|
||||||
db_setup::initial_or_schema(conn);
|
db_setup::initial_or_schema(conn);
|
||||||
|
|
||||||
|
{
|
||||||
let tx = conn
|
let tx = conn
|
||||||
.unchecked_transaction()
|
.unchecked_transaction()
|
||||||
.ensure("failed to start transaction");
|
.ensure("failed to start transaction");
|
||||||
|
|
||||||
db_setup::insert_or_words(&tx, parser.words());
|
db_setup::insert_or_words(&tx, parser.words());
|
||||||
tx.commit().ensure("OpenRussian words commit failed");
|
tx.commit().ensure("OpenRussian words commit failed");
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
let tx = conn
|
||||||
|
.unchecked_transaction()
|
||||||
|
.ensure("failed to start transaction");
|
||||||
|
|
||||||
|
db_setup::insert_or_word_forms(&tx, parser.words_forms());
|
||||||
|
tx.commit().ensure("OpenRussian word forms commit failed");
|
||||||
|
}
|
||||||
|
|
||||||
info!("finished OpenRussian import");
|
info!("finished OpenRussian import");
|
||||||
}
|
}
|
||||||
|
|
|
@ -33,6 +33,17 @@ pub struct Word {
|
||||||
pub created_at: String, // TODO: unknown
|
pub created_at: String, // TODO: unknown
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A word form from the `words_forms` table.
///
/// Each value is one deserialized record; the importer stores it as a row of
/// the `or_words_forms` database table, column for column.
#[derive(Debug, Deserialize)]
pub struct WordForm {
    /// Identifier of this form; stored as the primary key of the row.
    pub id: usize,
    /// Identifier of the word this form belongs to.
    pub word_id: usize,
    /// Kind of form. NOTE(review): presumably a grammatical tag such as a
    /// case or tense label; confirm against the upstream data.
    pub form_type: String,
    /// NOTE(review): meaning is not evident from this file; confirm against
    /// the upstream data.
    pub position: String,
    /// The form itself.
    pub form: String,
    /// NOTE(review): presumably `form` without stress marks or other
    /// annotations; confirm against the upstream data.
    pub form_bare: String,
}
|
||||||
|
|
||||||
pub struct OpenRussianParser {
|
pub struct OpenRussianParser {
|
||||||
or_directory: PathBuf,
|
or_directory: PathBuf,
|
||||||
}
|
}
|
||||||
|
@ -50,6 +61,10 @@ impl OpenRussianParser {
|
||||||
self.parser_for("words.csv")
|
self.parser_for("words.csv")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns an iterator over the [`WordForm`] records of `words_forms.csv`.
pub fn words_forms(&self) -> DynIter<WordForm> {
    let file_name = "words_forms.csv";
    self.parser_for::<WordForm>(file_name)
}
|
||||||
|
|
||||||
fn parser_for<T: serde::de::DeserializeOwned + 'static>(
|
fn parser_for<T: serde::de::DeserializeOwned + 'static>(
|
||||||
&self,
|
&self,
|
||||||
file_name: &str,
|
file_name: &str,
|
||||||
|
|
Loading…
Reference in a new issue