feat(corp/data-import): add import of OR 'translations' table
The original dataset contains translations into different languages, but only the English ones are imported here. Note that translations are for lemmata only.

Change-Id: Ifb9c32c25fda44c38ad899efca9d205c520c0fa3
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7895
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
This commit is contained in:
parent
2b308c64b9
commit
ed8dd4acd7
3 changed files with 70 additions and 0 deletions
|
@ -191,6 +191,16 @@ CREATE TABLE or_words_forms (
|
||||||
form_bare TEXT,
|
form_bare TEXT,
|
||||||
FOREIGN KEY(word_id) REFERENCES words(id)
|
FOREIGN KEY(word_id) REFERENCES words(id)
|
||||||
) STRICT;
|
) STRICT;
|
||||||
|
|
||||||
|
CREATE TABLE or_translations (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
word_id INTEGER NOT NULL,
|
||||||
|
translation TEXT,
|
||||||
|
example_ru TEXT,
|
||||||
|
example_tl TEXT,
|
||||||
|
info TEXT,
|
||||||
|
FOREIGN KEY(word_id) REFERENCES words(id)
|
||||||
|
) STRICT;
|
||||||
"#,
|
"#,
|
||||||
)
|
)
|
||||||
.ensure("setting up OpenRussian table schema failed");
|
.ensure("setting up OpenRussian table schema failed");
|
||||||
|
@ -252,3 +262,37 @@ VALUES (?1, ?2, ?3, ?4, ?5, ?6)
|
||||||
|
|
||||||
info!("inserted {} OpenRussian word forms", count);
|
info!("inserted {} OpenRussian word forms", count);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Inserts OpenRussian translations into the `or_translations` table.
///
/// Iterates over `translations`, skips every record whose `lang` is not
/// `"en"`, and inserts the remaining ones through a cached prepared
/// statement on `conn`. The number of inserted rows is logged at the end.
///
/// NOTE(review): failures are routed through `.ensure(..)`, a helper defined
/// elsewhere; presumably it aborts with the given message — confirm.
pub fn insert_or_translations<I: Iterator<Item = or_parser::Translation>>(
|
||||||
|
conn: &Connection,
|
||||||
|
translations: I,
|
||||||
|
) {
|
||||||
|
// Prepare the insert once; the same statement is reused for every row below.
let mut stmt = conn
|
||||||
|
.prepare_cached(
|
||||||
|
"INSERT INTO or_translations (id, word_id, translation, example_ru, example_tl, info)
|
||||||
|
VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
|
||||||
|
)
|
||||||
|
.ensure("failed to prepare OR translation statement");
|
||||||
|
|
||||||
|
// Counts only the rows that were actually inserted (skipped rows are excluded).
let mut count = 0;
|
||||||
|
|
||||||
|
for tl in translations {
|
||||||
|
// Only English translations are imported; every other language is skipped.
if tl.lang != "en" {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parameter order must match the column list of the INSERT statement above;
// note that `tl.tl` lands in the `translation` column.
stmt.execute((
|
||||||
|
tl.id,
|
||||||
|
tl.word_id,
|
||||||
|
tl.tl,
|
||||||
|
tl.example_ru,
|
||||||
|
tl.example_tl,
|
||||||
|
tl.info,
|
||||||
|
))
|
||||||
|
.ensure("failed to insert OR translation");
|
||||||
|
|
||||||
|
count += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("inserted {} OpenRussian translations", count);
|
||||||
|
}
|
||||||
|
|
|
@ -228,6 +228,15 @@ fn open_russian(conn: &Connection, args: &Args) {
|
||||||
tx.commit().ensure("OpenRussian word forms commit failed");
|
tx.commit().ensure("OpenRussian word forms commit failed");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
let tx = conn
|
||||||
|
.unchecked_transaction()
|
||||||
|
.ensure("failed to start transaction");
|
||||||
|
|
||||||
|
db_setup::insert_or_translations(&tx, parser.translations());
|
||||||
|
tx.commit().ensure("OpenRussian translations commit failed");
|
||||||
|
}
|
||||||
|
|
||||||
info!("finished OpenRussian import");
|
info!("finished OpenRussian import");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -44,6 +44,19 @@ pub struct WordForm {
|
||||||
pub form_bare: String,
|
pub form_bare: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// A translation from the `translations` table.
///
/// Deserialized via serde from the records of `translations.csv`. The
/// importer copies `id`, `word_id`, `tl`, `example_ru`, `example_tl` and
/// `info` into the `or_translations` table; `lang` is only used for
/// filtering and `position` is not imported.
|
||||||
|
#[derive(Debug, Deserialize)]
|
||||||
|
pub struct Translation {
|
||||||
|
/// Row identifier; written to the `id` column on import.
pub id: usize,
|
||||||
|
/// Language tag of this translation; the importer keeps only rows where it is `"en"`.
pub lang: String,
|
||||||
|
/// Identifier of the word this translation belongs to (`word_id` column,
/// declared as a foreign key to `words(id)`).
pub word_id: usize,
|
||||||
|
/// NOTE(review): meaning not evident from this code and the field is not
/// imported — confirm against the dataset.
pub position: String,
|
||||||
|
pub tl: String, // the translation text as imported: this value is written to the `translation` column
|
||||||
|
/// Written to the `example_ru` column; presumably a Russian example — confirm.
pub example_ru: String,
|
||||||
|
/// Written to the `example_tl` column; presumably the translated example — confirm.
pub example_tl: String,
|
||||||
|
/// Written to the `info` column.
pub info: String,
|
||||||
|
}
|
||||||
|
|
||||||
pub struct OpenRussianParser {
|
pub struct OpenRussianParser {
|
||||||
or_directory: PathBuf,
|
or_directory: PathBuf,
|
||||||
}
|
}
|
||||||
|
@ -65,6 +78,10 @@ impl OpenRussianParser {
|
||||||
self.parser_for("words_forms.csv")
|
self.parser_for("words_forms.csv")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns an iterator of [`Translation`] records produced by
/// `parser_for("translations.csv")`.
///
/// NOTE(review): the file is presumably resolved relative to `or_directory`;
/// `parser_for` is not fully visible here — confirm.
pub fn translations(&self) -> DynIter<Translation> {
|
||||||
|
self.parser_for("translations.csv")
|
||||||
|
}
|
||||||
|
|
||||||
fn parser_for<T: serde::de::DeserializeOwned + 'static>(
|
fn parser_for<T: serde::de::DeserializeOwned + 'static>(
|
||||||
&self,
|
&self,
|
||||||
file_name: &str,
|
file_name: &str,
|
||||||
|
|
Loading…
Reference in a new issue