feat(corp/data-import): add import of OR 'translations' table

The original dataset contains translations into different languages,
but only the English ones are imported here.

Note that translations are for lemmata only.

Change-Id: Ifb9c32c25fda44c38ad899efca9d205c520c0fa3
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7895
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
This commit is contained in:
Vincent Ambo 2023-01-21 21:17:58 +03:00 committed by tazjin
parent 2b308c64b9
commit ed8dd4acd7
3 changed files with 70 additions and 0 deletions

View file

@ -191,6 +191,16 @@ CREATE TABLE or_words_forms (
form_bare TEXT,
FOREIGN KEY(word_id) REFERENCES words(id)
) STRICT;
CREATE TABLE or_translations (
id INTEGER PRIMARY KEY,
word_id INTEGER NOT NULL,
translation TEXT,
example_ru TEXT,
example_tl TEXT,
info TEXT,
FOREIGN KEY(word_id) REFERENCES words(id)
) STRICT;
"#,
)
.ensure("setting up OpenRussian table schema failed");
@ -252,3 +262,37 @@ VALUES (?1, ?2, ?3, ?4, ?5, ?6)
info!("inserted {} OpenRussian word forms", count);
}
/// Writes OpenRussian translations into the `or_translations` table.
///
/// Only entries whose `lang` is `"en"` are stored; every other entry is
/// skipped. Each stored entry becomes one row, and the number of inserted
/// rows is logged at the end.
///
/// Failures to prepare or execute the statement are passed to `.ensure`
/// together with a descriptive message.
pub fn insert_or_translations<I: Iterator<Item = or_parser::Translation>>(
    conn: &Connection,
    translations: I,
) {
    let mut insert = conn
        .prepare_cached(
            "INSERT INTO or_translations (id, word_id, translation, example_ru, example_tl, info)
VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
        )
        .ensure("failed to prepare OR translation statement");

    let mut inserted = 0;

    // Non-English entries are dropped before anything touches the database.
    for entry in translations.filter(|entry| entry.lang == "en") {
        // Take the stored columns out of the entry; `lang` and `position`
        // are not written to the table.
        let or_parser::Translation {
            id,
            word_id,
            tl,
            example_ru,
            example_tl,
            info,
            ..
        } = entry;

        insert
            .execute((id, word_id, tl, example_ru, example_tl, info))
            .ensure("failed to insert OR translation");

        inserted += 1;
    }

    info!("inserted {} OpenRussian translations", inserted);
}

View file

@ -228,6 +228,15 @@ fn open_russian(conn: &Connection, args: &Args) {
tx.commit().ensure("OpenRussian word forms commit failed");
}
{
let tx = conn
.unchecked_transaction()
.ensure("failed to start transaction");
db_setup::insert_or_translations(&tx, parser.translations());
tx.commit().ensure("OpenRussian translations commit failed");
}
info!("finished OpenRussian import");
}

View file

@ -44,6 +44,19 @@ pub struct WordForm {
pub form_bare: String,
}
/// A translation from the `translations` table.
///
/// One value of this type corresponds to one deserialized record; the
/// parser yields these from `translations.csv`. Field names presumably
/// mirror the column names of that file -- confirm against the dataset
/// before renaming anything.
#[derive(Debug, Deserialize)]
pub struct Translation {
/// Identifier of this translation record.
pub id: usize,
/// Language of the translation; the importer compares this against
/// `"en"` (presumably a language code -- confirm against the dataset).
pub lang: String,
/// Identifier of the word this translation belongs to.
pub word_id: usize,
/// NOTE(review): meaning not evident from the code (possibly an ordering
/// among a word's translations) -- confirm against the dataset.
pub position: String,
/// Presumably the translated text itself (the importer stores it in the
/// `translation` column); the original annotation marks it as unknown.
pub tl: String, // unknown
/// Presumably an example sentence in Russian -- TODO confirm.
pub example_ru: String,
/// Presumably the example sentence rendered in the target language --
/// TODO confirm.
pub example_tl: String,
/// Additional free-form information attached to the translation
/// (exact contents not evident from the code).
pub info: String,
}
pub struct OpenRussianParser {
or_directory: PathBuf,
}
@ -65,6 +78,10 @@ impl OpenRussianParser {
self.parser_for("words_forms.csv")
}
/// Returns an iterator over the [`Translation`] records parsed from
/// `translations.csv`.
pub fn translations(&self) -> DynIter<Translation> {
    self.parser_for::<Translation>("translations.csv")
}
fn parser_for<T: serde::de::DeserializeOwned + 'static>(
&self,
file_name: &str,