feat(corp/data-import): add import of OR 'translations' table
The original dataset contains translations into different languages, but only the English ones are imported here. Note that translations are for lemmata only. Change-Id: Ifb9c32c25fda44c38ad899efca9d205c520c0fa3 Reviewed-on: https://cl.tvl.fyi/c/depot/+/7895 Reviewed-by: tazjin <tazjin@tvl.su> Tested-by: BuildkiteCI
This commit is contained in:
parent
2b308c64b9
commit
ed8dd4acd7
3 changed files with 70 additions and 0 deletions
|
@ -191,6 +191,16 @@ CREATE TABLE or_words_forms (
|
|||
form_bare TEXT,
|
||||
FOREIGN KEY(word_id) REFERENCES words(id)
|
||||
) STRICT;
|
||||
|
||||
CREATE TABLE or_translations (
|
||||
id INTEGER PRIMARY KEY,
|
||||
word_id INTEGER NOT NULL,
|
||||
translation TEXT,
|
||||
example_ru TEXT,
|
||||
example_tl TEXT,
|
||||
info TEXT,
|
||||
FOREIGN KEY(word_id) REFERENCES words(id)
|
||||
) STRICT;
|
||||
"#,
|
||||
)
|
||||
.ensure("setting up OpenRussian table schema failed");
|
||||
|
@ -252,3 +262,37 @@ VALUES (?1, ?2, ?3, ?4, ?5, ?6)
|
|||
|
||||
info!("inserted {} OpenRussian word forms", count);
|
||||
}
|
||||
|
||||
pub fn insert_or_translations<I: Iterator<Item = or_parser::Translation>>(
|
||||
conn: &Connection,
|
||||
translations: I,
|
||||
) {
|
||||
let mut stmt = conn
|
||||
.prepare_cached(
|
||||
"INSERT INTO or_translations (id, word_id, translation, example_ru, example_tl, info)
|
||||
VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
|
||||
)
|
||||
.ensure("failed to prepare OR translation statement");
|
||||
|
||||
let mut count = 0;
|
||||
|
||||
for tl in translations {
|
||||
if tl.lang != "en" {
|
||||
continue;
|
||||
}
|
||||
|
||||
stmt.execute((
|
||||
tl.id,
|
||||
tl.word_id,
|
||||
tl.tl,
|
||||
tl.example_ru,
|
||||
tl.example_tl,
|
||||
tl.info,
|
||||
))
|
||||
.ensure("failed to insert OR translation");
|
||||
|
||||
count += 1;
|
||||
}
|
||||
|
||||
info!("inserted {} OpenRussian translations", count);
|
||||
}
|
||||
|
|
|
@ -228,6 +228,15 @@ fn open_russian(conn: &Connection, args: &Args) {
|
|||
tx.commit().ensure("OpenRussian word forms commit failed");
|
||||
}
|
||||
|
||||
{
|
||||
let tx = conn
|
||||
.unchecked_transaction()
|
||||
.ensure("failed to start transaction");
|
||||
|
||||
db_setup::insert_or_translations(&tx, parser.translations());
|
||||
tx.commit().ensure("OpenRussian translations commit failed");
|
||||
}
|
||||
|
||||
info!("finished OpenRussian import");
|
||||
}
|
||||
|
||||
|
|
|
@ -44,6 +44,19 @@ pub struct WordForm {
|
|||
pub form_bare: String,
|
||||
}
|
||||
|
||||
/// A translation from the `translations` table.
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct Translation {
|
||||
pub id: usize,
|
||||
pub lang: String,
|
||||
pub word_id: usize,
|
||||
pub position: String,
|
||||
pub tl: String, // unknown
|
||||
pub example_ru: String,
|
||||
pub example_tl: String,
|
||||
pub info: String,
|
||||
}
|
||||
|
||||
pub struct OpenRussianParser {
|
||||
or_directory: PathBuf,
|
||||
}
|
||||
|
@ -65,6 +78,10 @@ impl OpenRussianParser {
|
|||
self.parser_for("words_forms.csv")
|
||||
}
|
||||
|
||||
/// Returns an iterator over the [`Translation`] records parsed from
/// `translations.csv`.
pub fn translations(&self) -> DynIter<Translation> {
    let file_name = "translations.csv";
    self.parser_for::<Translation>(file_name)
}
|
||||
|
||||
fn parser_for<T: serde::de::DeserializeOwned + 'static>(
|
||||
&self,
|
||||
file_name: &str,
|
||||
|
|
Loading…
Reference in a new issue