feat(corp/data-import): map OC word grammemes to OR form types

This table maps the grammemes for individual word forms (*not* for
lemmata in either corpus!) to the corresponding grammemes from the
other dataset.

These have drastically different shapes, so the mapping is not
perfect, but will help in determining which forms are intended to be
the same on both sides.

Change-Id: Ib0717e2f7a79d96bcb5e955a20f551e391fcd759
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7918
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
Autosubmit: tazjin <tazjin@tvl.su>
This commit is contained in:
Vincent Ambo 2023-01-24 01:54:29 +03:00 committed by clbot
parent ebd09edfe9
commit 8d594658ab
2 changed files with 85 additions and 0 deletions

View file

@ -111,6 +111,7 @@ use std::fs::File;
use std::io::BufReader; use std::io::BufReader;
mod db_setup; mod db_setup;
mod mappings;
mod oc_parser; mod oc_parser;
mod or_parser; mod or_parser;

View file

@ -0,0 +1,84 @@
//! Manual mapping of some data structures in OC/OR corpora.
/// This maps the *names* of OpenRussian grammemes (the set of
/// `form_type` fields in the `word_forms` table) to the *names*
/// of OpenCorpora grammemes.
///
/// The names of the OR grammemes are much easier to understand in
/// general, as the OC ones seem to have strange acronyms in them,
/// however the OC ones are much more structured.
///
/// As these forms map to the word_forms table they lack the forms
/// attached to the lemmata.
pub const FORM_TYPES_GRAMMEMES: &'static [(&'static str, &'static [&'static str])] = &[
("ru_adj_comparative", &["Cmp2"]),
("ru_adj_f_acc", &["femn", "sing", "accs"]),
("ru_adj_f_dat", &["femn", "sing", "datv"]),
("ru_adj_f_gen", &["femn", "sing", "gent"]),
("ru_adj_f_inst", &["femn", "sing", "ablt"]),
("ru_adj_f_nom", &["femn", "sing", "nomn"]),
("ru_adj_f_prep", &["femn", "sing", "loct"]),
("ru_adj_m_acc", &["masc", "sing", "accs"]),
("ru_adj_m_dat", &["masc", "sing", "datv"]),
("ru_adj_m_gen", &["masc", "sing", "gent"]),
("ru_adj_m_inst", &["masc", "sing", "ablt"]),
("ru_adj_m_nom", &["masc", "sing", "nomn"]),
("ru_adj_m_prep", &["masc", "sing", "loct"]),
("ru_adj_n_acc", &["neut", "sing", "accs"]),
("ru_adj_n_dat", &["neut", "sing", "datv"]),
("ru_adj_n_gen", &["neut", "sing", "gent"]),
("ru_adj_n_inst", &["neut", "sing", "ablt"]),
("ru_adj_n_nom", &["neut", "sing", "nomn"]),
("ru_adj_n_prep", &["neut", "sing", "loct"]),
("ru_adj_pl_acc", &["plur", "accs"]),
("ru_adj_pl_dat", &["plur", "datv"]),
("ru_adj_pl_gen", &["plur", "gent"]),
("ru_adj_pl_inst", &["plur", "ablt"]),
("ru_adj_pl_nom", &["plur", "nomn"]),
("ru_adj_pl_prep", &["plur", "loct"]),
("ru_adj_short_f", &["femn", "sing"]),
("ru_adj_short_m", &["masc", "sing"]),
("ru_adj_short_n", &["neut", "sing"]),
("ru_adj_short_pl", &["plur"]),
("ru_noun_pl_acc", &["plur", "accs"]),
("ru_noun_pl_dat", &["plur", "datv"]),
("ru_noun_pl_gen", &["plur", "gent"]),
("ru_noun_pl_inst", &["plur", "ablt"]),
("ru_noun_pl_nom", &["plur", "nomn"]),
("ru_noun_pl_prep", &["plur", "loct"]),
("ru_noun_sg_acc", &["sing", "accs"]),
("ru_noun_sg_dat", &["sing", "datv"]),
("ru_noun_sg_gen", &["sing", "gent"]),
("ru_noun_sg_inst", &["sing", "ablt"]),
("ru_noun_sg_nom", &["sing", "nomn"]),
("ru_noun_sg_prep", &["sing", "loct"]),
("ru_verb_gerund_past", &["past", "V-sh"]),
("ru_verb_imperative_pl", &["plur", "impr"]),
("ru_verb_imperative_sg", &["sing", "impr"]),
("ru_verb_past_f", &["femn", "sing", "past"]),
("ru_verb_past_m", &["masc", "sing", "past"]),
("ru_verb_past_n", &["neut", "sing", "past"]),
("ru_verb_past_pl", &["plur", "past"]),
("ru_verb_presfut_pl1", &["plur", "1per", "pres"]),
("ru_verb_presfut_pl2", &["plur", "2per", "pres"]),
("ru_verb_presfut_pl3", &["plur", "3per", "pres"]),
("ru_verb_presfut_sg1", &["sing", "1per", "pres"]),
("ru_verb_presfut_sg2", &["sing", "2per", "pres"]),
("ru_verb_presfut_sg3", &["sing", "3per", "pres"]),
// Unclear items, probably only useful tags on lemmata
(
"ru_verb_gerund_present",
&["pres" /* prob. something missing? */],
),
(
"ru_adj_superlative",
&[/* TODO: unclear, random list of grammemes?! */],
),
("ru_base", &[/* TODO: unclear */]),
// These have no useful tags in the forms table, only gender &
// case tagging.
("ru_verb_participle_active_past", &[]),
("ru_verb_participle_active_present", &[]),
("ru_verb_participle_passive_past", &[]),
("ru_verb_participle_passive_present", &[]),
];