feat(corp/data-import): map OC lemma grammemes to OR form types
Change-Id: Ie804d185269336b0d9fe417754e5e795918e65b8 Reviewed-on: https://cl.tvl.fyi/c/depot/+/7923 Autosubmit: tazjin <tazjin@tvl.su> Reviewed-by: tazjin <tazjin@tvl.su> Tested-by: BuildkiteCI
This commit is contained in:
parent
8d594658ab
commit
80723b708d
1 changed files with 103 additions and 15 deletions
|
@ -1,16 +1,103 @@
|
|||
//! Manual mapping of some data structures in OC/OR corpora.
|
||||
|
||||
/// This maps the *names* of OpenRussian grammemes (the set of
|
||||
/// `form_type` fields in the `word_forms` table) to the *names*
|
||||
/// of OpenCorpora grammemes.
|
||||
/// Maps the *names* of OpenRussian grammemes (the `form_type` fields
|
||||
/// in the `or_word_forms` table) to the *set* of OpenCorpora
|
||||
/// grammemes attached to their corresponding lemma in the `oc_lemmas`
|
||||
/// table.
|
||||
///
|
||||
/// The names of the OR grammemes are much easier to understand in
|
||||
/// general, as the OC ones seem to have strange acronyms in them,
|
||||
/// however the OC ones are much more structured.
|
||||
/// This *only* includes grammatical information about the lemma of
|
||||
/// the word (such as whether it is a verb or other type), but *not*
|
||||
/// information about the specific instance of the word (such as its
|
||||
/// gender).
|
||||
///
|
||||
/// As these forms map to the word_forms table they lack the forms
|
||||
/// attached to the lemmata.
|
||||
pub const FORM_TYPES_GRAMMEMES: &'static [(&'static str, &'static [&'static str])] = &[
|
||||
/// Correctly corresponding these requires use of all mapping tables.
|
||||
/// Lemma-level lookup table: each entry pairs an OpenRussian `form_type`
/// name with the OpenCorpora grammemes expected on the matching lemma.
///
/// The entry for `ru_base` is intentionally empty, as no consistent set
/// of lemma grammemes was found for it.
///
/// Voice grammemes use the OpenCorpora spellings `actv` (active) and
/// `pssv` (passive).
pub const FORMS_LEMMATA_GRAMMEME_MAP: &[(&str, &[&str])] = &[
    // Adjectives: comparative and superlative.
    ("ru_adj_comparative", &["COMP"]),
    ("ru_adj_superlative", &["ADJF", "Supr"]),
    // Adjectives: full forms, feminine.
    ("ru_adj_f_acc", &["ADJF"]),
    ("ru_adj_f_dat", &["ADJF"]),
    ("ru_adj_f_gen", &["ADJF"]),
    ("ru_adj_f_inst", &["ADJF"]),
    ("ru_adj_f_nom", &["ADJF"]),
    ("ru_adj_f_prep", &["ADJF"]),
    // Adjectives: full forms, masculine.
    ("ru_adj_m_acc", &["ADJF"]),
    ("ru_adj_m_dat", &["ADJF"]),
    ("ru_adj_m_gen", &["ADJF"]),
    ("ru_adj_m_inst", &["ADJF"]),
    ("ru_adj_m_nom", &["ADJF"]),
    ("ru_adj_m_prep", &["ADJF"]),
    // Adjectives: full forms, neuter.
    ("ru_adj_n_acc", &["ADJF"]),
    ("ru_adj_n_dat", &["ADJF"]),
    ("ru_adj_n_gen", &["ADJF"]),
    ("ru_adj_n_inst", &["ADJF"]),
    ("ru_adj_n_nom", &["ADJF"]),
    ("ru_adj_n_prep", &["ADJF"]),
    // Adjectives: full forms, plural.
    ("ru_adj_pl_acc", &["ADJF"]),
    ("ru_adj_pl_dat", &["ADJF"]),
    ("ru_adj_pl_gen", &["ADJF"]),
    ("ru_adj_pl_inst", &["ADJF"]),
    ("ru_adj_pl_nom", &["ADJF"]),
    ("ru_adj_pl_prep", &["ADJF"]),
    // Adjectives: short forms.
    ("ru_adj_short_f", &["ADJS"]),
    ("ru_adj_short_m", &["ADJS"]),
    ("ru_adj_short_n", &["ADJS"]),
    ("ru_adj_short_pl", &["ADJS"]),
    // Nouns: plural.
    ("ru_noun_pl_acc", &["NOUN"]),
    ("ru_noun_pl_dat", &["NOUN"]),
    ("ru_noun_pl_gen", &["NOUN"]),
    ("ru_noun_pl_inst", &["NOUN"]),
    ("ru_noun_pl_nom", &["NOUN"]),
    ("ru_noun_pl_prep", &["NOUN"]),
    // Nouns: singular.
    ("ru_noun_sg_acc", &["NOUN"]),
    ("ru_noun_sg_dat", &["NOUN"]),
    ("ru_noun_sg_gen", &["NOUN"]),
    ("ru_noun_sg_inst", &["NOUN"]),
    ("ru_noun_sg_nom", &["NOUN"]),
    ("ru_noun_sg_prep", &["NOUN"]),
    // Verbs: gerunds.
    ("ru_verb_gerund_past", &["GRND"]),
    ("ru_verb_gerund_present", &["GRND"]),
    // Verbs: imperative.
    ("ru_verb_imperative_pl", &["VERB"]),
    ("ru_verb_imperative_sg", &["VERB"]),
    // Verbs: past tense.
    ("ru_verb_past_f", &["VERB"]),
    ("ru_verb_past_m", &["VERB"]),
    ("ru_verb_past_n", &["VERB"]),
    ("ru_verb_past_pl", &["VERB"]),
    // Verbs: present/future tense.
    ("ru_verb_presfut_pl1", &["VERB"]),
    ("ru_verb_presfut_pl2", &["VERB"]),
    ("ru_verb_presfut_pl3", &["VERB"]),
    ("ru_verb_presfut_sg1", &["VERB"]),
    ("ru_verb_presfut_sg2", &["VERB"]),
    ("ru_verb_presfut_sg3", &["VERB"]),
    // Base form: nothing consistent, except often 'Fixd'.
    ("ru_base", &[]),
    // Verbs: participles, which carry tense and voice on the lemma.
    ("ru_verb_participle_active_past", &["PRTF", "past", "actv"]),
    ("ru_verb_participle_active_present", &["PRTF", "pres", "actv"]),
    // OpenCorpora spells the passive voice grammeme `pssv`, not `passv`.
    ("ru_verb_participle_passive_past", &["PRTF", "past", "pssv"]),
    ("ru_verb_participle_passive_present", &["PRTF", "pres", "pssv"]),
];
|
||||
|
||||
/// Maps the *names* of OpenRussian grammemes (the `form_type` fields
|
||||
/// in the `or_word_forms` table) to the *set* of OpenCorpora
|
||||
/// grammemes attached to their corresponding words in the `oc_words`
|
||||
/// table.
|
||||
///
|
||||
/// This includes grammatical information about the "instance" of the
|
||||
/// word (such as its gender), but *not* the higher-level type
|
||||
/// information about its lemma.
|
||||
///
|
||||
/// Correctly corresponding these requires use of all mapping tables.
|
||||
pub const FORMS_WORDS_GRAMMEME_MAP: &'static [(&'static str, &'static [&'static str])] = &[
|
||||
("ru_adj_comparative", &["Cmp2"]),
|
||||
("ru_adj_f_acc", &["femn", "sing", "accs"]),
|
||||
("ru_adj_f_dat", &["femn", "sing", "datv"]),
|
||||
|
@ -59,12 +146,13 @@ pub const FORM_TYPES_GRAMMEMES: &'static [(&'static str, &'static [&'static str]
|
|||
("ru_verb_past_m", &["masc", "sing", "past"]),
|
||||
("ru_verb_past_n", &["neut", "sing", "past"]),
|
||||
("ru_verb_past_pl", &["plur", "past"]),
|
||||
("ru_verb_presfut_pl1", &["plur", "1per", "pres"]),
|
||||
("ru_verb_presfut_pl2", &["plur", "2per", "pres"]),
|
||||
("ru_verb_presfut_pl3", &["plur", "3per", "pres"]),
|
||||
("ru_verb_presfut_sg1", &["sing", "1per", "pres"]),
|
||||
("ru_verb_presfut_sg2", &["sing", "2per", "pres"]),
|
||||
("ru_verb_presfut_sg3", &["sing", "3per", "pres"]),
|
||||
// these also contain "pres" or "futr", depending on the verb.
|
||||
("ru_verb_presfut_pl1", &["plur", "1per"]),
|
||||
("ru_verb_presfut_pl2", &["plur", "2per"]),
|
||||
("ru_verb_presfut_pl3", &["plur", "3per"]),
|
||||
("ru_verb_presfut_sg1", &["sing", "1per"]),
|
||||
("ru_verb_presfut_sg2", &["sing", "2per"]),
|
||||
("ru_verb_presfut_sg3", &["sing", "3per"]),
|
||||
// Unclear items, probably only useful tags on lemmata
|
||||
(
|
||||
"ru_verb_gerund_present",
|
||||
|
|
Loading…
Reference in a new issue