chore(corp/data-import): namespace tables for OpenCorpora data

I'm changing strategy: both the OpenCorpora (OC) data and another dataset
will be imported before I continue normalising the data, as the
normalisation might be easier to do in a set of table-constructing
queries inside of SQLite with all raw data in place.

Change-Id: I26b41af80586fc1bfd8e26a6be20579068a82507
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7872
Autosubmit: tazjin <tazjin@tvl.su>
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
This commit is contained in:
Vincent Ambo 2023-01-19 00:20:21 +03:00 committed by clbot
parent 9822fa387a
commit db26825eec
2 changed files with 22 additions and 22 deletions

View file

@ -12,11 +12,11 @@ use log::{debug, info};
use rusqlite::Connection; use rusqlite::Connection;
/// Sets up an initial schema which matches the OpenCorpora data. /// Sets up an initial schema which matches the OpenCorpora data.
pub fn initial_schema(conn: &Connection) { pub fn initial_oc_schema(conn: &Connection) {
conn.execute_batch( conn.execute_batch(
r#" r#"
-- table for plain import of grammemes from XML -- table for plain import of grammemes from XML
CREATE TABLE grammemes ( CREATE TABLE oc_grammemes (
name TEXT PRIMARY KEY, name TEXT PRIMARY KEY,
parent TEXT, parent TEXT,
alias TEXT, alias TEXT,
@ -24,47 +24,47 @@ CREATE TABLE grammemes (
) STRICT; ) STRICT;
-- table for plain import of lemmas (*not* their variations!) -- table for plain import of lemmas (*not* their variations!)
CREATE TABLE lemmas ( CREATE TABLE oc_lemmas (
id INTEGER PRIMARY KEY, id INTEGER PRIMARY KEY,
lemma TEXT NOT NULL lemma TEXT NOT NULL
) STRICT; ) STRICT;
-- table for relationship between grammemes and lemmas -- table for relationship between grammemes and lemmas
CREATE TABLE lemma_grammemes ( CREATE TABLE oc_lemma_grammemes (
lemma INTEGER, lemma INTEGER,
grammeme TEXT NOT NULL, grammeme TEXT NOT NULL,
FOREIGN KEY(lemma) REFERENCES lemmas(id) FOREIGN KEY(lemma) REFERENCES oc_lemmas(id)
) STRICT; ) STRICT;
-- table for all words, i.e. including variations of lemmata -- table for all words, i.e. including variations of lemmata
CREATE TABLE words ( CREATE TABLE oc_words (
lemma INTEGER NOT NULL, lemma INTEGER NOT NULL,
word TEXT NOT NULL, word TEXT NOT NULL,
FOREIGN KEY(lemma) REFERENCES lemmas(id) FOREIGN KEY(lemma) REFERENCES oc_lemmas(id)
) STRICT; ) STRICT;
-- table for relationship between words and grammemes -- table for relationship between words and grammemes
CREATE TABLE word_grammemes ( CREATE TABLE oc_word_grammemes (
word INTEGER NOT NULL, word INTEGER NOT NULL,
grammeme TEXT NOT NULL, grammeme TEXT NOT NULL,
FOREIGN KEY(word) REFERENCES words(ROWID) FOREIGN KEY(word) REFERENCES oc_words(ROWID)
) STRICT; ) STRICT;
-- table for link types -- table for link types
CREATE TABLE link_types ( CREATE TABLE oc_link_types (
id INTEGER PRIMARY KEY, id INTEGER PRIMARY KEY,
name TEXT name TEXT
) STRICT; ) STRICT;
-- table for links between lemmata -- table for links between lemmata
CREATE TABLE links ( CREATE TABLE oc_links (
id INTEGER PRIMARY KEY, id INTEGER PRIMARY KEY,
link_type INTEGER NOT NULL, link_type INTEGER NOT NULL,
from_lemma INTEGER NOT NULL, from_lemma INTEGER NOT NULL,
to_lemma INTEGER NOT NULL, to_lemma INTEGER NOT NULL,
FOREIGN KEY(link_type) REFERENCES link_types(id), FOREIGN KEY(link_type) REFERENCES oc_link_types(id),
FOREIGN KEY(from_lemma) REFERENCES lemmas(id), FOREIGN KEY(from_lemma) REFERENCES oc_lemmas(id),
FOREIGN KEY(to_lemma) REFERENCES lemmas(id) FOREIGN KEY(to_lemma) REFERENCES oc_lemmas(id)
) STRICT; ) STRICT;
"#, "#,
@ -79,7 +79,7 @@ pub fn insert_oc_element(conn: &Connection, elem: OcElement) {
match elem { match elem {
OcElement::Grammeme(grammeme) => { OcElement::Grammeme(grammeme) => {
conn.execute( conn.execute(
"INSERT INTO grammemes (name, parent, alias, description) VALUES (?1, ?2, ?3, ?4)", "INSERT INTO oc_grammemes (name, parent, alias, description) VALUES (?1, ?2, ?3, ?4)",
( (
&grammeme.name, &grammeme.name,
&grammeme.parent, &grammeme.parent,
@ -96,7 +96,7 @@ pub fn insert_oc_element(conn: &Connection, elem: OcElement) {
OcElement::LinkType(lt) => { OcElement::LinkType(lt) => {
conn.execute( conn.execute(
"INSERT INTO link_types (id, name) VALUES (?1, ?2)", "INSERT INTO oc_link_types (id, name) VALUES (?1, ?2)",
(&lt.id, &lt.name), (&lt.id, &lt.name),
) )
.ensure("failed to insert link type"); .ensure("failed to insert link type");
@ -107,7 +107,7 @@ pub fn insert_oc_element(conn: &Connection, elem: OcElement) {
OcElement::Link(link) => { OcElement::Link(link) => {
let mut stmt = conn let mut stmt = conn
.prepare_cached( .prepare_cached(
"INSERT INTO links (id, link_type, from_lemma, to_lemma) VALUES (?1, ?2, ?3, ?4)", "INSERT INTO oc_links (id, link_type, from_lemma, to_lemma) VALUES (?1, ?2, ?3, ?4)",
) )
.ensure("failed to prepare link statement"); .ensure("failed to prepare link statement");
@ -124,7 +124,7 @@ pub fn insert_oc_element(conn: &Connection, elem: OcElement) {
fn insert_lemma(conn: &Connection, lemma: Lemma) { fn insert_lemma(conn: &Connection, lemma: Lemma) {
// insert the lemma itself // insert the lemma itself
let mut stmt = conn let mut stmt = conn
.prepare_cached("INSERT INTO lemmas (id, lemma) VALUES (?1, ?2)") .prepare_cached("INSERT INTO oc_lemmas (id, lemma) VALUES (?1, ?2)")
.ensure("failed to prepare statement"); .ensure("failed to prepare statement");
stmt.execute((&lemma.id, &lemma.lemma.word)) stmt.execute((&lemma.id, &lemma.lemma.word))
@ -132,7 +132,7 @@ fn insert_lemma(conn: &Connection, lemma: Lemma) {
// followed by its relations to the grammemes set // followed by its relations to the grammemes set
let mut stmt = conn let mut stmt = conn
.prepare_cached("INSERT INTO lemma_grammemes (lemma, grammeme) VALUES (?1, ?2)") .prepare_cached("INSERT INTO oc_lemma_grammemes (lemma, grammeme) VALUES (?1, ?2)")
.ensure("failed to prepare statement"); .ensure("failed to prepare statement");
for grammeme in lemma.grammemes { for grammeme in lemma.grammemes {
@ -142,11 +142,11 @@ fn insert_lemma(conn: &Connection, lemma: Lemma) {
// followed by all of its variations ... // followed by all of its variations ...
let mut word_insert = conn let mut word_insert = conn
.prepare_cached("INSERT INTO words (lemma, word) VALUES (?1, ?2)") .prepare_cached("INSERT INTO oc_words (lemma, word) VALUES (?1, ?2)")
.unwrap(); .unwrap();
let mut word_grammeme = conn let mut word_grammeme = conn
.prepare_cached("INSERT INTO word_grammemes (word, grammeme) VALUES (?1, ?2)") .prepare_cached("INSERT INTO oc_word_grammemes (word, grammeme) VALUES (?1, ?2)")
.unwrap(); .unwrap();
for variation in lemma.variations { for variation in lemma.variations {

View file

@ -89,7 +89,7 @@ fn main() {
let conn = Connection::open(output_path).ensure("failed to open DB connection"); let conn = Connection::open(output_path).ensure("failed to open DB connection");
db_setup::initial_schema(&conn); db_setup::initial_oc_schema(&conn);
// afterwards: // afterwards:
// add actual IDs to grammemes // add actual IDs to grammemes