chore(corp/data-import): namespace tables for OpenCorpora data
I'm changing strategy: both OC and another dataset will be imported first, before continuing to normalise the data, as the normalisation might be easier to do as a set of table-constructing queries inside of SQLite once all raw data is in place.

Change-Id: I26b41af80586fc1bfd8e26a6be20579068a82507
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7872
Autosubmit: tazjin <tazjin@tvl.su>
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
This commit is contained in:
parent
9822fa387a
commit
db26825eec
2 changed files with 22 additions and 22 deletions
|
@ -12,11 +12,11 @@ use log::{debug, info};
|
||||||
use rusqlite::Connection;
|
use rusqlite::Connection;
|
||||||
|
|
||||||
/// Sets up an initial schema which matches the OpenCorpora data.
|
/// Sets up an initial schema which matches the OpenCorpora data.
|
||||||
pub fn initial_schema(conn: &Connection) {
|
pub fn initial_oc_schema(conn: &Connection) {
|
||||||
conn.execute_batch(
|
conn.execute_batch(
|
||||||
r#"
|
r#"
|
||||||
-- table for plain import of grammemes from XML
|
-- table for plain import of grammemes from XML
|
||||||
CREATE TABLE grammemes (
|
CREATE TABLE oc_grammemes (
|
||||||
name TEXT PRIMARY KEY,
|
name TEXT PRIMARY KEY,
|
||||||
parent TEXT,
|
parent TEXT,
|
||||||
alias TEXT,
|
alias TEXT,
|
||||||
|
@ -24,47 +24,47 @@ CREATE TABLE grammemes (
|
||||||
) STRICT;
|
) STRICT;
|
||||||
|
|
||||||
-- table for plain import of lemmas (*not* their variations!)
|
-- table for plain import of lemmas (*not* their variations!)
|
||||||
CREATE TABLE lemmas (
|
CREATE TABLE oc_lemmas (
|
||||||
id INTEGER PRIMARY KEY,
|
id INTEGER PRIMARY KEY,
|
||||||
lemma TEXT NOT NULL
|
lemma TEXT NOT NULL
|
||||||
) STRICT;
|
) STRICT;
|
||||||
|
|
||||||
-- table for relationship between grammemes and lemmas
|
-- table for relationship between grammemes and lemmas
|
||||||
CREATE TABLE lemma_grammemes (
|
CREATE TABLE oc_lemma_grammemes (
|
||||||
lemma INTEGER,
|
lemma INTEGER,
|
||||||
grammeme TEXT NOT NULL,
|
grammeme TEXT NOT NULL,
|
||||||
FOREIGN KEY(lemma) REFERENCES lemmas(id)
|
FOREIGN KEY(lemma) REFERENCES oc_lemmas(id)
|
||||||
) STRICT;
|
) STRICT;
|
||||||
|
|
||||||
-- table for all words, i.e. including variations of lemmata
|
-- table for all words, i.e. including variations of lemmata
|
||||||
CREATE TABLE words (
|
CREATE TABLE oc_words (
|
||||||
lemma INTEGER NOT NULL,
|
lemma INTEGER NOT NULL,
|
||||||
word TEXT NOT NULL,
|
word TEXT NOT NULL,
|
||||||
FOREIGN KEY(lemma) REFERENCES lemmas(id)
|
FOREIGN KEY(lemma) REFERENCES oc_lemmas(id)
|
||||||
) STRICT;
|
) STRICT;
|
||||||
|
|
||||||
-- table for relationship between words and grammemes
|
-- table for relationship between words and grammemes
|
||||||
CREATE TABLE word_grammemes (
|
CREATE TABLE oc_word_grammemes (
|
||||||
word INTEGER NOT NULL,
|
word INTEGER NOT NULL,
|
||||||
grammeme TEXT NOT NULL,
|
grammeme TEXT NOT NULL,
|
||||||
FOREIGN KEY(word) REFERENCES words(ROWID)
|
FOREIGN KEY(word) REFERENCES oc_words(ROWID)
|
||||||
) STRICT;
|
) STRICT;
|
||||||
|
|
||||||
-- table for link types
|
-- table for link types
|
||||||
CREATE TABLE link_types (
|
CREATE TABLE oc_link_types (
|
||||||
id INTEGER PRIMARY KEY,
|
id INTEGER PRIMARY KEY,
|
||||||
name TEXT
|
name TEXT
|
||||||
) STRICT;
|
) STRICT;
|
||||||
|
|
||||||
-- table for links between lemmata
|
-- table for links between lemmata
|
||||||
CREATE TABLE links (
|
CREATE TABLE oc_links (
|
||||||
id INTEGER PRIMARY KEY,
|
id INTEGER PRIMARY KEY,
|
||||||
link_type INTEGER NOT NULL,
|
link_type INTEGER NOT NULL,
|
||||||
from_lemma INTEGER NOT NULL,
|
from_lemma INTEGER NOT NULL,
|
||||||
to_lemma INTEGER NOT NULL,
|
to_lemma INTEGER NOT NULL,
|
||||||
FOREIGN KEY(link_type) REFERENCES link_types(id),
|
FOREIGN KEY(link_type) REFERENCES oc_link_types(id),
|
||||||
FOREIGN KEY(from_lemma) REFERENCES lemmas(id),
|
FOREIGN KEY(from_lemma) REFERENCES oc_lemmas(id),
|
||||||
FOREIGN KEY(to_lemma) REFERENCES lemmas(id)
|
FOREIGN KEY(to_lemma) REFERENCES oc_lemmas(id)
|
||||||
) STRICT;
|
) STRICT;
|
||||||
|
|
||||||
"#,
|
"#,
|
||||||
|
@ -79,7 +79,7 @@ pub fn insert_oc_element(conn: &Connection, elem: OcElement) {
|
||||||
match elem {
|
match elem {
|
||||||
OcElement::Grammeme(grammeme) => {
|
OcElement::Grammeme(grammeme) => {
|
||||||
conn.execute(
|
conn.execute(
|
||||||
"INSERT INTO grammemes (name, parent, alias, description) VALUES (?1, ?2, ?3, ?4)",
|
"INSERT INTO oc_grammemes (name, parent, alias, description) VALUES (?1, ?2, ?3, ?4)",
|
||||||
(
|
(
|
||||||
&grammeme.name,
|
&grammeme.name,
|
||||||
&grammeme.parent,
|
&grammeme.parent,
|
||||||
|
@ -96,7 +96,7 @@ pub fn insert_oc_element(conn: &Connection, elem: OcElement) {
|
||||||
|
|
||||||
OcElement::LinkType(lt) => {
|
OcElement::LinkType(lt) => {
|
||||||
conn.execute(
|
conn.execute(
|
||||||
"INSERT INTO link_types (id, name) VALUES (?1, ?2)",
|
"INSERT INTO oc_link_types (id, name) VALUES (?1, ?2)",
|
||||||
(&lt.id, &lt.name),
|
(&lt.id, &lt.name),
|
||||||
)
|
)
|
||||||
.ensure("failed to insert link type");
|
.ensure("failed to insert link type");
|
||||||
|
@ -107,7 +107,7 @@ pub fn insert_oc_element(conn: &Connection, elem: OcElement) {
|
||||||
OcElement::Link(link) => {
|
OcElement::Link(link) => {
|
||||||
let mut stmt = conn
|
let mut stmt = conn
|
||||||
.prepare_cached(
|
.prepare_cached(
|
||||||
"INSERT INTO links (id, link_type, from_lemma, to_lemma) VALUES (?1, ?2, ?3, ?4)",
|
"INSERT INTO oc_links (id, link_type, from_lemma, to_lemma) VALUES (?1, ?2, ?3, ?4)",
|
||||||
)
|
)
|
||||||
.ensure("failed to prepare link statement");
|
.ensure("failed to prepare link statement");
|
||||||
|
|
||||||
|
@ -124,7 +124,7 @@ pub fn insert_oc_element(conn: &Connection, elem: OcElement) {
|
||||||
fn insert_lemma(conn: &Connection, lemma: Lemma) {
|
fn insert_lemma(conn: &Connection, lemma: Lemma) {
|
||||||
// insert the lemma itself
|
// insert the lemma itself
|
||||||
let mut stmt = conn
|
let mut stmt = conn
|
||||||
.prepare_cached("INSERT INTO lemmas (id, lemma) VALUES (?1, ?2)")
|
.prepare_cached("INSERT INTO oc_lemmas (id, lemma) VALUES (?1, ?2)")
|
||||||
.ensure("failed to prepare statement");
|
.ensure("failed to prepare statement");
|
||||||
|
|
||||||
stmt.execute((&lemma.id, &lemma.lemma.word))
|
stmt.execute((&lemma.id, &lemma.lemma.word))
|
||||||
|
@ -132,7 +132,7 @@ fn insert_lemma(conn: &Connection, lemma: Lemma) {
|
||||||
|
|
||||||
// followed by its relations to the grammemes set
|
// followed by its relations to the grammemes set
|
||||||
let mut stmt = conn
|
let mut stmt = conn
|
||||||
.prepare_cached("INSERT INTO lemma_grammemes (lemma, grammeme) VALUES (?1, ?2)")
|
.prepare_cached("INSERT INTO oc_lemma_grammemes (lemma, grammeme) VALUES (?1, ?2)")
|
||||||
.ensure("failed to prepare statement");
|
.ensure("failed to prepare statement");
|
||||||
|
|
||||||
for grammeme in lemma.grammemes {
|
for grammeme in lemma.grammemes {
|
||||||
|
@ -142,11 +142,11 @@ fn insert_lemma(conn: &Connection, lemma: Lemma) {
|
||||||
|
|
||||||
// followed by all of its variations ...
|
// followed by all of its variations ...
|
||||||
let mut word_insert = conn
|
let mut word_insert = conn
|
||||||
.prepare_cached("INSERT INTO words (lemma, word) VALUES (?1, ?2)")
|
.prepare_cached("INSERT INTO oc_words (lemma, word) VALUES (?1, ?2)")
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let mut word_grammeme = conn
|
let mut word_grammeme = conn
|
||||||
.prepare_cached("INSERT INTO word_grammemes (word, grammeme) VALUES (?1, ?2)")
|
.prepare_cached("INSERT INTO oc_word_grammemes (word, grammeme) VALUES (?1, ?2)")
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
for variation in lemma.variations {
|
for variation in lemma.variations {
|
||||||
|
|
|
@ -89,7 +89,7 @@ fn main() {
|
||||||
|
|
||||||
let conn = Connection::open(output_path).ensure("failed to open DB connection");
|
let conn = Connection::open(output_path).ensure("failed to open DB connection");
|
||||||
|
|
||||||
db_setup::initial_schema(&conn);
|
db_setup::initial_oc_schema(&conn);
|
||||||
|
|
||||||
// afterwards:
|
// afterwards:
|
||||||
// add actual IDs to grammemes
|
// add actual IDs to grammemes
|
||||||
|
|
Loading…
Reference in a new issue