feat(corp/data-import): insert OpenCorpora data into SQLite
This is an initial and kind of dumb table structure, but there's some massaging that needs to be done before this makes more sense. Change-Id: I441288b684ef86be507099bcc4ebf984598789c8 Reviewed-on: https://cl.tvl.fyi/c/depot/+/7861 Reviewed-by: tazjin <tazjin@tvl.su> Tested-by: BuildkiteCI
This commit is contained in:
parent
0196555f07
commit
6986aa5824
2 changed files with 155 additions and 9 deletions
128
corp/russian/data-import/src/db_setup.rs
Normal file
128
corp/russian/data-import/src/db_setup.rs
Normal file
|
@ -0,0 +1,128 @@
|
||||||
|
//! This module prepares the database layout.
|
||||||
|
//!
|
||||||
|
//! The XML import may be in an arbitrary order, so importing data is
|
||||||
|
//! a multi-step process where we first set up schemas matching the
|
||||||
|
//! data layout, import the data, and then modify the schema to
|
||||||
|
//! introduce things like foreign key constraints between tables that
|
||||||
|
//! represent relations.
|
||||||
|
|
||||||
|
use super::{bail, Ensure};
|
||||||
|
use crate::oc_parser::*;
|
||||||
|
use log::{debug, info};
|
||||||
|
use rusqlite::Connection;
|
||||||
|
|
||||||
|
/// Sets up an initial schema which matches the OpenCorpora data.
|
||||||
|
pub fn initial_schema(conn: &Connection) {
|
||||||
|
conn.execute_batch(
|
||||||
|
r#"
|
||||||
|
-- table for plain import of grammemes from XML
|
||||||
|
CREATE TABLE grammemes (
|
||||||
|
name TEXT PRIMARY KEY,
|
||||||
|
parent TEXT,
|
||||||
|
alias TEXT,
|
||||||
|
description TEXT
|
||||||
|
) STRICT;
|
||||||
|
|
||||||
|
-- table for plain import of lemmas (*not* their variations!)
|
||||||
|
CREATE TABLE lemmas (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
lemma TEXT NOT NULL
|
||||||
|
) STRICT;
|
||||||
|
|
||||||
|
-- table for relationship between grammemes and lemmas
|
||||||
|
CREATE TABLE lemma_grammemes (
|
||||||
|
lemma INTEGER,
|
||||||
|
grammeme TEXT NOT NULL,
|
||||||
|
FOREIGN KEY(lemma) REFERENCES lemmas(id)
|
||||||
|
) STRICT;
|
||||||
|
|
||||||
|
-- table for all words, i.e. including variations of lemmata
|
||||||
|
CREATE TABLE words (
|
||||||
|
lemma INTEGER NOT NULL,
|
||||||
|
word TEXT NOT NULL,
|
||||||
|
FOREIGN KEY(lemma) REFERENCES lemmas(id)
|
||||||
|
) STRICT;
|
||||||
|
|
||||||
|
-- table for relationship between words and grammemes
|
||||||
|
CREATE TABLE word_grammemes (
|
||||||
|
word INTEGER NOT NULL,
|
||||||
|
grammeme TEXT NOT NULL,
|
||||||
|
FOREIGN KEY(word) REFERENCES words(ROWID)
|
||||||
|
) STRICT;
|
||||||
|
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.ensure("setting up initial table schema failed");
|
||||||
|
|
||||||
|
info!("set up initial table schema for OpenCorpora import");
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Inserts a single OpenCorpora element into the initial table structure.
|
||||||
|
pub fn insert_oc_element(conn: &Connection, elem: OcElement) {
|
||||||
|
match elem {
|
||||||
|
OcElement::Grammeme(grammeme) => {
|
||||||
|
conn.execute(
|
||||||
|
"INSERT INTO grammemes (name, parent, alias, description) VALUES (?1, ?2, ?3, ?4)",
|
||||||
|
(
|
||||||
|
&grammeme.name,
|
||||||
|
&grammeme.parent,
|
||||||
|
&grammeme.alias,
|
||||||
|
&grammeme.description,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
.ensure("failed to insert grammeme");
|
||||||
|
|
||||||
|
debug!("inserted grammeme {}", grammeme.name);
|
||||||
|
}
|
||||||
|
|
||||||
|
OcElement::Lemma(lemma) => insert_lemma(conn, lemma),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Insert a single lemma into the initial structure. This is somewhat
|
||||||
|
/// involved because it also establishes a bunch of relations.
|
||||||
|
fn insert_lemma(conn: &Connection, lemma: Lemma) {
|
||||||
|
// insert the lemma itself
|
||||||
|
let mut stmt = conn
|
||||||
|
.prepare_cached("INSERT INTO lemmas (id, lemma) VALUES (?1, ?2)")
|
||||||
|
.ensure("failed to prepare statement");
|
||||||
|
|
||||||
|
stmt.execute((&lemma.id, &lemma.lemma.word))
|
||||||
|
.ensure("failed to insert grammeme");
|
||||||
|
|
||||||
|
// followed by its relations to the grammemes set
|
||||||
|
let mut stmt = conn
|
||||||
|
.prepare_cached("INSERT INTO lemma_grammemes (lemma, grammeme) VALUES (?1, ?2)")
|
||||||
|
.ensure("failed to prepare statement");
|
||||||
|
|
||||||
|
for grammeme in lemma.grammemes {
|
||||||
|
stmt.execute((&lemma.id, grammeme))
|
||||||
|
.ensure("failed to insert grammeme<>lemma relationship");
|
||||||
|
}
|
||||||
|
|
||||||
|
// followed by all of its variations ...
|
||||||
|
let mut word_insert = conn
|
||||||
|
.prepare_cached("INSERT INTO words (lemma, word) VALUES (?1, ?2)")
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let mut word_grammeme = conn
|
||||||
|
.prepare_cached("INSERT INTO word_grammemes (word, grammeme) VALUES (?1, ?2)")
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
for variation in lemma.variations {
|
||||||
|
// insert the word itself and get its rowid
|
||||||
|
word_insert
|
||||||
|
.execute((&lemma.id, &variation.word))
|
||||||
|
.ensure("failed to insert word");
|
||||||
|
let row_id = conn.last_insert_rowid();
|
||||||
|
|
||||||
|
// then insert its grammeme links
|
||||||
|
for grammeme in variation.grammemes {
|
||||||
|
word_grammeme
|
||||||
|
.execute((row_id, grammeme))
|
||||||
|
.ensure("failed to insert word<>grammeme link");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
debug!("inserted lemma {}", lemma.id);
|
||||||
|
}
|
|
@ -55,11 +55,13 @@
|
||||||
//! between the lemmas "два" and "второй".
|
//! between the lemmas "два" and "второй".
|
||||||
|
|
||||||
use log::{error, info};
|
use log::{error, info};
|
||||||
|
use rusqlite::{Connection, Result};
|
||||||
use std::env;
|
use std::env;
|
||||||
use std::fmt::Display;
|
use std::fmt::Display;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{BufReader, BufWriter, Write};
|
use std::io::BufReader;
|
||||||
|
|
||||||
|
mod db_setup;
|
||||||
mod oc_parser;
|
mod oc_parser;
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
|
@ -77,18 +79,34 @@ fn main() {
|
||||||
|
|
||||||
let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file));
|
let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file));
|
||||||
|
|
||||||
let mut out = BufWriter::new(std::io::stdout().lock());
|
let conn = Connection::open("out.db").ensure("failed to open DB connection");
|
||||||
|
|
||||||
|
db_setup::initial_schema(&conn);
|
||||||
|
|
||||||
|
// afterwards:
|
||||||
|
// add actual IDs to grammemes
|
||||||
|
// properly reference keys internally
|
||||||
|
// add foreign key constraint on lemma_grammemes.grammeme
|
||||||
|
|
||||||
|
let mut tx = conn
|
||||||
|
.unchecked_transaction()
|
||||||
|
.ensure("failed to start transaction");
|
||||||
|
let mut count = 0;
|
||||||
|
|
||||||
while let Some(elem) = parser.next_element() {
|
while let Some(elem) = parser.next_element() {
|
||||||
if let oc_parser::OcElement::Lemma(lemma) = elem {
|
// commit every 1000 things
|
||||||
if lemma.lemma.word == "тяжёлый" {
|
if count % 1000 == 0 {
|
||||||
writeln!(out, "{:?}", lemma).ensure("writing output failed");
|
tx.commit().ensure("transaction failed");
|
||||||
break;
|
tx = conn
|
||||||
}
|
.unchecked_transaction()
|
||||||
|
.ensure("failed to start new transaction");
|
||||||
|
info!("transaction committed at watermark {}", count);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
out.flush().ensure("flushing the out buffer failed");
|
db_setup::insert_oc_element(&tx, elem);
|
||||||
|
|
||||||
|
count += 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// It's like `expect`, but through `log::error`.
|
/// It's like `expect`, but through `log::error`.
|
||||||
|
|
Loading…
Add table
Reference in a new issue