feat(corp/data-import): add import of OpenRussian 'words' table
This is actually the lemmata table of this corpus, not the forms of all words (they're in a separate table). Change-Id: I89a2c2817ccce840f47406fa2a636f4ed3f49154 Reviewed-on: https://cl.tvl.fyi/c/depot/+/7893 Reviewed-by: tazjin <tazjin@tvl.su> Tested-by: BuildkiteCI
This commit is contained in:
parent
ee0c0ee951
commit
429c0d00c4
6 changed files with 349 additions and 31 deletions
115
corp/russian/data-import/Cargo.lock
generated
115
corp/russian/data-import/Cargo.lock
generated
|
@ -28,6 +28,18 @@ version = "1.3.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "bstr"
|
||||||
|
version = "0.2.17"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223"
|
||||||
|
dependencies = [
|
||||||
|
"lazy_static",
|
||||||
|
"memchr",
|
||||||
|
"regex-automata",
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cc"
|
name = "cc"
|
||||||
version = "1.0.78"
|
version = "1.0.78"
|
||||||
|
@ -40,13 +52,37 @@ version = "1.0.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "csv"
|
||||||
|
version = "1.1.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1"
|
||||||
|
dependencies = [
|
||||||
|
"bstr",
|
||||||
|
"csv-core",
|
||||||
|
"itoa",
|
||||||
|
"ryu",
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "csv-core"
|
||||||
|
version = "0.1.10"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "data-import"
|
name = "data-import"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"csv",
|
||||||
"env_logger",
|
"env_logger",
|
||||||
"log",
|
"log",
|
||||||
"rusqlite",
|
"rusqlite",
|
||||||
|
"serde",
|
||||||
"xml-rs",
|
"xml-rs",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -162,6 +198,18 @@ dependencies = [
|
||||||
"windows-sys",
|
"windows-sys",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "itoa"
|
||||||
|
version = "0.4.8"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "lazy_static"
|
||||||
|
version = "1.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "libc"
|
name = "libc"
|
||||||
version = "0.2.139"
|
version = "0.2.139"
|
||||||
|
@ -211,6 +259,24 @@ version = "0.3.26"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"
|
checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "proc-macro2"
|
||||||
|
version = "1.0.50"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2"
|
||||||
|
dependencies = [
|
||||||
|
"unicode-ident",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "quote"
|
||||||
|
version = "1.0.23"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "regex"
|
name = "regex"
|
||||||
version = "1.7.1"
|
version = "1.7.1"
|
||||||
|
@ -222,6 +288,12 @@ dependencies = [
|
||||||
"regex-syntax",
|
"regex-syntax",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-automata"
|
||||||
|
version = "0.1.10"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "regex-syntax"
|
name = "regex-syntax"
|
||||||
version = "0.6.28"
|
version = "0.6.28"
|
||||||
|
@ -256,12 +328,49 @@ dependencies = [
|
||||||
"windows-sys",
|
"windows-sys",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ryu"
|
||||||
|
version = "1.0.12"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7b4b9743ed687d4b4bcedf9ff5eaa7398495ae14e61cba0a295704edbc7decde"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde"
|
||||||
|
version = "1.0.152"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb"
|
||||||
|
dependencies = [
|
||||||
|
"serde_derive",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde_derive"
|
||||||
|
version = "1.0.152"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "af487d118eecd09402d70a5d72551860e788df87b464af30e5ea6a38c75c541e"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "smallvec"
|
name = "smallvec"
|
||||||
version = "1.10.0"
|
version = "1.10.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
|
checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "syn"
|
||||||
|
version = "1.0.107"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"unicode-ident",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "termcolor"
|
name = "termcolor"
|
||||||
version = "1.2.0"
|
version = "1.2.0"
|
||||||
|
@ -271,6 +380,12 @@ dependencies = [
|
||||||
"winapi-util",
|
"winapi-util",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicode-ident"
|
||||||
|
version = "1.0.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "vcpkg"
|
name = "vcpkg"
|
||||||
version = "0.2.15"
|
version = "0.2.15"
|
||||||
|
|
|
@ -6,9 +6,11 @@ edition = "2021"
|
||||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
csv = "1.1"
|
||||||
env_logger = "0.10.0"
|
env_logger = "0.10.0"
|
||||||
log = "0.4.17"
|
log = "0.4.17"
|
||||||
rusqlite = "0.28"
|
rusqlite = "0.28"
|
||||||
|
serde = { version = "1.0", features = ["derive"] }
|
||||||
xml-rs = "0.8"
|
xml-rs = "0.8"
|
||||||
|
|
||||||
[profile.release-with-debug]
|
[profile.release-with-debug]
|
||||||
|
|
|
@ -19,6 +19,9 @@ let
|
||||||
${pkgs.bzip2}/bin/bunzip2 -k -c ${openCorporaArchive} > $out
|
${pkgs.bzip2}/bin/bunzip2 -k -c ${openCorporaArchive} > $out
|
||||||
'';
|
'';
|
||||||
|
|
||||||
|
# mirrored input data from OpenRussian, as of 2023-01-17.
|
||||||
|
#
|
||||||
|
# This data is licensed under CC-BY-SA.
|
||||||
openRussianArchive = pkgs.fetchzip {
|
openRussianArchive = pkgs.fetchzip {
|
||||||
name = "openrussian-20230117";
|
name = "openrussian-20230117";
|
||||||
url = "https://tazj.in/blobs/openrussian-20230117.tar.xz";
|
url = "https://tazj.in/blobs/openrussian-20230117.tar.xz";
|
||||||
|
@ -43,8 +46,10 @@ lib.fix (self: depot.third_party.naersk.buildPackage {
|
||||||
inherit shell openCorpora;
|
inherit shell openCorpora;
|
||||||
|
|
||||||
# target that actually builds an entire database
|
# target that actually builds an entire database
|
||||||
database = pkgs.runCommand "tvl-russian-db.sqlite" { } ''
|
database = pkgs.runCommand "tvl-russian-db.sqlite"
|
||||||
${self}/bin/data-import ${openCorpora} $out
|
{
|
||||||
'';
|
OPENCORPORA_DATA = openCorpora;
|
||||||
|
OPENRUSSIAN_DATA = openRussianArchive;
|
||||||
|
} "${self}/bin/data-import --output $out";
|
||||||
};
|
};
|
||||||
})
|
})
|
||||||
|
|
|
@ -8,6 +8,7 @@
|
||||||
|
|
||||||
use super::{bail, Ensure};
|
use super::{bail, Ensure};
|
||||||
use crate::oc_parser::*;
|
use crate::oc_parser::*;
|
||||||
|
use crate::or_parser;
|
||||||
use log::{debug, info};
|
use log::{debug, info};
|
||||||
use rusqlite::Connection;
|
use rusqlite::Connection;
|
||||||
|
|
||||||
|
@ -69,7 +70,7 @@ CREATE TABLE oc_links (
|
||||||
|
|
||||||
"#,
|
"#,
|
||||||
)
|
)
|
||||||
.ensure("setting up initial table schema failed");
|
.ensure("setting up OpenCorpora table schema failed");
|
||||||
|
|
||||||
info!("set up initial table schema for OpenCorpora import");
|
info!("set up initial table schema for OpenCorpora import");
|
||||||
}
|
}
|
||||||
|
@ -166,3 +167,51 @@ fn insert_lemma(conn: &Connection, lemma: Lemma) {
|
||||||
|
|
||||||
debug!("inserted lemma {}", lemma.id);
|
debug!("inserted lemma {}", lemma.id);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Sets up an initial schema for the OpenRussian data.
|
||||||
|
pub fn initial_or_schema(conn: &Connection) {
|
||||||
|
conn.execute_batch(
|
||||||
|
r#"
|
||||||
|
CREATE TABLE or_words (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
bare TEXT NOT NULL,
|
||||||
|
accented TEXT,
|
||||||
|
derived_from_word_id INTEGER,
|
||||||
|
rank TEXT,
|
||||||
|
word_type TEXT,
|
||||||
|
level TEXT
|
||||||
|
) STRICT;
|
||||||
|
"#,
|
||||||
|
)
|
||||||
|
.ensure("setting up OpenRussian table schema failed");
|
||||||
|
|
||||||
|
info!("set up initial table schema for OpenRussian import");
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn insert_or_words<I: Iterator<Item = or_parser::Word>>(conn: &Connection, words: I) {
|
||||||
|
let mut stmt = conn
|
||||||
|
.prepare_cached(
|
||||||
|
"
|
||||||
|
INSERT INTO or_words (id, bare, accented, derived_from_word_id, rank, word_type, level)
|
||||||
|
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)
|
||||||
|
",
|
||||||
|
)
|
||||||
|
.ensure("failed to prepare OR words statement");
|
||||||
|
let mut count = 0;
|
||||||
|
|
||||||
|
for word in words {
|
||||||
|
stmt.execute((
|
||||||
|
word.id,
|
||||||
|
word.bare,
|
||||||
|
word.accented,
|
||||||
|
word.derived_from_word_id,
|
||||||
|
word.rank,
|
||||||
|
word.word_type,
|
||||||
|
word.level,
|
||||||
|
))
|
||||||
|
.ensure("failed to insert OR word");
|
||||||
|
count += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
info!("inserted {} OpenRussian words", count);
|
||||||
|
}
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
//! This program imports Russian language data from OpenCorpora and
|
//! This program imports Russian language data from OpenCorpora
|
||||||
//! OpenRussian ("Открытый корпус") into a SQLite database that can be
|
//! ("Открытый корпус") and OpenRussian into a SQLite database that
|
||||||
//! used for [//corp/russian][corp-russian] projects.
|
//! can be used for [//corp/russian][corp-russian] projects.
|
||||||
//!
|
//!
|
||||||
//! [corp-russian]: https://at.tvl.fyi/?q=%2F%2Fcorp%2Frussian
|
//! [corp-russian]: https://at.tvl.fyi/?q=%2F%2Fcorp%2Frussian
|
||||||
//!
|
//!
|
||||||
|
@ -112,42 +112,77 @@ use std::io::BufReader;
|
||||||
|
|
||||||
mod db_setup;
|
mod db_setup;
|
||||||
mod oc_parser;
|
mod oc_parser;
|
||||||
|
mod or_parser;
|
||||||
|
|
||||||
fn main() {
|
/// Paths the importer works with.
///
/// An empty string means "not set yet"; see [`Args::populated`].
struct Args {
    /// Path of the SQLite database that is opened for writing.
    output: String,
    /// Directory containing the OpenRussian export.
    or_input: String,
    /// Path of the OpenCorpora input file.
    oc_input: String,
}

impl Args {
    /// Returns `true` when none of the three paths is empty.
    fn populated(&self) -> bool {
        [&self.output, &self.or_input, &self.oc_input]
            .iter()
            .all(|path| !path.is_empty())
    }
}
|
||||||
|
|
||||||
if args.len() != 3 {
|
fn usage(binary_name: &str) {
|
||||||
bail(format!(
|
bail(format!(
|
||||||
"usage: {} <input-file> <output-file>",
|
"usage: {} --output <output-file> --or-input <or-input> --oc-input <oc-input>",
|
||||||
args.first().map(String::as_str).unwrap_or("data-import")
|
binary_name
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
(args.remove(1), args.remove(1))
|
fn parse_args() -> Args {
|
||||||
|
let mut args_iter = env::args();
|
||||||
|
let binary_name = args_iter.next().unwrap();
|
||||||
|
|
||||||
|
let mut args = Args {
|
||||||
|
output: "".into(),
|
||||||
|
or_input: env::var("OPENRUSSIAN_DATA").unwrap_or_default(),
|
||||||
|
oc_input: env::var("OPENCORPORA_DATA").unwrap_or_default(),
|
||||||
};
|
};
|
||||||
|
|
||||||
info!("reading from {input_path}; writing output to {output_path}");
|
loop {
|
||||||
let input_file = File::open(input_path).ensure("failed to open input file");
|
if args.populated() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
while let Some(arg) = args_iter.next() {
|
||||||
|
match arg.as_str() {
|
||||||
|
"--output" => {
|
||||||
|
args.output = args_iter.next().unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
"--or-input" => {
|
||||||
|
args.or_input = args_iter.next().unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
"--oc-input" => {
|
||||||
|
args.oc_input = args_iter.next().unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
_ => usage(&binary_name),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if args.output.is_empty() || args.or_input.is_empty() || args.oc_input.is_empty() {
|
||||||
|
usage(&binary_name);
|
||||||
|
}
|
||||||
|
|
||||||
|
args
|
||||||
|
}
|
||||||
|
|
||||||
|
fn open_corpora(conn: &Connection, args: &Args) {
|
||||||
|
let input_file = File::open(&args.oc_input).ensure("failed to open input file");
|
||||||
let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file));
|
let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file));
|
||||||
|
|
||||||
let conn = Connection::open(output_path).ensure("failed to open DB connection");
|
|
||||||
|
|
||||||
db_setup::initial_oc_schema(&conn);
|
db_setup::initial_oc_schema(&conn);
|
||||||
|
|
||||||
// afterwards:
|
|
||||||
// add actual IDs to grammemes
|
|
||||||
// properly reference keys internally
|
|
||||||
// add foreign key constraint on lemma_grammemes.grammeme
|
|
||||||
|
|
||||||
let mut tx = conn
|
let mut tx = conn
|
||||||
.unchecked_transaction()
|
.unchecked_transaction()
|
||||||
.ensure("failed to start transaction");
|
.ensure("failed to start transaction");
|
||||||
|
|
||||||
let mut count = 0;
|
let mut count = 0;
|
||||||
|
|
||||||
while let Some(elem) = parser.next_element() {
|
while let Some(elem) = parser.next_element() {
|
||||||
|
@ -165,7 +200,46 @@ fn main() {
|
||||||
count += 1;
|
count += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
tx.commit().ensure("final commit failed");
|
tx.commit().ensure("final OpenCorpora commit failed");
|
||||||
|
|
||||||
|
info!("finished OpenCorpora import");
|
||||||
|
}
|
||||||
|
|
||||||
|
fn open_russian(conn: &Connection, args: &Args) {
|
||||||
|
let parser = or_parser::OpenRussianParser::new(&args.or_input);
|
||||||
|
|
||||||
|
db_setup::initial_or_schema(conn);
|
||||||
|
|
||||||
|
let tx = conn
|
||||||
|
.unchecked_transaction()
|
||||||
|
.ensure("failed to start transaction");
|
||||||
|
|
||||||
|
db_setup::insert_or_words(&tx, parser.words());
|
||||||
|
tx.commit().ensure("OpenRussian words commit failed");
|
||||||
|
|
||||||
|
info!("finished OpenRussian import");
|
||||||
|
}
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
env_logger::builder()
|
||||||
|
.filter_level(log::LevelFilter::Info)
|
||||||
|
.init();
|
||||||
|
|
||||||
|
let args = parse_args();
|
||||||
|
|
||||||
|
info!("output path: {}", args.output);
|
||||||
|
info!("OpenCorpora input path: {}", args.oc_input);
|
||||||
|
info!("OpenRussian input path: {}", args.or_input);
|
||||||
|
|
||||||
|
let conn = Connection::open(&args.output).ensure("failed to open DB connection");
|
||||||
|
|
||||||
|
open_corpora(&conn, &args);
|
||||||
|
open_russian(&conn, &args);
|
||||||
|
|
||||||
|
// afterwards:
|
||||||
|
// add actual IDs to grammemes
|
||||||
|
// properly reference keys internally
|
||||||
|
// add foreign key constraint on lemma_grammemes.grammeme
|
||||||
}
|
}
|
||||||
|
|
||||||
/// It's like `expect`, but through `log::error`.
|
/// It's like `expect`, but through `log::error`.
|
||||||
|
|
73
corp/russian/data-import/src/or_parser.rs
Normal file
73
corp/russian/data-import/src/or_parser.rs
Normal file
|
@ -0,0 +1,73 @@
|
||||||
|
//! Parser for the OpenRussian data format.
|
||||||
|
//!
|
||||||
|
//! Note that when exporting OpenRussian data from the project you
|
||||||
|
//! have to choose an encoding. We choose tab-separated CSV files, as
|
||||||
|
//! tabs have a very low probability of actually appearing in the
|
||||||
|
//! input data and this skips some potential encoding issues.
|
||||||
|
|
||||||
|
use super::Ensure;
|
||||||
|
use serde::Deserialize;
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::BufReader;
|
||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
/// A word from the `words` table.
|
||||||
|
#[derive(Debug, Deserialize)]
|
||||||
|
pub struct Word {
|
||||||
|
pub id: usize,
|
||||||
|
pub position: String, // TODO: unknown
|
||||||
|
pub bare: String, // TODO: unknown
|
||||||
|
pub accented: String, // TODO: unknown
|
||||||
|
pub derived_from_word_id: Option<usize>,
|
||||||
|
pub rank: String, // TODO: unknown
|
||||||
|
pub disabled: String, // TODO: unknown
|
||||||
|
pub audio: String, // TODO: unknown
|
||||||
|
pub usage_en: String, // TODO: unknown
|
||||||
|
pub usage_de: String, // TODO: unknown
|
||||||
|
pub number_value: String, // TODO: unknown
|
||||||
|
|
||||||
|
#[serde(rename = "type")]
|
||||||
|
pub word_type: String, // TODO: unknown
|
||||||
|
|
||||||
|
pub level: String, // TODO: unknown
|
||||||
|
pub created_at: String, // TODO: unknown
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct OpenRussianParser {
|
||||||
|
or_directory: PathBuf,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub type DynIter<T> = Box<dyn Iterator<Item = T>>;
|
||||||
|
|
||||||
|
impl OpenRussianParser {
|
||||||
|
pub fn new<P: Into<PathBuf>>(path: P) -> Self {
|
||||||
|
OpenRussianParser {
|
||||||
|
or_directory: path.into(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn words(&self) -> DynIter<Word> {
|
||||||
|
self.parser_for("words.csv")
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parser_for<T: serde::de::DeserializeOwned + 'static>(
|
||||||
|
&self,
|
||||||
|
file_name: &str,
|
||||||
|
) -> Box<dyn Iterator<Item = T>> {
|
||||||
|
let mut path = self.or_directory.clone();
|
||||||
|
path.push(file_name);
|
||||||
|
|
||||||
|
let reader = csv::ReaderBuilder::new()
|
||||||
|
.delimiter(b'\t')
|
||||||
|
.from_reader(BufReader::new(
|
||||||
|
File::open(&path).ensure("failed to open words.csv"),
|
||||||
|
));
|
||||||
|
|
||||||
|
Box::new(reader.into_deserialize().map(|result| {
|
||||||
|
result.ensure(format!(
|
||||||
|
"failed to deserialize {}",
|
||||||
|
std::any::type_name::<T>()
|
||||||
|
))
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in a new issue