feat(corp/russian/data-import): new OpenCorpora data import tool
Adds the beginning of a tool which can import OpenCorpora data into a SQLite database. This is quite a lot of toil and there's probably a better way to do this, but overall becoming this intimately familiar with the data structures is quite helpful for understanding what I can/can't do with only this dataset. Change-Id: Ieab33a8ce07ea4ac87917b9c8132226bbc6523b1 Reviewed-on: https://cl.tvl.fyi/c/depot/+/7859 Reviewed-by: tazjin <tazjin@tvl.su> Tested-by: BuildkiteCI
This commit is contained in:
parent
032ab16bbb
commit
ee7616d956
6 changed files with 829 additions and 0 deletions
2
corp/russian/data-import/.gitignore
vendored
Normal file
2
corp/russian/data-import/.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
target/
|
||||||
|
all_events.txt
|
384
corp/russian/data-import/Cargo.lock
generated
Normal file
384
corp/russian/data-import/Cargo.lock
generated
Normal file
|
@ -0,0 +1,384 @@
|
||||||
|
# This file is automatically @generated by Cargo.
|
||||||
|
# It is not intended for manual editing.
|
||||||
|
version = 3
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ahash"
|
||||||
|
version = "0.7.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47"
|
||||||
|
dependencies = [
|
||||||
|
"getrandom",
|
||||||
|
"once_cell",
|
||||||
|
"version_check",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "aho-corasick"
|
||||||
|
version = "0.7.20"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "bitflags"
|
||||||
|
version = "1.3.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cc"
|
||||||
|
version = "1.0.78"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a20104e2335ce8a659d6dd92a51a767a0c062599c73b343fd152cb401e828c3d"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "cfg-if"
|
||||||
|
version = "1.0.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "data-import"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"env_logger",
|
||||||
|
"log",
|
||||||
|
"rusqlite",
|
||||||
|
"xml-rs",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "env_logger"
|
||||||
|
version = "0.10.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0"
|
||||||
|
dependencies = [
|
||||||
|
"humantime",
|
||||||
|
"is-terminal",
|
||||||
|
"log",
|
||||||
|
"regex",
|
||||||
|
"termcolor",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "errno"
|
||||||
|
version = "0.2.8"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1"
|
||||||
|
dependencies = [
|
||||||
|
"errno-dragonfly",
|
||||||
|
"libc",
|
||||||
|
"winapi",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "errno-dragonfly"
|
||||||
|
version = "0.1.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
|
||||||
|
dependencies = [
|
||||||
|
"cc",
|
||||||
|
"libc",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "fallible-iterator"
|
||||||
|
version = "0.2.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "fallible-streaming-iterator"
|
||||||
|
version = "0.1.9"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "getrandom"
|
||||||
|
version = "0.2.8"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if",
|
||||||
|
"libc",
|
||||||
|
"wasi",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "hashbrown"
|
||||||
|
version = "0.12.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
|
||||||
|
dependencies = [
|
||||||
|
"ahash",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "hashlink"
|
||||||
|
version = "0.8.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "69fe1fcf8b4278d860ad0548329f892a3631fb63f82574df68275f34cdbe0ffa"
|
||||||
|
dependencies = [
|
||||||
|
"hashbrown",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "hermit-abi"
|
||||||
|
version = "0.2.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "humantime"
|
||||||
|
version = "2.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "io-lifetimes"
|
||||||
|
version = "1.0.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e7d6c6f8c91b4b9ed43484ad1a938e393caf35960fce7f82a040497207bd8e9e"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
"windows-sys",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "is-terminal"
|
||||||
|
version = "0.4.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "28dfb6c8100ccc63462345b67d1bbc3679177c75ee4bf59bf29c8b1d110b8189"
|
||||||
|
dependencies = [
|
||||||
|
"hermit-abi",
|
||||||
|
"io-lifetimes",
|
||||||
|
"rustix",
|
||||||
|
"windows-sys",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "libc"
|
||||||
|
version = "0.2.139"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "201de327520df007757c1f0adce6e827fe8562fbc28bfd9c15571c66ca1f5f79"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "libsqlite3-sys"
|
||||||
|
version = "0.25.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "29f835d03d717946d28b1d1ed632eb6f0e24a299388ee623d0c23118d3e8a7fa"
|
||||||
|
dependencies = [
|
||||||
|
"pkg-config",
|
||||||
|
"vcpkg",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "linux-raw-sys"
|
||||||
|
version = "0.1.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "log"
|
||||||
|
version = "0.4.17"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "memchr"
|
||||||
|
version = "2.5.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "once_cell"
|
||||||
|
version = "1.17.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6f61fba1741ea2b3d6a1e3178721804bb716a68a6aeba1149b5d52e3d464ea66"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pkg-config"
|
||||||
|
version = "0.3.26"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex"
|
||||||
|
version = "1.7.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "48aaa5748ba571fb95cd2c85c09f629215d3a6ece942baa100950af03a34f733"
|
||||||
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
|
"memchr",
|
||||||
|
"regex-syntax",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-syntax"
|
||||||
|
version = "0.6.28"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "456c603be3e8d448b072f410900c09faf164fbce2d480456f50eea6e25f9c848"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rusqlite"
|
||||||
|
version = "0.28.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "01e213bc3ecb39ac32e81e51ebe31fd888a940515173e3a18a35f8c6e896422a"
|
||||||
|
dependencies = [
|
||||||
|
"bitflags",
|
||||||
|
"fallible-iterator",
|
||||||
|
"fallible-streaming-iterator",
|
||||||
|
"hashlink",
|
||||||
|
"libsqlite3-sys",
|
||||||
|
"smallvec",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rustix"
|
||||||
|
version = "0.36.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4feacf7db682c6c329c4ede12649cd36ecab0f3be5b7d74e6a20304725db4549"
|
||||||
|
dependencies = [
|
||||||
|
"bitflags",
|
||||||
|
"errno",
|
||||||
|
"io-lifetimes",
|
||||||
|
"libc",
|
||||||
|
"linux-raw-sys",
|
||||||
|
"windows-sys",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "smallvec"
|
||||||
|
version = "1.10.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "termcolor"
|
||||||
|
version = "1.2.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6"
|
||||||
|
dependencies = [
|
||||||
|
"winapi-util",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "vcpkg"
|
||||||
|
version = "0.2.15"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "version_check"
|
||||||
|
version = "0.9.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "wasi"
|
||||||
|
version = "0.11.0+wasi-snapshot-preview1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "winapi"
|
||||||
|
version = "0.3.9"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
|
||||||
|
dependencies = [
|
||||||
|
"winapi-i686-pc-windows-gnu",
|
||||||
|
"winapi-x86_64-pc-windows-gnu",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "winapi-i686-pc-windows-gnu"
|
||||||
|
version = "0.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "winapi-util"
|
||||||
|
version = "0.1.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178"
|
||||||
|
dependencies = [
|
||||||
|
"winapi",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "winapi-x86_64-pc-windows-gnu"
|
||||||
|
version = "0.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-sys"
|
||||||
|
version = "0.42.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5a3e1820f08b8513f676f7ab6c1f99ff312fb97b553d30ff4dd86f9f15728aa7"
|
||||||
|
dependencies = [
|
||||||
|
"windows_aarch64_gnullvm",
|
||||||
|
"windows_aarch64_msvc",
|
||||||
|
"windows_i686_gnu",
|
||||||
|
"windows_i686_msvc",
|
||||||
|
"windows_x86_64_gnu",
|
||||||
|
"windows_x86_64_gnullvm",
|
||||||
|
"windows_x86_64_msvc",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_aarch64_gnullvm"
|
||||||
|
version = "0.42.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8c9864e83243fdec7fc9c5444389dcbbfd258f745e7853198f365e3c4968a608"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_aarch64_msvc"
|
||||||
|
version = "0.42.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4c8b1b673ffc16c47a9ff48570a9d85e25d265735c503681332589af6253c6c7"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_i686_gnu"
|
||||||
|
version = "0.42.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "de3887528ad530ba7bdbb1faa8275ec7a1155a45ffa57c37993960277145d640"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_i686_msvc"
|
||||||
|
version = "0.42.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "bf4d1122317eddd6ff351aa852118a2418ad4214e6613a50e0191f7004372605"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_x86_64_gnu"
|
||||||
|
version = "0.42.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c1040f221285e17ebccbc2591ffdc2d44ee1f9186324dd3e84e99ac68d699c45"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_x86_64_gnullvm"
|
||||||
|
version = "0.42.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "628bfdf232daa22b0d64fdb62b09fcc36bb01f05a3939e20ab73aaf9470d0463"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_x86_64_msvc"
|
||||||
|
version = "0.42.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "447660ad36a13288b1db4d4248e857b510e8c3a225c822ba4fb748c0aafecffd"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "xml-rs"
|
||||||
|
version = "0.8.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d2d7d3948613f75c98fd9328cfdcc45acc4d360655289d0a7d4ec931392200a3"
|
16
corp/russian/data-import/Cargo.toml
Normal file
16
corp/russian/data-import/Cargo.toml
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
[package]
|
||||||
|
name = "data-import"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
|
||||||
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
env_logger = "0.10.0"
|
||||||
|
log = "0.4.17"
|
||||||
|
rusqlite = "0.28"
|
||||||
|
xml-rs = "0.8"
|
||||||
|
|
||||||
|
[profile.release-with-debug]
|
||||||
|
inherits = "release"
|
||||||
|
debug = true
|
39
corp/russian/data-import/default.nix
Normal file
39
corp/russian/data-import/default.nix
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
# Builds the OpenCorpora data import tool with naersk and exposes a
# development shell through `passthru.shell`.
{ depot, pkgs, ... }:

let
  # Native dependencies shared by the package build and the shell.
  buildInputs = [
    pkgs.sqlite
    pkgs.pkg-config
  ];

  # mirrored input data from OpenCorpora, as of 2023-01-17.
  #
  # This data is licensed under CC-BY-SA.
  inputDataArchive = pkgs.fetchurl {
    name = "dict.opcorpora.xml.bz";
    url = "https://tazj.in/blobs/dict.opcorpora.xml.bz2";
    sha256 = "04n5g43fkfc93z6xlwf2qfdrfdfl562pc2ivdb3cbbbsy56gkqg6";
  };

  # Decompressed dictionary: bunzip2 writes the unpacked archive to
  # the derivation output.
  inputData = pkgs.runCommand "dict.opcorpora.xml" { } ''
    ${pkgs.bzip2}/bin/bunzip2 -k -c ${inputDataArchive} > $out
  '';

  # development shell with native deps
  shell = pkgs.mkShell {
    inherit buildInputs;

    # make OPENCORPORA_DATA available in the environment
    OPENCORPORA_DATA = inputData;
  };
in
depot.third_party.naersk.buildPackage {
  inherit buildInputs;

  src = depot.third_party.gitignoreSource ./.;

  passthru.shell = shell;
}
|
126
corp/russian/data-import/src/main.rs
Normal file
126
corp/russian/data-import/src/main.rs
Normal file
|
@ -0,0 +1,126 @@
|
||||||
|
//! This program imports Russian language data from OpenCorpora
|
||||||
|
//! ("Открытый корпус") into a SQLite database that can be used for
|
||||||
|
//! [//corp/russian][corp-russian] projects.
|
||||||
|
//!
|
||||||
|
//! [corp-russian]: https://at.tvl.fyi/?q=%2F%2Fcorp%2Frussian
|
||||||
|
//!
|
||||||
|
//! Ideally, running this on an OpenCorpora dump should yield a fully
|
||||||
|
//! functional SQLite database compatible with all other tools
|
||||||
|
//! consuming it.
|
||||||
|
//!
|
||||||
|
//! ## OpenCorpora format
|
||||||
|
//!
|
||||||
|
//! The format used is partially documented on the [OpenCorpora
|
||||||
|
//! website][format-docs]. This seems to be a slightly outdated
|
||||||
|
//! format, however, hence some information about what the format
|
||||||
|
//! seems to be today.
|
||||||
|
//!
|
||||||
|
//! [format-docs]: http://opencorpora.org/?page=export
|
||||||
|
//!
|
||||||
|
//! The format is an XML file, which has several categories of data,
|
||||||
|
//! each with their own schema:
|
||||||
|
//!
|
||||||
|
//! * `grammemes`: These define units of grammar. They're *likely* pretty
|
||||||
|
//! static, and we'll *likely* want to map them into a custom set of
|
||||||
|
//! (simpler) categories.
|
||||||
|
//!
|
||||||
|
//! They form some kind of internal hierarchy, where some of them have a
|
||||||
|
//! `parent` attribute set to some other grammeme's `name`.
|
||||||
|
//!
|
||||||
|
//! There's a ridiculous number of these.
|
||||||
|
//!
|
||||||
|
//! * `restrictions`: Unclear, not documented on the page. They describe
|
||||||
|
//! something about the relationship between grammemes.
|
||||||
|
//!
|
||||||
|
//! * `lemmata`: this lists the actual lemmas, as well as all their
|
||||||
|
//! included morphological variants
|
||||||
|
//!
|
||||||
|
//! Each lemma has an `id` attribute uniquely identifying its dictionary
|
||||||
|
//! form, as well as a number of sub-elements:
|
||||||
|
//!
|
||||||
|
//! * the `l` element contains the lemma itself
|
||||||
|
//! * the `f` elements contain morphological variations
|
||||||
|
//!
|
||||||
|
//! Each of these sub elements again contains a number of `g` elements,
|
||||||
|
//! which refer to the IDs of grammemes in their `v` attributes.
|
||||||
|
//!
|
||||||
|
//! * `<link_types>`: These list possible "relationships between lemmas",
|
||||||
|
//! basically just assigning them IDs and names. There's only 27 of
|
||||||
|
//! these.
|
||||||
|
//!
|
||||||
|
//! * `<links>`: Using the types defined above, this establishes links
|
||||||
|
//! between lemmas that have some kind of relationship.
|
||||||
|
//!
|
||||||
|
//! For example, a relationship `cardinal/ordinal` might be established
|
||||||
|
//! between the lemmas "два" and "второй".
|
||||||
|
|
||||||
|
use log::{error, info};
|
||||||
|
use std::env;
|
||||||
|
use std::fmt::Display;
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::{BufReader, BufWriter, Write};
|
||||||
|
|
||||||
|
mod oc_parser;
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
env_logger::builder()
|
||||||
|
.filter_level(log::LevelFilter::Info)
|
||||||
|
.init();
|
||||||
|
|
||||||
|
let input_path = env::args()
|
||||||
|
.skip(1)
|
||||||
|
.next()
|
||||||
|
.ensure("must specify the input filename as the only argument");
|
||||||
|
|
||||||
|
info!("reading from {input_path}");
|
||||||
|
let input_file = File::open(input_path).ensure("failed to open input file");
|
||||||
|
|
||||||
|
let mut parser = oc_parser::OpenCorporaParser::new(BufReader::new(input_file));
|
||||||
|
|
||||||
|
let mut out = BufWriter::new(std::io::stdout().lock());
|
||||||
|
|
||||||
|
while let Some(elem) = parser.next_element() {
|
||||||
|
match elem {
|
||||||
|
oc_parser::OcElement::Grammeme(g) => {
|
||||||
|
writeln!(out, "{:?}", g).ensure("writing element failed")
|
||||||
|
}
|
||||||
|
oc_parser::OcElement::Lemma(_) => continue,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
out.flush().ensure("flushing the out buffer failed");
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Unwrapping helper in the spirit of `expect`: implementors hand back
/// the contained value, and are expected to report `msg` through
/// `log::error` when there is no value to return.
trait Ensure<T> {
    /// Consume `self` and return the wrapped value, using `msg` as the
    /// error description if that is not possible.
    fn ensure<S>(self, msg: S) -> T
    where
        S: Into<String>;
}
|
||||||
|
|
||||||
|
impl<T, E: Display> Ensure<T> for Result<T, E> {
|
||||||
|
fn ensure<S: Into<String>>(self, msg: S) -> T {
|
||||||
|
match self {
|
||||||
|
Ok(x) => x,
|
||||||
|
Err(err) => {
|
||||||
|
error!("{}: {}", msg.into(), err);
|
||||||
|
std::process::exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T> Ensure<T> for Option<T> {
|
||||||
|
fn ensure<S: Into<String>>(self, msg: S) -> T {
|
||||||
|
match self {
|
||||||
|
Some(x) => x,
|
||||||
|
None => {
|
||||||
|
error!("{}", msg.into());
|
||||||
|
std::process::exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn bail<S: Into<String>>(msg: S) -> ! {
|
||||||
|
error!("{}", msg.into());
|
||||||
|
std::process::exit(1);
|
||||||
|
}
|
262
corp/russian/data-import/src/oc_parser.rs
Normal file
262
corp/russian/data-import/src/oc_parser.rs
Normal file
|
@ -0,0 +1,262 @@
|
||||||
|
use super::{bail, Ensure};
|
||||||
|
use log::info;
|
||||||
|
use xml::attribute::OwnedAttribute;
|
||||||
|
use xml::name::OwnedName;
|
||||||
|
use xml::reader::XmlEvent;
|
||||||
|
use xml::EventReader;
|
||||||
|
|
||||||
|
/// A single `<grammeme>` entry from the `grammemes` section.
#[derive(Debug, Default)]
pub struct Grammeme {
    /// Value of the `parent` attribute, if present and non-empty.
    parent: Option<String>,

    /// Text content of the `<name>` child element.
    name: String,

    /// Text content of the `<alias>` child element.
    alias: String,

    /// Text content of the `<description>` child element.
    description: String,
}
|
||||||
|
|
||||||
|
/// A `<lemma>` entry from the `lemmata` section.
///
/// Currently carries no data.
#[derive(Debug)]
pub struct Lemma {}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub enum OcElement {
|
||||||
|
Grammeme(Grammeme),
|
||||||
|
Lemma(Lemma),
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Which part of the document the parser is currently positioned in.
#[derive(PartialEq, Debug)]
enum ParserState {
    /// Outside of any section; the parser is waiting for the start
    /// tag of one.
    Init,

    /// Inside the `grammemes` section.
    Grammemes,

    /// Inside the `lemmata` section.
    Lemmata,

    /// The end of the document has been reached; no further elements
    /// will be produced.
    Ended,
}
|
||||||
|
|
||||||
|
pub struct OpenCorporaParser<R: std::io::Read> {
|
||||||
|
reader: EventReader<R>,
|
||||||
|
state: ParserState,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// How the parser treats a top-level section of the dictionary.
#[derive(PartialEq)]
enum SectionState {
    /// The section is parsed.
    Active,

    /// The section is recognised but skipped for now.
    Inactive,

    /// The section name is not recognised (probably a bug).
    Unknown,
}
|
||||||
|
|
||||||
|
fn section_state(section: &str) -> SectionState {
|
||||||
|
match section {
|
||||||
|
"grammemes" | "lemmata" => SectionState::Active,
|
||||||
|
"restrictions" | "link_types" | "links" => SectionState::Inactive,
|
||||||
|
_ => SectionState::Unknown,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<R: std::io::Read> OpenCorporaParser<R> {
|
||||||
|
pub fn new(reader: R) -> Self {
|
||||||
|
let config = xml::ParserConfig::new().trim_whitespace(true);
|
||||||
|
let reader = EventReader::new_with_config(reader, config);
|
||||||
|
|
||||||
|
Self {
|
||||||
|
reader,
|
||||||
|
state: ParserState::Init,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Pull an `OcElement` out of the parser. Returns `None` if the
|
||||||
|
/// parser stream has ended.
|
||||||
|
pub fn next_element(&mut self) -> Option<OcElement> {
|
||||||
|
if self.state == ParserState::Ended {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pull the next element to determine what context to enter
|
||||||
|
// next.
|
||||||
|
loop {
|
||||||
|
match &self.next() {
|
||||||
|
// no-op events that do not affect parser state
|
||||||
|
XmlEvent::Comment(_)
|
||||||
|
| XmlEvent::Whitespace(_)
|
||||||
|
| XmlEvent::ProcessingInstruction { .. }
|
||||||
|
| XmlEvent::StartDocument { .. } => continue,
|
||||||
|
XmlEvent::StartElement { name, .. } | XmlEvent::EndElement { name }
|
||||||
|
if name.local_name == "dictionary" =>
|
||||||
|
{
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// end of the file, nothing more to return
|
||||||
|
XmlEvent::EndDocument => {
|
||||||
|
self.state = ParserState::Ended;
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
// some sections are skipped
|
||||||
|
XmlEvent::StartElement { name, .. } | XmlEvent::EndElement { name }
|
||||||
|
if section_state(&name.local_name) == SectionState::Inactive =>
|
||||||
|
{
|
||||||
|
info!("skipping {} section", name.local_name);
|
||||||
|
self.skip_section(&name.local_name);
|
||||||
|
}
|
||||||
|
|
||||||
|
// active section events start specific parser states ...
|
||||||
|
XmlEvent::StartElement { name, .. }
|
||||||
|
if section_state(&name.local_name) == SectionState::Active =>
|
||||||
|
{
|
||||||
|
self.state = match name.local_name.as_str() {
|
||||||
|
"grammemes" => ParserState::Grammemes,
|
||||||
|
"lemmata" => ParserState::Lemmata,
|
||||||
|
_ => unreachable!(),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// ... or end them
|
||||||
|
XmlEvent::EndElement { name, .. }
|
||||||
|
if section_state(&name.local_name) == SectionState::Active =>
|
||||||
|
{
|
||||||
|
// TODO: assert that the right section ended
|
||||||
|
self.state = ParserState::Init;
|
||||||
|
}
|
||||||
|
|
||||||
|
// actual beginning of an actual element, dispatch accordingly
|
||||||
|
event @ XmlEvent::StartElement {
|
||||||
|
name, attributes, ..
|
||||||
|
} => match self.state {
|
||||||
|
ParserState::Grammemes => {
|
||||||
|
return Some(OcElement::Grammeme(self.parse_grammeme(name, attributes)))
|
||||||
|
}
|
||||||
|
ParserState::Lemmata => {
|
||||||
|
return Some(OcElement::Lemma(self.parse_lemma(name, attributes)))
|
||||||
|
}
|
||||||
|
|
||||||
|
ParserState::Init | ParserState::Ended => bail(format!(
|
||||||
|
"parser received an unexpected start element while in state {:?}: {:?}",
|
||||||
|
self.state, event
|
||||||
|
)),
|
||||||
|
},
|
||||||
|
|
||||||
|
// finally, events that indicate a bug if they're
|
||||||
|
// encountered here
|
||||||
|
event @ XmlEvent::EndElement { .. }
|
||||||
|
| event @ XmlEvent::CData(_)
|
||||||
|
| event @ XmlEvent::Characters(_) => {
|
||||||
|
bail(format!("unexpected XML event: {:?}", event))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Skip a section by advancing the parser state until we see an
|
||||||
|
/// end element for the skipped section.
|
||||||
|
fn skip_section(&mut self, section: &str) {
|
||||||
|
loop {
|
||||||
|
match self.next() {
|
||||||
|
XmlEvent::EndElement { name } if name.local_name == section => return,
|
||||||
|
_ => continue,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn next(&mut self) -> XmlEvent {
|
||||||
|
self.reader.next().ensure("XML parsing failed")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse a tag that should have plain string content.
|
||||||
|
fn parse_string(&mut self, tag_name: &str) -> String {
|
||||||
|
let mut out = String::new();
|
||||||
|
|
||||||
|
loop {
|
||||||
|
match self.next() {
|
||||||
|
// ignore irrelevant things
|
||||||
|
XmlEvent::Comment(_) | XmlEvent::Whitespace(_) => continue,
|
||||||
|
|
||||||
|
// set the content
|
||||||
|
XmlEvent::Characters(content) => {
|
||||||
|
out = content;
|
||||||
|
}
|
||||||
|
|
||||||
|
// expect the end of the element
|
||||||
|
XmlEvent::EndElement { name } if name.local_name == tag_name => return out,
|
||||||
|
|
||||||
|
// fail on everything unexpected
|
||||||
|
event => bail(format!(
|
||||||
|
"unexpected element while parsing <{}>: {:?}",
|
||||||
|
tag_name, event
|
||||||
|
)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_grammeme(&mut self, name: &OwnedName, attributes: &[OwnedAttribute]) -> Grammeme {
|
||||||
|
if name.local_name != "grammeme" {
|
||||||
|
bail(format!(
|
||||||
|
"expected to parse a grammeme, but found <{}>",
|
||||||
|
name.local_name
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut grammeme = Grammeme::default();
|
||||||
|
|
||||||
|
for attr in attributes {
|
||||||
|
if attr.name.local_name == "parent" && !attr.value.is_empty() {
|
||||||
|
grammeme.parent = Some(attr.value.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
loop {
|
||||||
|
match self.next() {
|
||||||
|
// ignore irrelevant things
|
||||||
|
XmlEvent::Comment(_) | XmlEvent::Whitespace(_) => continue,
|
||||||
|
|
||||||
|
// expect known tags
|
||||||
|
XmlEvent::StartElement { name, .. } if name.local_name == "name" => {
|
||||||
|
grammeme.name = self.parse_string("name");
|
||||||
|
}
|
||||||
|
|
||||||
|
XmlEvent::StartElement { name, .. } if name.local_name == "alias" => {
|
||||||
|
grammeme.alias = self.parse_string("alias");
|
||||||
|
}
|
||||||
|
|
||||||
|
XmlEvent::StartElement { name, .. } if name.local_name == "description" => {
|
||||||
|
grammeme.description = self.parse_string("description");
|
||||||
|
}
|
||||||
|
|
||||||
|
// handle end of the grammeme
|
||||||
|
XmlEvent::EndElement { name } if name.local_name == "grammeme" => break,
|
||||||
|
|
||||||
|
// fail on everything unexpected
|
||||||
|
event => bail(format!(
|
||||||
|
"unexpected element while parsing <grammeme>: {:?}",
|
||||||
|
event
|
||||||
|
)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
grammeme
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_lemma(&mut self, name: &OwnedName, _attributes: &[OwnedAttribute]) -> Lemma {
|
||||||
|
if name.local_name != "lemma" {
|
||||||
|
bail(format!(
|
||||||
|
"expected to parse a lemma, but found <{}>",
|
||||||
|
name.local_name
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
self.skip_section("lemma");
|
||||||
|
|
||||||
|
Lemma {}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in a new issue