feat(corp/russian/data-import): new OpenCorpora data import tool
Adds the beginning of a tool which can import OpenCorpora data into a SQLite database. This is quite a lot of toil and there's probably a better way to do this, but overall becoming this intimately familiar with the data structures is quite helpful for understanding what I can/can't do with only this dataset.

Change-Id: Ieab33a8ce07ea4ac87917b9c8132226bbc6523b1
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7859
Reviewed-by: tazjin <tazjin@tvl.su>
Tested-by: BuildkiteCI
This commit is contained in:
parent
032ab16bbb
commit
ee7616d956
6 changed files with 829 additions and 0 deletions
39
corp/russian/data-import/default.nix
Normal file
39
corp/russian/data-import/default.nix
Normal file
|
@ -0,0 +1,39 @@
|
|||
# Build definition for the OpenCorpora data import tool
# (corp/russian/data-import). Takes `depot` and `pkgs` as arguments.
{ depot, pkgs, ... }:
|
||||
|
||||
let
|
||||
# Dependencies used by both the development shell and the package build
# below (each does `inherit buildInputs`).
buildInputs = with pkgs; [
|
||||
sqlite
|
||||
pkg-config
|
||||
];
|
||||
|
||||
# Mirrored input data from OpenCorpora, as of 2023-01-17.
|
||||
#
|
||||
# This data is licensed under CC-BY-SA.
|
||||
# The compressed archive is fetched from the mirror URL below and pinned by
# its sha256 hash.
# NOTE(review): `name` ends in ".bz" while the URL ends in ".bz2" -- this
# looks like a typo; confirm before renaming.
inputDataArchive = pkgs.fetchurl {
|
||||
name = "dict.opcorpora.xml.bz";
|
||||
url = "https://tazj.in/blobs/dict.opcorpora.xml.bz2";
|
||||
sha256 = "04n5g43fkfc93z6xlwf2qfdrfdfl562pc2ivdb3cbbbsy56gkqg6";
|
||||
};
|
||||
|
||||
# Decompressed dictionary XML: bunzip2 writes the unpacked archive to stdout
# (-c), which is redirected into this derivation's output ($out).
inputData = pkgs.runCommand "dict.opcorpora.xml" { } ''
|
||||
${pkgs.bzip2}/bin/bunzip2 -k -c ${inputDataArchive} > $out
|
||||
'';
|
||||
|
||||
# Development shell with the native dependencies listed in buildInputs.
|
||||
shell = pkgs.mkShell {
|
||||
inherit buildInputs;
|
||||
|
||||
# Make the decompressed dictionary (inputData) available in the shell
# environment as OPENCORPORA_DATA.
|
||||
OPENCORPORA_DATA = inputData;
|
||||
};
|
||||
in
|
||||
# Build the tool with naersk from the contents of this directory
# (presumably filtered by .gitignore via gitignoreSource -- confirm).
depot.third_party.naersk.buildPackage {
|
||||
src = depot.third_party.gitignoreSource ./.;
|
||||
inherit buildInputs;
|
||||
|
||||
# Expose the development shell as the `shell` attribute under passthru.
passthru = {
|
||||
inherit shell;
|
||||
|
||||
|
||||
};
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue