From ee0c0ee95103fa10e227a1976149d20e6944001c Mon Sep 17 00:00:00 2001
From: Vincent Ambo
Date: Fri, 20 Jan 2023 11:54:38 +0300
Subject: [PATCH] chore(corp/data-import): make OR data archive available in env

Change-Id: Idacf42743051eae0cf7010f952a4f91af17ad708
Reviewed-on: https://cl.tvl.fyi/c/depot/+/7892
Reviewed-by: tazjin
Tested-by: BuildkiteCI
---
 corp/russian/data-import/default.nix | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/corp/russian/data-import/default.nix b/corp/russian/data-import/default.nix
index c2fc1bf1c..cf358874d 100644
--- a/corp/russian/data-import/default.nix
+++ b/corp/russian/data-import/default.nix
@@ -9,22 +9,29 @@ let
   # mirrored input data from OpenCorpora, as of 2023-01-17.
   #
   # This data is licensed under CC-BY-SA.
-  inputDataArchive = pkgs.fetchurl {
+  openCorporaArchive = pkgs.fetchurl {
     name = "dict.opcorpora.xml.bz";
-    url = "https://tazj.in/blobs/dict.opcorpora.xml.bz2";
+    url = "https://tazj.in/blobs/opencorpora-20230117.xml.bz2";
     sha256 = "04n5g43fkfc93z6xlwf2qfdrfdfl562pc2ivdb3cbbbsy56gkqg6";
   };
 
-  inputData = pkgs.runCommand "dict.opcorpora.xml" { } ''
-    ${pkgs.bzip2}/bin/bunzip2 -k -c ${inputDataArchive} > $out
+  openCorpora = pkgs.runCommand "dict.opcorpora.xml" { } ''
+    ${pkgs.bzip2}/bin/bunzip2 -k -c ${openCorporaArchive} > $out
   '';
 
+  openRussianArchive = pkgs.fetchzip {
+    name = "openrussian-20230117";
+    url = "https://tazj.in/blobs/openrussian-20230117.tar.xz";
+    sha256 = "06jl7i23cx58a0n2626hb82xlzimixvnxp7lxdw0g664kv9bmw25";
+  };
+
   # development shell with native deps
   shell = pkgs.mkShell {
     inherit buildInputs;
 
-    # make OPENCORPORA_DATA available in the environment
-    OPENCORPORA_DATA = inputData;
+    # make datasets available in the environment
+    OPENCORPORA_DATA = openCorpora;
+    OPENRUSSIAN_DATA = openRussianArchive;
   };
 
 in
@@ -33,11 +40,11 @@ lib.fix (self: depot.third_party.naersk.buildPackage {
   inherit buildInputs;
 
   passthru = depot.nix.readTree.drvTargets {
-    inherit shell inputData;
+    inherit shell openCorpora;
 
     # target that actually builds an entire database
     database = pkgs.runCommand "tvl-russian-db.sqlite" { } ''
-      ${self}/bin/data-import ${inputData} $out
+      ${self}/bin/data-import ${openCorpora} $out
     '';
   };
 })