# blob: 0753194a347481437c74402a7a9a136ed641d842 (repository viewer header)
# Delete a target file when its recipe exits with an error, so a truncated
# output is never treated as up to date on the next run.
.DELETE_ON_ERROR:

# `all` names an action, not a file; declaring it phony keeps a stray file
# called `all` from making the default goal look up to date.
.PHONY: all

# Default goal: build everything up to and including the `json` target.
all: json

# Run the converter script over every *.xml file found below the ICC English
# source directory. Rebuilt whenever eng2tei.py changes.
# NOTE(review): the recipe never mentions $@; eng2tei.py presumably writes
# output/corpus.p5.xml itself -- confirm against the script.
./output/corpus.p5.xml: eng2tei.py
	mkdir -p $(@D)
	python $< $$(find /export/coin/ICC/ICC-EN_written_XML_April2023 -name "*.xml")

# Pass the generated corpus through xmllint (reading from stdin) and store
# xmllint's output as icc-eng.p5.xml.
icc-eng.p5.xml: ./output/corpus.p5.xml
	xmllint - < $< > $@

# Stream the TEI file through pv (progress display) into tei2korapxml, which
# reads stdin ("-") and writes the resulting archive to stdout.
icc-eng.zip: icc-eng.p5.xml
	pv $< \
		| tei2korapxml -s -tk - \
		> $@

# Annotation pipeline: export the base archive as CoNLL-U, run it through
# udpipe2 with the english-partut model, and convert the result back into a
# KorAP-XML archive. pv only reports throughput.
icc-eng.ud.zip: icc-eng.zip
	korapxml2conllu $< \
		| pv \
		| /usr/local/kl/bin/udpipe2 -r -m english-partut-ud-2.10-220711 \
		| conllu2korapxml \
		> $@


# Convert the base archive plus the udpipe annotation archive with
# korapxml2krill. Every prerequisite is passed as its own -i argument.
# NOTE(review): -o receives "icc-eng.krill" (the target without ".tar");
# presumably the tool or the referenced cfg file produces the .tar -- confirm.
icc-eng.krill.tar: icc-eng.zip icc-eng.ud.zip
	korapxml2krill archive -w -cfg /vol/corpora/ICC/icc-eng.cfg -j 0 --meta ICC \
		$(foreach archive,$^,-i $(archive)) -o $(basename $@)

# Unpack the Krill tarball into a fresh ./json directory, mirror that
# directory to korap-worker-07, and rebuild the index there with the
# Krill indexer from the korap/kustvakt Docker image.
json: icc-eng.krill.tar
	rm -rf json
	mkdir -p json
	tar -C json -xf icc-eng.krill.tar
	rsync -avz --delete json korap@korap-worker-07:/opt/korap/icc/eng/KorAP-Docker/
	# The whole remote command is single-quoted on purpose: unquoted, the
	# local shell splits at each "&&", so only "cd" runs on the worker while
	# "rm -rf index", "mkdir" and "docker run" execute on this machine.
	ssh korap@korap-worker-07 'cd /opt/korap/icc/eng/KorAP-Docker && rm -rf index && mkdir -p index && docker run -u root --rm -v /opt/korap/icc/eng/KorAP-Docker:/data:z korap/kustvakt Krill-Indexer.jar -c /kustvakt/kustvakt-lite.conf -i /data/json -o /data/index/'