# Delete a target file when its recipe exits with a non-zero status, so that
# a partially written output is never mistaken for an up-to-date one.
.DELETE_ON_ERROR:

# Several recipes in this file are shell pipelines. With the default
# `/bin/sh -c`, only the exit status of the LAST pipeline stage is reported,
# so a failure in an earlier stage would go unnoticed and .DELETE_ON_ERROR
# would not trigger. Run recipes with bash and `pipefail` so that any failing
# stage fails the whole recipe.
SHELL := /bin/bash
.SHELLFLAGS := -o pipefail -c

# `all` is only an alias, never a file: declare it phony so that a stray file
# named `all` in this directory cannot make the default goal look up to date.
.PHONY: all
all: json
| 4 | |
# Run the eng2tei.py converter over every *.xml file found below the
# ICC-EN source directory. The rule is re-run whenever eng2tei.py changes.
# NOTE(review): the recipe never names the target itself; presumably
# eng2tei.py writes output/corpus.p5.xml on its own -- confirm in the script.
./output/corpus.p5.xml: eng2tei.py
	mkdir -p $(@D)
	python $< $$(find /export/coin/ICC/ICC-EN_written_XML_April2023 -name "*.xml")
Marc Kupietz | 4fe7c87 | 2023-04-19 13:33:42 +0200 | [diff] [blame] | 8 | |
# Feed the converted corpus to xmllint on standard input ("-") and store
# what xmllint prints as the target file.
icc-eng.p5.xml: ./output/corpus.p5.xml
	xmllint - < $< > $@
Marc Kupietz | 4fe7c87 | 2023-04-19 13:33:42 +0200 | [diff] [blame] | 11 | |
# Stream the P5 file through pv into tei2korapxml (reading from "-", i.e.
# standard input) and capture its standard output as the zip target.
icc-eng.zip: icc-eng.p5.xml
	pv $< \
	| tei2korapxml -s -tk - > $@
| 14 | |
# Annotation pipeline, one stage per line:
#   korapxml2conllu  reads the base zip,
#   pv               sits in the middle of the stream,
#   udpipe2          is invoked with the english-partut-ud-2.10-220711 model,
#   conllu2korapxml  writes the result, captured as the target zip.
icc-eng.ud.zip: icc-eng.zip
	korapxml2conllu $< \
	| pv \
	| /usr/local/kl/bin/udpipe2 -r -m english-partut-ud-2.10-220711 \
	| conllu2korapxml > $@
Marc Kupietz | 4fe7c87 | 2023-04-19 13:33:42 +0200 | [diff] [blame] | 17 | |
| 18 | |
# Run `korapxml2krill archive` with every prerequisite zip passed as its own
# `-i` argument (base zip first, then the UD annotation zip).
# NOTE(review): the tool is given `-o icc-eng.krill` (the target name without
# its .tar suffix); presumably the tool or icc-eng.cfg makes it write
# icc-eng.krill.tar -- confirm.
icc-eng.krill.tar: icc-eng.zip icc-eng.ud.zip
	korapxml2krill archive -w -cfg /vol/corpora/ICC/icc-eng.cfg -j 0 --meta ICC $(foreach z,$^,-i $(z)) -o $(basename $@)
Marc Kupietz | 4fe7c87 | 2023-04-19 13:33:42 +0200 | [diff] [blame] | 21 | |
# Unpack the Krill tar into a fresh local json/ directory, mirror that
# directory to korap-worker-07 with rsync, then over ssh: recreate the remote
# index directory, run Krill-Indexer.jar in a korap/kustvakt container on the
# uploaded json, and restart the icc-eng docker-compose project.
# NOTE(review): the target is a directory. It is left in place if the rsync
# or ssh step fails, so a later `make` may consider `json` up to date even
# though the deployment did not complete -- confirm whether this is intended.
json: icc-eng.krill.tar
	rm -rf $@
	mkdir -p $@
	tar -C $@ -xf $<
	rsync -avz --delete $@ korap@korap-worker-07:/opt/korap/icc/eng/
	ssh korap@korap-worker-07 "cd /opt/korap/icc/eng && rm -rf index && mkdir -p index && docker run -u root --rm -v /opt/korap/icc/eng:/data:z korap/kustvakt Krill-Indexer.jar -c /kustvakt/kustvakt-lite.conf -i /data/json -o /data/index/ && INDEX=./index docker-compose --profile=full -p icc-eng restart"