Update to modern korapxmltool
Change-Id: I1fbc2135ac1eac212e076b2da6e2aa581623e379
diff --git a/Makefile b/Makefile
index 996b372..8ce1cab 100644
--- a/Makefile
+++ b/Makefile
@@ -16,9 +16,12 @@
MAKE ?= make -j $(shell nproc)
SLACK ?= slack
KORAPXMLTOOL_HEAP ?= $(shell echo "$$(($(MAX_THREADS) * 2500))")
-KORAPXMLTOOL ?= java -Xmx$(KORAPXMLTOOL_HEAP)m -jar lib/korapxmltool.jar
-MARMOTMALTOOL ?= java -Xmx96000m -jar lib/korapxmltool.jar
-SPACYXMLTOOL ?= java -Xmx250g -jar lib/korapxmltool.jar
+KORAPXMLTOOL ?= bin/korapxmltool
+MARMOTMALTOOL ?= bin/korapxmltool
+SPACYXMLTOOL ?= bin/korapxmltool
+KORAPXMLTOOL_MODELS_PATH ?= models
+export KORAPXMLTOOL_MODELS_PATH
+
SAXON ?= java -Djava.util.logging.config.file=/logging.properties -cp lib/saxon-ee-12.5.jar:lib/xmlresolver-5.2.2.jar:lib/textclassifier.jar:lib/xmlresolver-5.2.2-data.jar net.sf.saxon.Transform -expand:off -catalog:"lib/dtds/xhtml11/xhtmlcatalog.xml;lib/dtds/xhtml/dtd/xhtmlcatalog.xml"
.DELETE_ON_ERROR:
@@ -26,7 +29,7 @@
.PHONY: all clean test i5 i5valid krill malt index deploy show-server-log show-server-status
-.PRECIOUS: $(TARGET_DIR)/%.i5.xml $(TARGET_DIR)/dnb%.pre.i5.xml %.zip %.tree_tagger.zip %.ud.zip %.marmot-malt.zip %.spacy.zip %.i5.xml %.tar
+.PRECIOUS: $(TARGET_DIR)/%.i5.xml $(TARGET_DIR)/dnb%.pre.i5.xml %.zip %.tree_tagger.zip %.ud.zip %.marmot-malt.zip %.spacy.zip %.corenlp.zip %.i5.xml %.tar
all: index
@@ -84,11 +87,11 @@
%.tree_tagger.zip: %.zip
- $(KORAPXMLTOOL) -T 1 -A "docker run --rm -i korap/conllu2treetagger -l german" -f zip --overwrite $<
+	$(KORAPXMLTOOL) -A "docker run -v ./models/:/local/models --rm -i korap/conllu-treetagger -l german -p" -t zip -f -D $(TARGET_DIR) $<
# $(KORAPXMLTOOL) $< | pv | docker run --rm -i korap/conllu2treetagger -l german | conllu2korapxml > $@
%.spacy.zip: %.zip
- $(SPACYXMLTOOL) -T 8 -A "docker run -e SPACY_USE_DEPENDENCIES=True --rm -i korap/conllu2spacy:latest" -f zip --overwrite $<
+ $(SPACYXMLTOOL) -A "docker run -e SPACY_USE_DEPENDENCIES=True --rm -i korap/conllu2spacy:latest" -t zip -f -D $(TARGET_DIR) $<
models/de.marmot:
mkdir -p models
@@ -102,18 +105,30 @@
mkdir -p models
curl -sL -o $@ https://corpora.ids-mannheim.de/tools/$@
+models/german-fast.tagger:
+ mkdir -p models
+ curl -sL -o $@ https://corpora.ids-mannheim.de/tools/$@
+
+models/germanSR.ser.gz:
+ mkdir -p models
+ curl -sL -o $@ https://corpora.ids-mannheim.de/tools/$@
+
%.marmot-malt.zip: %.zip models/de.marmot models/german.mco
- $(MARMOTMALTOOL) -T $(MAX_THREADS) -t marmot:models/de.marmot -P malt:models/german.mco -f zip --overwrite $<
+ $(MARMOTMALTOOL) -T marmot -P malt -t zip -f -D $(TARGET_DIR) $<
+
+%.corenlp.zip: %.zip models/german-fast.tagger models/germanSR.ser.gz # presumably the CoreNLP tagger/parser models - confirm names against korapxmltool
+	$(MARMOTMALTOOL) -T corenlp -P corenlp -t zip -f -D $(TARGET_DIR) $<
malt: $(foreach year,$(YEARS),$(TARGET_DIR)/dnb$(year).marmot-malt.zip)
%.ud.zip: %.zip
$(KORAPXMLTOOL) $< | pv | ./scripts/udpipe2 | conllu2korapxml > $@
-%.krill.tar: %.zip %.marmot-malt.zip %.tree_tagger.zip
+%.krill.tar: %.zip %.marmot-malt.zip %.tree_tagger.zip %.spacy.zip %.corenlp.zip
mkdir -p ${BUILD_DIR}/krill/$(basename $@)
mkdir -p $(basename $@)
- K2K_PUBLISHER_STRING=1 K2K_TRANSLATOR_TEXT=1 korapxml2krill archive --quiet -w -z -cfg krill-korap4dnb.cfg -c ${BUILD_DIR}/krill/$(basename $@)/korapxml2krill.cache -j 30 -te ${BUILD_DIR}/krill/$(basename $@) --non-word-tokens --meta I5 -i $< -i $(word 2,$^) -i $(word 3,$^) -o $(basename $@)
+ #K2K_PUBLISHER_STRING=1 K2K_TRANSLATOR_TEXT=1 korapxml2krill archive -w -z -cfg krill-korap4dnb.cfg -c ${BUILD_DIR}/krill/$(basename $@)/korapxml2krill.cache -j 30 -te ${BUILD_DIR}/krill/$(basename $@) --non-word-tokens --meta I5 -i $< -i $(word 2,$^) -i $(word 3,$^) -i $(word 4,$^) -o $(basename $@)
+	K2K_PUBLISHER_STRING=1 K2K_TRANSLATOR_TEXT=1 $(KORAPXMLTOOL) --non-word-tokens -linfo -f -t krill -D $(TARGET_DIR) $^
$(SLACK) "$(basename $@) krill archive created"
%.json: %.krill.tar
@@ -144,4 +159,3 @@
$(TARGET_DIR)/dnb.index: $(foreach year,$(YEARS),$(TARGET_DIR)/dnb$(year).krill.tar)
rm -rf $@
java -jar lib/Krill-Indexer.jar --progress -c lib/krill.conf -i $(subst " ",;,$^) -o $@
-