Makefile: make sure publisher is indexed
diff --git a/Makefile b/Makefile
index 1b1c62b..d6ffca4 100644
--- a/Makefile
+++ b/Makefile
@@ -3,8 +3,8 @@
YEARS=13 18 23
else
SRC_DIR ?= ./DeLiKo@DNB
-#YEARS ?= $(shell seq -w 2012 2024 | sed 's/^.*\([0-9][0-9]\)/\1/')
-YEARS ?= $(shell seq -w 2005 2024 | sed 's/^.*\([0-9][0-9]\)/\1/')
+YEARS ?= $(shell seq -w 2024 -1 2005 | sed 's/^.*\([0-9][0-9]\)/\1/')
+#YEARS ?= $(shell seq -w 2005 2023 | sed 's/^.*\([0-9][0-9]\)/\1/')
endif
BUILD_DIR = build
@@ -12,7 +12,7 @@
DEPLOY_HOST ?= compute.ids-mannheim.de
DEPLOY_USER ?= korap
DEPLOY_PATH ?= /export/netapp/korap4dnb
-MAX_THREADS ?= $(shell nproc)
+MAX_THREADS ?= 4# was $(shell nproc); no space before '#' so the value is exactly "4"
MAKE ?= make -j $(shell nproc)
KORAPXMLTOOL_HEAP ?= $(shell echo "$$(($(MAX_THREADS) * 2500))")
KORAPXMLTOOL ?= java -Xmx$(KORAPXMLTOOL_HEAP)m -jar lib/korapxmltool.jar
@@ -32,6 +32,7 @@
index: $(TARGET_DIR)/dnb.index
+# EPUBS := $(shell cat .epubs_cache)
EPUBS := $(wildcard $(SRC_DIR)/**/*.epub)
$(TARGET_DIR)/dnb%.i5.xml: $(TARGET_DIR)/dnb%.pre.i5.xml xslt/pass2.xsl xslt/pass3.xsl models/dereko_domains_s.classifier
@@ -75,7 +76,10 @@
%.zip: %.i5.xml
-	tei2korapxml -l warn -s -tk - < $< > $@
+	tei2korapxml -l warn -s -tk - < $< > $@ || (slack "$@ failed" && false)
+# Print <idsText> count of the input vs. data.xml entries in the archive; $$(...) defers the substitution to the shell.
+	printf "%s\t%s\n" "$$(grep -c '<idsText ' $<)" "$$(unzip -l $@ | grep data.xml | wc -l)" && slack "$@ created"
+
%.tree_tagger.zip: %.zip
$(KORAPXMLTOOL) $< | pv | docker run --rm -i korap/conllu2treetagger -l german | conllu2korapxml > $@
@@ -106,13 +110,15 @@
%.krill.tar: %.zip %.marmot-malt.zip %.tree_tagger.zip
mkdir -p ${BUILD_DIR}/krill/$(basename $@)
mkdir -p $(basename $@)
- K2K_TRANSLATOR_TEXT=1 korapxml2krill archive --quiet -w -z -cfg krill-korap4dnb.cfg -c ${BUILD_DIR}/krill/$(basename $@)/korapxml2krill.cache -j $(MAX_THREADS) -te ${BUILD_DIR}/krill/$(basename $@) --non-word-tokens --meta I5 -i $< -i $(word 2,$^) -i $(word 3,$^) -o $(basename $@)
+ K2K_PUBLISHER_STRING=1 K2K_TRANSLATOR_TEXT=1 korapxml2krill archive --quiet -w -z -cfg krill-korap4dnb.cfg -c ${BUILD_DIR}/krill/$(basename $@)/korapxml2krill.cache -j 30 -te ${BUILD_DIR}/krill/$(basename $@) --non-word-tokens --meta I5 -i $< -i $(word 2,$^) -i $(word 3,$^) -o $(basename $@)
+ slack "$(basename $@) krill archive created"
%.json: %.krill.tar
rm -rf $@
mkdir -p $@
for f in $<; do tar -C $@ -xf $$f; done
+krill: $(foreach year,$(YEARS),$(TARGET_DIR)/dnb$(year).krill.tar)