blob: 349118f798d55fbaed440888f2c0a76bae4df3d7 [file] [log] [blame]
# Directory that holds the input corpora (*.i5.xml); override with
# `make SRC_DIR=/path/to/files`.
SRC_DIR ?= I5

# Discover all *.i5.xml files in SRC_DIR
I5_FILES := $(wildcard $(SRC_DIR)/*.i5.xml)
BASENAMES := $(patsubst %.i5.xml,%,$(notdir $(I5_FILES)))

BUILD_DIR = build
TARGET_DIR ?= ./target
# Fixed default; `$(shell nproc)` would be the alternative. The remark lives on
# its own line because whitespace before a trailing comment becomes part of
# the variable's value.
MAX_THREADS ?= 8
# NOTE: the former `MAKE ?= make -j $(shell nproc)` assignment was dropped.
# GNU make always predefines MAKE, so `?=` never took effect; pass `-j N` on
# the command line to build in parallel.
# KORAPXMLTOOL_HEAP ?= $(shell echo "$$(($(MAX_THREADS) * 2500))")
KORAPXMLTOOL ?= ./bin/korapxmltool
KORAPXMLTOOL_MODELS_PATH ?= models
Marc Kupietz89a44072025-12-11 07:14:00 +010014
# Ask make to delete a target whose recipe exits with an error.
# NOTE(review): targets covered by .PRECIOUS below appear to be exempt from
# that clean-up in GNU make -- confirm before relying on it for the archives.
.DELETE_ON_ERROR:

# Goals that name actions, not files.
.PHONY: all clean test index korap check-src

# Keep the products of the chained pattern rules instead of letting make
# remove them as intermediate files.
.PRECIOUS: \
  $(BUILD_DIR)/%.zip \
  $(BUILD_DIR)/%.tree_tagger.zip \
  $(BUILD_DIR)/%.marmot-malt.zip \
  $(BUILD_DIR)/%.spacy.zip \
  $(BUILD_DIR)/%.corenlp.zip \
  $(BUILD_DIR)/%.opennlp.zip \
  $(BUILD_DIR)/%.krill.tar \
  %.i5.xml

# Default goal: validate the input directory, then build the index and start KorAP.
all: check-src korap

# Build only the index, without starting the services.
index: check-src $(TARGET_DIR)/index
24
# Sanity-check the input location before any build step runs: SRC_DIR must
# exist and must directly contain at least one *.i5.xml file. On failure a
# short explanation is printed and the goal exits with status 1.
check-src:
	@test -d "$(SRC_DIR)" || { \
		echo "Error: SRC_DIR '$(SRC_DIR)' does not exist."; \
		echo "Please create it and place your .i5.xml files there,"; \
		echo "or specify a different directory using SRC_DIR variable."; \
		echo "Example: make SRC_DIR=/path/to/files"; \
		exit 1; \
	}
	@first_match="$$(find "$(SRC_DIR)" -maxdepth 1 -name '*.i5.xml' -print -quit)"; \
	test -n "$$first_match" || { \
		echo "Error: No .i5.xml files found in '$(SRC_DIR)'."; \
		echo "Please populate it or set SRC_DIR to a different location."; \
		exit 1; \
	}
Marc Kupietz89a44072025-12-11 07:14:00 +010038
# Convert one I5 corpus file into its base zip archive with tei2korapxml.
#
# The converter's stdout is captured in a temporary file that is renamed to
# the target only after a successful run, so a failed conversion cannot leave
# a truncated archive behind that looks up to date.
#
# The final line prints two tab-separated counts as a quick plausibility
# check: the number of source lines containing '<idsText ' and the number of
# data.xml entries listed in the archive. The command substitutions are
# written with `$$(` so that the shell -- not make -- evaluates them.
$(BUILD_DIR)/%.zip: $(SRC_DIR)/%.i5.xml
	mkdir -p $(BUILD_DIR)
	tei2korapxml --progress -l warn -s -tk $< > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@
	printf "%s\t%s\n" "$$(grep -c '<idsText ' $<)" "$$(unzip -l $@ | grep data.xml | wc -l)"
43
44
# TreeTagger annotation layer for a base archive, produced by korapxmltool.
# The recipe never names $@ itself; it presumably relies on the tool deriving
# the output file name inside the directory given with -D (TODO confirm).
$(BUILD_DIR)/%.tree_tagger.zip: $(BUILD_DIR)/%.zip bin/korapxmltool
	$(KORAPXMLTOOL) \
		-T treetagger \
		-t zip --force \
		-D $(BUILD_DIR) \
		$<
# Earlier docker-based pipeline, kept for reference:
# $(KORAPXMLTOOL) $< | pv | docker run --rm -i korap/conllu2treetagger -l german | conllu2korapxml > $@
48
# spaCy annotation layer for a base archive (requested through -P spacy).
# As with the other annotators, the output name is left to korapxmltool.
$(BUILD_DIR)/%.spacy.zip: $(BUILD_DIR)/%.zip bin/korapxmltool
	$(KORAPXMLTOOL) \
		-P spacy \
		-t zip --force \
		-D $(BUILD_DIR) \
		$<
Marc Kupietz89a44072025-12-11 07:14:00 +010051
# Fetch the latest released Krill indexer jar.
# curl runs with --fail (-f) so that an HTTP error aborts the recipe instead
# of storing the server's error page as the jar; -S keeps error messages
# visible although -s silences the progress meter.
lib/Krill-Indexer.jar:
	mkdir -p $(@D)
	curl -fsSL -o $@ https://github.com/korap/Krill/releases/latest/download/Krill-Indexer.jar
55
# Fetch the latest released korapxmltool launcher and make it executable.
# curl runs with --fail (-f) so that an HTTP error aborts the recipe instead
# of storing the server's error page as the tool; -S keeps error messages
# visible although -s silences the progress meter.
bin/korapxmltool:
	mkdir -p $(@D)
	curl -fsSL -o $@ https://github.com/korap/korapxmltool/releases/latest/download/korapxmltool
	chmod +x $@
60
# Download the de.marmot model file into the models directory.
# --fail (-f) makes an HTTP error abort the recipe rather than saving the
# error page as the model; -S keeps error messages visible despite -s.
$(KORAPXMLTOOL_MODELS_PATH)/de.marmot:
	mkdir -p $(KORAPXMLTOOL_MODELS_PATH)
	curl -fsSL -o $@ https://cistern.cis.lmu.de/marmot/models/CURRENT/spmrl/de.marmot
64
# Download the german.mco model file into the models directory.
# The remote path is spelled out with the file name only ($(@F)); building it
# from $@ tied the URL to the local directory name, so overriding
# KORAPXMLTOOL_MODELS_PATH requested a non-existent URL. With the default
# directory the URL is unchanged.
# --fail (-f) makes an HTTP error abort the recipe rather than saving the
# error page as the model.
$(KORAPXMLTOOL_MODELS_PATH)/german.mco:
	mkdir -p $(KORAPXMLTOOL_MODELS_PATH)
	curl -fsSL -o $@ https://corpora.ids-mannheim.de/tools/models/$(@F)
68
# Download the dereko_domains_s.classifier file into the models directory.
# --fail (-f) makes an HTTP error abort the recipe rather than saving the
# error page as the classifier.
# NOTE(review): with the default models directory this URL contains
# "models/models/", unlike the sibling download rules -- confirm the remote
# location; the URL is deliberately left as it was.
$(KORAPXMLTOOL_MODELS_PATH)/dereko_domains_s.classifier:
	mkdir -p $(KORAPXMLTOOL_MODELS_PATH)
	curl -fsSL -o $@ https://corpora.ids-mannheim.de/tools/models/$@
72
# Download the german-fast.tagger model file into the models directory.
# The remote path uses the file name only ($(@F)) so that overriding
# KORAPXMLTOOL_MODELS_PATH no longer alters the URL; with the default
# directory the URL is unchanged.
# --fail (-f) makes an HTTP error abort the recipe rather than saving the
# error page as the model.
$(KORAPXMLTOOL_MODELS_PATH)/german-fast.tagger:
	mkdir -p $(KORAPXMLTOOL_MODELS_PATH)
	curl -fsSL -o $@ https://corpora.ids-mannheim.de/tools/models/$(@F)
76
# Download the germanSR.ser.gz model file into the models directory.
# The remote path uses the file name only ($(@F)) so that overriding
# KORAPXMLTOOL_MODELS_PATH no longer alters the URL; with the default
# directory the URL is unchanged.
# --fail (-f) makes an HTTP error abort the recipe rather than saving the
# error page as the model.
$(KORAPXMLTOOL_MODELS_PATH)/germanSR.ser.gz:
	mkdir -p $(KORAPXMLTOOL_MODELS_PATH)
	curl -fsSL -o $@ https://corpora.ids-mannheim.de/tools/models/$(@F)
80
# Download the de-pos-maxent.bin model file into the models directory.
# The remote path uses the file name only ($(@F)) so that overriding
# KORAPXMLTOOL_MODELS_PATH no longer alters the URL; with the default
# directory the URL is unchanged.
# --fail (-f) makes an HTTP error abort the recipe rather than saving the
# error page as the model.
$(KORAPXMLTOOL_MODELS_PATH)/de-pos-maxent.bin:
	mkdir -p $(KORAPXMLTOOL_MODELS_PATH)
	curl -fsSL -o $@ https://corpora.ids-mannheim.de/tools/models/$(@F)
84
# MarMoT + Malt annotation layer for a base archive.
# Both model files are prerequisites, so they are downloaded on demand. The
# recipe passes the very paths it depends on; a literal "models/" prefix here
# pointed at the wrong place whenever KORAPXMLTOOL_MODELS_PATH was overridden.
# NOTE(review): assumes korapxmltool treats the part after "marmot:" /
# "malt:" as a path relative to the working directory -- confirm.
$(BUILD_DIR)/%.marmot-malt.zip: $(BUILD_DIR)/%.zip $(KORAPXMLTOOL_MODELS_PATH)/de.marmot $(KORAPXMLTOOL_MODELS_PATH)/german.mco bin/korapxmltool
	$(KORAPXMLTOOL) -T marmot:$(KORAPXMLTOOL_MODELS_PATH)/de.marmot -P malt:$(KORAPXMLTOOL_MODELS_PATH)/german.mco -t zip --force -D $(BUILD_DIR) $<
Marc Kupietz15b313d2025-12-11 12:20:56 +010087
# CoreNLP annotation layer for a base archive (corenlp for both -T and -P).
# The two CoreNLP model files are listed as prerequisites so they are fetched
# first; the recipe itself does not mention their paths.
$(BUILD_DIR)/%.corenlp.zip: $(BUILD_DIR)/%.zip $(KORAPXMLTOOL_MODELS_PATH)/german-fast.tagger $(KORAPXMLTOOL_MODELS_PATH)/germanSR.ser.gz bin/korapxmltool
	$(KORAPXMLTOOL) \
		-T corenlp \
		-P corenlp \
		-t zip --force \
		-D $(BUILD_DIR) \
		$<
90
# OpenNLP annotation layer for a base archive (requested through -T opennlp).
# The OpenNLP model file is a prerequisite so it is fetched first; the recipe
# itself does not mention its path.
$(BUILD_DIR)/%.opennlp.zip: $(BUILD_DIR)/%.zip $(KORAPXMLTOOL_MODELS_PATH)/de-pos-maxent.bin bin/korapxmltool
	$(KORAPXMLTOOL) \
		-T opennlp \
		-t zip --force \
		-D $(BUILD_DIR) \
		$<
93
94# udpipe target removed as requested
95# %.ud.zip: %.zip
96# $(KORAPXMLTOOL) $< | pv | ./scripts/udpipe2 | conllu2korapxml > $@
97
# Combine the base archive and all of its annotation layers into one Krill
# tarball (-t krill).
# The inputs are exactly the declared prerequisites ($^, base archive first).
# The previous shell glob "<base>*.zip" also matched archives of any other
# corpus whose name merely starts with the same characters (e.g. foo and
# foobar), and picked up files make did not know about.
$(BUILD_DIR)/%.krill.tar: $(BUILD_DIR)/%.zip $(BUILD_DIR)/%.marmot-malt.zip $(BUILD_DIR)/%.tree_tagger.zip $(BUILD_DIR)/%.spacy.zip $(BUILD_DIR)/%.corenlp.zip $(BUILD_DIR)/%.opennlp.zip
	K2K_PUBLISHER_STRING=1 K2K_TRANSLATOR_TEXT=1 $(KORAPXMLTOOL) --non-word-tokens -f -t krill -D $(BUILD_DIR) $^
100
# Build the Krill index from the Krill tarballs of every discovered corpus.
#
# - The indexer jar is an order-only prerequisite: it is downloaded when
#   missing, stays out of $^, and no longer needs a nested literal `make`
#   call (which ignored -n and the parent's job server).
# - The tarballs are handed to -i as ONE semicolon-separated, quoted argument.
#   The earlier $(subst " ",;,$^) searched for a space wrapped in quote
#   characters, never matched, and therefore passed the list unjoined.
#   NOTE(review): semicolon-separated input paths follow the Krill indexer's
#   documented -i usage -- confirm against the installed version.
index_empty :=
index_space := $(index_empty) $(index_empty)
$(TARGET_DIR)/index: $(foreach base,$(BASENAMES),$(BUILD_DIR)/$(base).krill.tar) | lib/Krill-Indexer.jar
	touch lib/krill.cfg
	mkdir -p $(TARGET_DIR)
	java -jar lib/Krill-Indexer.jar -c lib/krill.cfg --progress -i '$(subst $(index_space),;,$(strip $^))' -o $@
106
# Start KorAP on top of the freshly built index: the compose file is fetched
# from the KorAP-Docker repository and piped to `docker compose` on stdin
# (-f -), with INDEX set to the index path for that command.
korap: check-src $(TARGET_DIR)/index
	curl https://raw.githubusercontent.com/KorAP/KorAP-Docker/master/compose.yaml \
	| INDEX='$(TARGET_DIR)/index' \
	  docker compose -p korap -f - --profile=lite --profile=example up
Marc Kupietz89a44072025-12-11 07:14:00 +0100109
# Pack the index into an xz-compressed tarball. tar changes into the index's
# parent directory first (-C), so the archive stores only the last path
# component of the index, and compresses through `xz -T0`.
$(TARGET_DIR)/index.tar.xz: $(TARGET_DIR)/index
	tar -I 'xz -T0' -C $(<D) -cf $@ $(<F)
112
# Remove everything this Makefile generates below BUILD_DIR and TARGET_DIR.
# Downloaded tools and models (bin/, lib/, models directory) are not touched.
clean:
	rm -rf $(BUILD_DIR)
	rm -rf $(TARGET_DIR)