# blob: 7f8c07b5462c7bcdc5ad61b4b4d235a9601bae46
# Recipes below use process substitution (`>(...)`), which needs bash.
SHELL := /bin/bash

# Directory holding the *.i5.xml input corpora.
SRC_DIR ?= I5

# Discover all *.i5.xml files in SRC_DIR
I5_FILES := $(wildcard $(SRC_DIR)/*.i5.xml)
BASENAMES := $(patsubst %.i5.xml,%,$(notdir $(I5_FILES)))

BUILD_DIR = build
TARGET_DIR ?= ./target

# Alternative default: $(shell nproc)
# The comment sits on its own line: a trailing comment on the assignment line
# would leave the whitespace before `#` inside the value.
MAX_THREADS ?= 8

# The former `MAKE ?= make -j $(shell nproc)` was dropped: `?=` only assigns
# when a variable is unset and GNU make always predefines MAKE, so the line
# never took effect. Request parallelism on the command line (`make -j N`).

# KORAPXMLTOOL_HEAP ?= $(shell echo "$$(($(MAX_THREADS) * 2500))")
KORAPXMLTOOL ?= ./bin/korapxmltool
KORAPXMLTOOL_MODELS_PATH ?= models

# Optional value for `docker run --cpu-shares`, e.g. 512 for lower priority
# (default Docker value is 1024). Empty means the flag is not passed.
DOCKER_CPU_SHARES ?=

# Delete a target whose recipe failed, so a partial file never looks current.
.DELETE_ON_ERROR:

# Goals that are commands rather than files.
.PHONY: all clean test index korap check-src pre-krill krill

# Files make must keep: not removed as chain intermediates and not removed
# when make is interrupted.
# NOTE(review): precious targets are presumably also exempt from
# .DELETE_ON_ERROR - confirm that keeping partial files is intended.
.PRECIOUS: \
    $(BUILD_DIR)/%.zip \
    $(BUILD_DIR)/%.tree_tagger.zip \
    $(BUILD_DIR)/%.marmot-malt.zip \
    $(BUILD_DIR)/%.spacy.zip \
    $(BUILD_DIR)/%.corenlp.zip \
    $(BUILD_DIR)/%.cmc.zip \
    $(BUILD_DIR)/%.opennlp.zip \
    $(BUILD_DIR)/%.krill.tar \
    %.i5.xml

# Default goal: check the sources, then run the `korap` goal.
all: check-src korap

# Check the sources, then build the index under TARGET_DIR.
index: check-src $(TARGET_DIR)/index

# Fail early with a readable message when SRC_DIR is missing or contains no
# *.i5.xml file directly inside it (find is limited to depth 1).
check-src:
	@[ -d "$(SRC_DIR)" ] || { \
		echo "Error: SRC_DIR '$(SRC_DIR)' does not exist."; \
		echo "Please create it and place your .i5.xml files there,"; \
		echo "or specify a different directory using SRC_DIR variable."; \
		echo "Example: make SRC_DIR=/path/to/files"; \
		exit 1; \
	}
	@[ -n "$$(find "$(SRC_DIR)" -maxdepth 1 -name '*.i5.xml' -print -quit)" ] || { \
		echo "Error: No .i5.xml files found in '$(SRC_DIR)'."; \
		echo "Please populate it or set SRC_DIR to a different location."; \
		exit 1; \
	}

# Convert one I5 corpus into a KorAP-XML zip with the tei2korapxml image.
# The source is fed on stdin and the zip is read from stdout; stderr is shown
# and also copied to a .log file next to the zip.
#
# Output goes to a temporary file that is renamed only on success. The shell
# redirect creates its output file before docker even starts, and this target
# is listed in .PRECIOUS, so a failed run could otherwise leave a truncated
# zip that looks up to date.
$(BUILD_DIR)/%.zip: $(SRC_DIR)/%.i5.xml
	mkdir -p $(@D)
	docker run --rm -i $(if $(DOCKER_CPU_SHARES),--cpu-shares $(DOCKER_CPU_SHARES)) korap/tei2korapxml:latest -l warn -s -tk - < $< > $@.tmp 2> >(tee $(@:.zip=.log) >&2) || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@
# Alternative invocation that mounts the input file instead of streaming it:
#	docker run --rm $(if $(DOCKER_CPU_SHARES),--cpu-shares $(DOCKER_CPU_SHARES)) -v $(abspath $<):/input.i5.xml:ro korap/tei2korapxml:latest --progress -l warn -s -tk /input.i5.xml > $@ 2> >(tee $(@:.zip=.log) >&2)
	# Print "<idsText count in source> TAB <data.xml entries in zip>" for a quick sanity check.
	printf "%s\t%s\n" "$$(grep -c '<idsText ' $<)" "$$(unzip -l $@ | grep data.xml | wc -l)"


# Produce the tree_tagger annotation zip from the base zip by running
# korapxmltool with `-T treetagger`; output is written into the target's
# directory.
$(BUILD_DIR)/%.tree_tagger.zip: $(BUILD_DIR)/%.zip bin/korapxmltool
	$(KORAPXMLTOOL) -j 1 -T treetagger -t zip --force -D $(@D) $<
# Earlier pipeline, kept for reference:
#	$(KORAPXMLTOOL) $< | pv | docker run --rm -i korap/conllu2treetagger -l german | conllu2korapxml > $@

# Produce the spacy annotation zip from the base zip by running korapxmltool
# with `-P spacy`; output is written into the target's directory.
$(BUILD_DIR)/%.spacy.zip: $(BUILD_DIR)/%.zip bin/korapxmltool
	$(KORAPXMLTOOL) -P spacy -t zip --force -D $(@D) $<

# Download the latest Krill indexer release.
# -f makes curl fail on an HTTP error instead of saving the server's error
# page as the jar; -S keeps the error message visible despite -s.
lib/Krill-Indexer.jar:
	mkdir -p $(@D)
	curl -fsSL -o $@ https://github.com/korap/Krill/releases/latest/download/Krill-Indexer.jar

# Download the latest korapxmltool release and make it executable.
# -f makes curl fail on an HTTP error instead of saving the server's error
# page as the tool; -S keeps the error message visible despite -s.
bin/korapxmltool:
	mkdir -p $(@D)
	curl -fsSL -o $@ https://github.com/korap/korapxmltool/releases/latest/download/korapxmltool
	chmod +x $@

# Download the de.marmot model file.
# -f makes curl fail on an HTTP error instead of saving the server's error
# page as the model; -S keeps the error message visible despite -s.
$(KORAPXMLTOOL_MODELS_PATH)/de.marmot:
	mkdir -p $(@D)
	curl -fsSL -o $@ https://cistern.cis.lmu.de/marmot/models/CURRENT/spmrl/de.marmot

# Download the german.mco model file.
# The URL uses only the file name ($(@F)); building it from $@ produced a
# wrong URL whenever KORAPXMLTOOL_MODELS_PATH was not the default `models`.
# -f makes curl fail on an HTTP error instead of saving the error page.
$(KORAPXMLTOOL_MODELS_PATH)/german.mco:
	mkdir -p $(@D)
	curl -fsSL -o $@ https://corpora.ids-mannheim.de/tools/models/$(@F)

# Download the dereko_domains_s.classifier model file.
# NOTE(review): the previous URL appended the full target path to
# .../tools/models/, which expanded to .../tools/models/models/<file> with the
# default settings. It is aligned here with the sibling model rules
# (.../tools/models/<file>) - confirm against the server layout.
# -f makes curl fail on an HTTP error instead of saving the error page.
$(KORAPXMLTOOL_MODELS_PATH)/dereko_domains_s.classifier:
	mkdir -p $(@D)
	curl -fsSL -o $@ https://corpora.ids-mannheim.de/tools/models/$(@F)

# Download the german-fast.tagger model file.
# The URL uses only the file name ($(@F)); building it from $@ produced a
# wrong URL whenever KORAPXMLTOOL_MODELS_PATH was not the default `models`.
# -f makes curl fail on an HTTP error instead of saving the error page.
$(KORAPXMLTOOL_MODELS_PATH)/german-fast.tagger:
	mkdir -p $(@D)
	curl -fsSL -o $@ https://corpora.ids-mannheim.de/tools/models/$(@F)

# Download the germanSR.ser.gz model file.
# The URL uses only the file name ($(@F)); building it from $@ produced a
# wrong URL whenever KORAPXMLTOOL_MODELS_PATH was not the default `models`.
# -f makes curl fail on an HTTP error instead of saving the error page.
$(KORAPXMLTOOL_MODELS_PATH)/germanSR.ser.gz:
	mkdir -p $(@D)
	curl -fsSL -o $@ https://corpora.ids-mannheim.de/tools/models/$(@F)

# Download the de-pos-maxent.bin model file.
# The URL uses only the file name ($(@F)); building it from $@ produced a
# wrong URL whenever KORAPXMLTOOL_MODELS_PATH was not the default `models`.
# -f makes curl fail on an HTTP error instead of saving the error page.
$(KORAPXMLTOOL_MODELS_PATH)/de-pos-maxent.bin:
	mkdir -p $(@D)
	curl -fsSL -o $@ https://corpora.ids-mannheim.de/tools/models/$(@F)

# Produce the marmot-malt annotation zip from the base zip.
# The model arguments point at the same files the rule depends on; the
# recipe previously hard-coded `models/`, which no longer matched the
# prerequisites once KORAPXMLTOOL_MODELS_PATH was overridden.
$(BUILD_DIR)/%.marmot-malt.zip: $(BUILD_DIR)/%.zip $(KORAPXMLTOOL_MODELS_PATH)/de.marmot $(KORAPXMLTOOL_MODELS_PATH)/german.mco bin/korapxmltool
	$(KORAPXMLTOOL) -T marmot:$(KORAPXMLTOOL_MODELS_PATH)/de.marmot -P malt:$(KORAPXMLTOOL_MODELS_PATH)/german.mco -t zip --force -D $(BUILD_DIR) $<

# Produce the corenlp annotation zip from the base zip by running
# korapxmltool with `-T corenlp -P corenlp`. The two model files are
# prerequisites only; they are not named on the command line.
$(BUILD_DIR)/%.corenlp.zip: \
    $(BUILD_DIR)/%.zip \
    $(KORAPXMLTOOL_MODELS_PATH)/german-fast.tagger \
    $(KORAPXMLTOOL_MODELS_PATH)/germanSR.ser.gz \
    bin/korapxmltool
	$(KORAPXMLTOOL) -T corenlp -P corenlp -t zip --force -D $(@D) $<

# Produce the opennlp annotation zip from the base zip by running
# korapxmltool with `-T opennlp`. The model file is a prerequisite only; it
# is not named on the command line.
$(BUILD_DIR)/%.opennlp.zip: \
    $(BUILD_DIR)/%.zip \
    $(KORAPXMLTOOL_MODELS_PATH)/de-pos-maxent.bin \
    bin/korapxmltool
	$(KORAPXMLTOOL) -T opennlp -t zip --force -D $(@D) $<

# Produce the cmc annotation zip from the base zip. korapxmltool is given the
# korap/conllu-cmc docker image as its `-A` command and `cmc` as `-F`.
$(BUILD_DIR)/%.cmc.zip: $(BUILD_DIR)/%.zip bin/korapxmltool
	$(KORAPXMLTOOL) -j 1 -A "docker run --rm -i korap/conllu-cmc -s" -l error -F cmc -t zip --force -D $(@D) $<

# Produce the gender annotation zip from the base zip. korapxmltool is given
# bin/conllu-gender as its `-A` command and `gender` as `-F`.
# bin/korapxmltool is a prerequisite, as in the sibling annotation rules: the
# recipe runs $(KORAPXMLTOOL), and without the prerequisite a fresh or
# parallel build could reach this rule before the tool had been downloaded.
$(BUILD_DIR)/%.gender.zip: $(BUILD_DIR)/%.zip bin/conllu-gender bin/korapxmltool
	$(KORAPXMLTOOL) -j 1 -A "bin/conllu-gender -s" -l WARNING -F gender -t zip --force -D $(BUILD_DIR) $<

# udpipe target removed as requested
# %.ud.zip: %.zip
#	$(KORAPXMLTOOL) $< | pv | ./scripts/udpipe2 | conllu2korapxml > $@

# For every discovered corpus: the base zip plus each annotation zip that the
# krill conversion depends on, grouped per corpus.
KRILL_PREREQS := $(foreach base,$(BASENAMES),$(addprefix $(BUILD_DIR)/$(base),\
    .zip .marmot-malt.zip .tree_tagger.zip .spacy.zip .corenlp.zip .opennlp.zip .gender.zip))

# Build all of those zips without running the krill conversion itself.
pre-krill: check-src $(KRILL_PREREQS)

# Convert one corpus (base zip plus its annotation zips) into a krill tar.
# The input list is the base zip followed by every `<corpus>.<layer>.zip` in
# BUILD_DIR, so annotation zips that are not prerequisites are picked up too
# when present. The glob requires the dot after the corpus name: the former
# `<corpus>*.zip` pattern also matched zips of other corpora whose names
# merely start with the same prefix (e.g. `foo` picking up `foobar.zip`).
# NOTE(review): the base zip now comes first instead of in glob order -
# presumably korapxmltool does not depend on argument order; confirm.
$(BUILD_DIR)/%.krill.tar: $(BUILD_DIR)/%.zip $(BUILD_DIR)/%.marmot-malt.zip $(BUILD_DIR)/%.tree_tagger.zip $(BUILD_DIR)/%.spacy.zip $(BUILD_DIR)/%.corenlp.zip $(BUILD_DIR)/%.opennlp.zip $(BUILD_DIR)/%.gender.zip
	$(KORAPXMLTOOL) --non-word-tokens -f -t krill -D $(BUILD_DIR) $< $(basename $<).*.zip

# Build the krill tar of every discovered corpus.
krill: $(BASENAMES:%=$(BUILD_DIR)/%.krill.tar)

# A single literal space, used to join the indexer input list below.
empty :=
space := $(empty) $(empty)

# Build the index from the krill tars of all discovered corpora.
# - The indexer jar is an order-only prerequisite rather than a literal
#   `make` call in the recipe, so it is part of the dependency graph (and of
#   -j / -n handling) while staying out of $^ and never forcing a re-index.
# - The inputs are joined with `;` and quoted. The former
#   $(subst " ",;,$^) searched for the three characters quote-space-quote,
#   never matched, and left the list space-separated.
#   NOTE(review): assumes the indexer takes `;`-separated paths for -i, as
#   the substitution intended - confirm against the Krill documentation.
$(TARGET_DIR)/index: $(foreach base,$(BASENAMES),$(BUILD_DIR)/$(base).krill.tar) | lib/Krill-Indexer.jar
	touch lib/krill.cfg
	mkdir -p $(TARGET_DIR)
	java -jar lib/Krill-Indexer.jar -c lib/krill.cfg --progress -i '$(subst $(space),;,$(strip $^))' -o $@

# Fetch the KorAP-Docker compose file and pipe it into `docker compose ... up`
# (read from stdin via `-f -`), with INDEX set to the freshly built index.
korap: check-src $(TARGET_DIR)/index
	curl https://raw.githubusercontent.com/KorAP/KorAP-Docker/master/compose.yaml \
		| INDEX='$(TARGET_DIR)/index' docker compose -p korap -f - --profile=lite --profile=example up

# Pack the index into an xz-compressed tarball. tar changes into the index's
# parent directory first, so the archive holds the bare directory name.
$(TARGET_DIR)/index.tar.xz: $(TARGET_DIR)/index
	tar -I 'xz -T0' -C $(<D) -cf $@ $(<F)

# Remove the build and target directories.
clean:
	$(RM) -r $(BUILD_DIR) $(TARGET_DIR)