# Recipes below use bash-only syntax (process substitution), so pin the shell.
SHELL := /bin/bash

# Directory holding the input corpora; override with `make SRC_DIR=/path`.
SRC_DIR ?= I5

# Discover all *.i5.xml files in SRC_DIR and derive their base names
# (file name without directory and without the .i5.xml suffix).
I5_FILES := $(wildcard $(SRC_DIR)/*.i5.xml)
BASENAMES := $(patsubst %.i5.xml,%,$(notdir $(I5_FILES)))

BUILD_DIR = build
TARGET_DIR ?= ./target

# Alternative default: $(shell nproc).
# The comment lives on its own line because text between a value and a
# trailing `#` (including the space) would become part of the value.
MAX_THREADS ?= 8

# NOTE: a former `MAKE ?= make -j $(shell nproc)` was dropped here. GNU make
# always predefines MAKE, so a conditional assignment never took effect.
# Request parallel builds on the command line instead: make -j"$(nproc)".

# KORAPXMLTOOL_HEAP ?= $(shell echo "$$(($(MAX_THREADS) * 2500))")
KORAPXMLTOOL ?= ./bin/korapxmltool
KORAPXMLTOOL_MODELS_PATH ?= models

# Optional value for `docker run --cpu-shares`, e.g. 512 for lower priority
# (default Docker value is 1024). Empty means the flag is not passed.
DOCKER_CPU_SHARES ?=
| Marc Kupietz | 89a4407 | 2025-12-11 07:14:00 +0100 | [diff] [blame] | 16 | |
# Remove a target whose recipe failed so it is not mistaken for up to date.
.DELETE_ON_ERROR:

# Command-style goals that never correspond to files.
.PHONY: \
	all \
	clean \
	test \
	index \
	korap \
	check-src \
	pre-krill \
	krill

# Archives that are expensive to rebuild: keep them even when make would
# otherwise treat them as disposable intermediates or when it is interrupted.
.PRECIOUS: \
	$(BUILD_DIR)/%.zip \
	$(BUILD_DIR)/%.tree_tagger.zip \
	$(BUILD_DIR)/%.marmot-malt.zip \
	$(BUILD_DIR)/%.spacy.zip \
	$(BUILD_DIR)/%.corenlp.zip \
	$(BUILD_DIR)/%.cmc.zip \
	$(BUILD_DIR)/%.opennlp.zip \
	$(BUILD_DIR)/%.krill.tar \
	%.i5.xml
| Marc Kupietz | 89a4407 | 2025-12-11 07:14:00 +0100 | [diff] [blame] | 22 | |
# Default goal: validate the source directory, then run the `korap` goal.
all: check-src korap

# Build only the index directory, after validating the source directory.
index: check-src $(TARGET_DIR)/index
| 26 | |
# Sanity check: SRC_DIR must exist and contain at least one *.i5.xml file
# directly inside it (no recursion). Prints a hint and exits 1 otherwise.
check-src:
	@[ -d "$(SRC_DIR)" ] || { \
		echo "Error: SRC_DIR '$(SRC_DIR)' does not exist."; \
		echo "Please create it and place your .i5.xml files there,"; \
		echo "or specify a different directory using SRC_DIR variable."; \
		echo "Example: make SRC_DIR=/path/to/files"; \
		exit 1; \
	}
	@first_match="$$(find "$(SRC_DIR)" -maxdepth 1 -name '*.i5.xml' -print -quit)"; \
	[ -n "$$first_match" ] || { \
		echo "Error: No .i5.xml files found in '$(SRC_DIR)'."; \
		echo "Please populate it or set SRC_DIR to a different location."; \
		exit 1; \
	}
| Marc Kupietz | 89a4407 | 2025-12-11 07:14:00 +0100 | [diff] [blame] | 40 | |
# Convert one I5 corpus file to a zip archive by streaming it through the
# korap/tei2korapxml Docker image (stdin -> stdout). The converter's stderr
# is shown on the terminal and also copied to <stem>.log next to the archive.
#
# The archive is first written to $@.tmp and only renamed to $@ on success:
# this target is listed in .PRECIOUS, so an interrupted run must not leave a
# truncated archive behind that later looks up to date.
#
# Alternative invocation (bind-mounts the input instead of streaming it):
# docker run --rm $(if $(DOCKER_CPU_SHARES),--cpu-shares $(DOCKER_CPU_SHARES)) -v $(abspath $<):/input.i5.xml:ro korap/tei2korapxml:latest --progress -l warn -s -tk /input.i5.xml > $@ 2> >(tee $(@:.zip=.log) >&2)
$(BUILD_DIR)/%.zip: $(SRC_DIR)/%.i5.xml
	mkdir -p $(@D)
	docker run --rm -i $(if $(DOCKER_CPU_SHARES),--cpu-shares $(DOCKER_CPU_SHARES)) \
		korap/tei2korapxml:latest -l warn -s -tk - \
		< $< > $@.tmp 2> >(tee $(@:.zip=.log) >&2) \
		|| { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@
	# Report: number of '<idsText ' lines in the input vs. data.xml entries in the archive.
	printf "%s\t%s\n" "$$(grep -c '<idsText ' $<)" "$$(unzip -l $@ | grep -c data.xml)"
| Marc Kupietz | 89a4407 | 2025-12-11 07:14:00 +0100 | [diff] [blame] | 46 | |
| 47 | |
# Run korapxmltool with `-T treetagger` on the base archive, single job (-j 1).
# korapxmltool is expected to write <stem>.tree_tagger.zip into BUILD_DIR.
# Earlier pipeline, kept for reference:
# $(KORAPXMLTOOL) $< | pv | docker run --rm -i korap/conllu2treetagger -l german | conllu2korapxml > $@
$(BUILD_DIR)/%.tree_tagger.zip: $(BUILD_DIR)/%.zip bin/korapxmltool
	$(KORAPXMLTOOL) \
		-j 1 \
		-T treetagger \
		-t zip --force \
		-D $(BUILD_DIR) \
		$<
| 51 | |
# Run korapxmltool with `-P spacy` on the base archive.
# korapxmltool is expected to write <stem>.spacy.zip into BUILD_DIR.
$(BUILD_DIR)/%.spacy.zip: $(BUILD_DIR)/%.zip bin/korapxmltool
	$(KORAPXMLTOOL) \
		-P spacy \
		-t zip --force \
		-D $(BUILD_DIR) \
		$<
| Marc Kupietz | 89a4407 | 2025-12-11 07:14:00 +0100 | [diff] [blame] | 54 | |
# Download the latest released Krill indexer jar.
# --fail (-f) makes curl exit non-zero on an HTTP error instead of saving the
# server's error page as the jar; -S still prints the error despite -s.
lib/Krill-Indexer.jar:
	mkdir -p $(@D)
	curl -fsSL -o $@ https://github.com/korap/Krill/releases/latest/download/Krill-Indexer.jar
| Marc Kupietz | 5b3ecc0 | 2025-12-15 14:43:09 +0100 | [diff] [blame] | 58 | |
# Download the latest released korapxmltool and make it executable.
# --fail (-f) makes curl exit non-zero on an HTTP error instead of saving the
# server's error page as the tool; -S still prints the error despite -s.
bin/korapxmltool:
	mkdir -p $(@D)
	curl -fsSL -o $@ https://github.com/korap/korapxmltool/releases/latest/download/korapxmltool
	chmod +x $@
| 63 | |
# Download the de.marmot model file into the models directory.
# --fail (-f) makes curl exit non-zero on an HTTP error instead of saving the
# server's error page as the model; -S still prints the error despite -s.
$(KORAPXMLTOOL_MODELS_PATH)/de.marmot:
	mkdir -p $(@D)
	curl -fsSL -o $@ https://cistern.cis.lmu.de/marmot/models/CURRENT/spmrl/de.marmot
| 67 | |
# Download the german.mco model file into the models directory.
# The URL uses only the file name ($(@F)): the previous `tools/$@` form
# resolved to tools/models/german.mco only while KORAPXMLTOOL_MODELS_PATH was
# left at its default `models`, and produced a bogus URL once it was overridden.
# --fail (-f) keeps an HTTP error page from being saved as the model.
$(KORAPXMLTOOL_MODELS_PATH)/german.mco:
	mkdir -p $(@D)
	curl -fsSL -o $@ https://corpora.ids-mannheim.de/tools/models/$(@F)
| 71 | |
# Download the dereko_domains_s.classifier model file into the models directory.
# --fail (-f) keeps an HTTP error page from being saved as the model.
# NOTE(review): with the default KORAPXMLTOOL_MODELS_PATH (`models`) this URL
# expands to .../tools/models/models/dereko_domains_s.classifier, i.e. with a
# doubled `models/`, unlike the sibling download rules - confirm the server
# path before relying on this target.
$(KORAPXMLTOOL_MODELS_PATH)/dereko_domains_s.classifier:
	mkdir -p $(@D)
	curl -fsSL -o $@ https://corpora.ids-mannheim.de/tools/models/$@
| 75 | |
# Download the german-fast.tagger model file into the models directory.
# The URL uses only the file name ($(@F)): the previous `tools/$@` form was
# correct only while KORAPXMLTOOL_MODELS_PATH kept its default `models`.
# --fail (-f) keeps an HTTP error page from being saved as the model.
$(KORAPXMLTOOL_MODELS_PATH)/german-fast.tagger:
	mkdir -p $(@D)
	curl -fsSL -o $@ https://corpora.ids-mannheim.de/tools/models/$(@F)
| 79 | |
# Download the germanSR.ser.gz model file into the models directory.
# The URL uses only the file name ($(@F)): the previous `tools/$@` form was
# correct only while KORAPXMLTOOL_MODELS_PATH kept its default `models`.
# --fail (-f) keeps an HTTP error page from being saved as the model.
$(KORAPXMLTOOL_MODELS_PATH)/germanSR.ser.gz:
	mkdir -p $(@D)
	curl -fsSL -o $@ https://corpora.ids-mannheim.de/tools/models/$(@F)
| 83 | |
# Download the de-pos-maxent.bin model file into the models directory.
# The URL uses only the file name ($(@F)): the previous `tools/$@` form was
# correct only while KORAPXMLTOOL_MODELS_PATH kept its default `models`.
# --fail (-f) keeps an HTTP error page from being saved as the model.
$(KORAPXMLTOOL_MODELS_PATH)/de-pos-maxent.bin:
	mkdir -p $(@D)
	curl -fsSL -o $@ https://corpora.ids-mannheim.de/tools/models/$(@F)
| 87 | |
# Run korapxmltool with `-T marmot:<model>` and `-P malt:<model>` on the base
# archive. The model paths are taken from KORAPXMLTOOL_MODELS_PATH so that
# they always name the files this rule depends on (and downloads); with the
# default value `models` the command line is unchanged.
$(BUILD_DIR)/%.marmot-malt.zip: $(BUILD_DIR)/%.zip $(KORAPXMLTOOL_MODELS_PATH)/de.marmot $(KORAPXMLTOOL_MODELS_PATH)/german.mco bin/korapxmltool
	$(KORAPXMLTOOL) -T marmot:$(KORAPXMLTOOL_MODELS_PATH)/de.marmot -P malt:$(KORAPXMLTOOL_MODELS_PATH)/german.mco -t zip --force -D $(BUILD_DIR) $<
| Marc Kupietz | 5b3ecc0 | 2025-12-15 14:43:09 +0100 | [diff] [blame] | 90 | |
# Run korapxmltool with `-T corenlp -P corenlp` on the base archive.
# The two model files are prerequisites only; they are not named on the
# command line, so korapxmltool presumably locates them itself (TODO confirm).
$(BUILD_DIR)/%.corenlp.zip: \
		$(BUILD_DIR)/%.zip \
		$(KORAPXMLTOOL_MODELS_PATH)/german-fast.tagger \
		$(KORAPXMLTOOL_MODELS_PATH)/germanSR.ser.gz \
		bin/korapxmltool
	$(KORAPXMLTOOL) \
		-T corenlp -P corenlp \
		-t zip --force \
		-D $(BUILD_DIR) \
		$<
| 93 | |
# Run korapxmltool with `-T opennlp` on the base archive.
# The model file is a prerequisite only; it is not named on the command line.
$(BUILD_DIR)/%.opennlp.zip: \
		$(BUILD_DIR)/%.zip \
		$(KORAPXMLTOOL_MODELS_PATH)/de-pos-maxent.bin \
		bin/korapxmltool
	$(KORAPXMLTOOL) \
		-T opennlp \
		-t zip --force \
		-D $(BUILD_DIR) \
		$<
| 96 | |
# Run korapxmltool on the base archive with the korap/conllu-cmc Docker image
# as the external command given to -A, single job (-j 1), `-F cmc`.
$(BUILD_DIR)/%.cmc.zip: $(BUILD_DIR)/%.zip bin/korapxmltool
	$(KORAPXMLTOOL) \
		-j 1 \
		-A "docker run --rm -i korap/conllu-cmc -s" \
		-l error \
		-F cmc \
		-t zip --force \
		-D $(BUILD_DIR) \
		$<
| Marc Kupietz | 800f9bb | 2025-12-11 17:30:24 +0100 | [diff] [blame] | 99 | |
# Run korapxmltool on the base archive with bin/conllu-gender as the external
# command given to -A, single job (-j 1), `-F gender`.
# bin/korapxmltool is listed as a prerequisite, like in the sibling rules,
# so the tool is downloaded even when this archive is the only goal.
# NOTE(review): no rule for bin/conllu-gender is visible in this file; it is
# assumed to be provided separately.
$(BUILD_DIR)/%.gender.zip: $(BUILD_DIR)/%.zip bin/conllu-gender bin/korapxmltool
	$(KORAPXMLTOOL) -j 1 -A "bin/conllu-gender -s" -l WARNING -F gender -t zip --force -D $(BUILD_DIR) $<
| Marc Kupietz | a2876eb | 2026-03-07 22:03:17 +0100 | [diff] [blame] | 102 | |
# The udpipe target was removed on request; the former rule is kept here for reference:
# %.ud.zip: %.zip
#	$(KORAPXMLTOOL) $< | pv | ./scripts/udpipe2 | conllu2korapxml > $@
| 106 | |
# Archive suffixes required per corpus before the Krill conversion:
# the base archive plus one archive per annotation layer.
krill_layer_suffixes := \
	.zip \
	.marmot-malt.zip \
	.tree_tagger.zip \
	.spacy.zip \
	.corenlp.zip \
	.opennlp.zip \
	.gender.zip

# Every required archive for every discovered corpus, grouped by corpus.
KRILL_PREREQS := $(foreach base,$(BASENAMES),$(addprefix $(BUILD_DIR)/$(base),$(krill_layer_suffixes)))

# Build all archives needed for the Krill conversion, but not the conversion itself.
pre-krill: check-src $(KRILL_PREREQS)
| 110 | |
# Convert one corpus to Krill format from its base archive plus every
# annotation archive present for it in BUILD_DIR (the glob also picks up
# optional layers that are not prerequisites here).
#
# The glob is anchored on "<stem>." so that archives of another corpus whose
# name merely starts with this stem (e.g. `abc` vs. `abcd`) are no longer
# swept in, as happened with `<stem>*.zip`. The base archive is passed last,
# which is where the previous single glob sorted it.
# NOTE(review): a corpus literally named "<stem>.<something>" would still match.
$(BUILD_DIR)/%.krill.tar: $(BUILD_DIR)/%.zip $(BUILD_DIR)/%.marmot-malt.zip $(BUILD_DIR)/%.tree_tagger.zip $(BUILD_DIR)/%.spacy.zip $(BUILD_DIR)/%.corenlp.zip $(BUILD_DIR)/%.opennlp.zip $(BUILD_DIR)/%.gender.zip
	$(KORAPXMLTOOL) --non-word-tokens -f -t krill -D $(BUILD_DIR) $(basename $<).*.zip $<
| Marc Kupietz | 89a4407 | 2025-12-11 07:14:00 +0100 | [diff] [blame] | 113 | |
# Build the Krill tar file of every discovered corpus.
krill: $(patsubst %,$(BUILD_DIR)/%.krill.tar,$(BASENAMES))
| 115 | |
# A single space, needed as the search text for $(subst) below.
krill_index_empty :=
krill_index_space := $(krill_index_empty) $(krill_index_empty)

# Build the index from the Krill tar files of all discovered corpora.
#
# - The indexer jar is an order-only prerequisite: it is fetched when missing
#   but neither triggers a re-index nor shows up in $^ (this replaces a
#   literal `make lib/Krill-Indexer.jar` call inside the recipe).
# - The inputs are joined with ';' and passed as ONE quoted argument. The
#   former $(subst " ",;,$^) searched for the three characters quote-space-
#   quote, never matched, and so passed the tar files space-separated; an
#   unquoted ';' would additionally have ended the shell command.
#   NOTE(review): ';' as the separator follows the original intent - confirm
#   against the Krill indexer's -i option.
$(TARGET_DIR)/index: $(foreach base,$(BASENAMES),$(BUILD_DIR)/$(base).krill.tar) | lib/Krill-Indexer.jar
	mkdir -p lib $(TARGET_DIR)
	touch lib/krill.cfg
	java -jar lib/Krill-Indexer.jar -c lib/krill.cfg --progress -i '$(subst $(krill_index_space),;,$(strip $^))' -o $@
| 121 | |
# Start KorAP via docker compose (project `korap`, profiles `lite` and
# `example`), using the compose file fetched from the KorAP-Docker repository
# and the freshly built index passed through the INDEX environment variable.
# --fail (-f) makes curl emit nothing and exit non-zero on an HTTP error, so
# an error page is never fed to docker compose as if it were a compose file.
korap: check-src $(TARGET_DIR)/index
	curl -fsSL https://raw.githubusercontent.com/KorAP/KorAP-Docker/master/compose.yaml | INDEX='$(TARGET_DIR)/index' docker compose -p korap -f - --profile=lite --profile=example up
| Marc Kupietz | 89a4407 | 2025-12-11 07:14:00 +0100 | [diff] [blame] | 124 | |
# Pack the index directory into an xz-compressed tar file. tar runs from the
# index's parent directory, so the archive holds just the directory's own name.
$(TARGET_DIR)/index.tar.xz: $(TARGET_DIR)/index
	tar --use-compress-program='xz -T0' --directory=$(<D) --create --file=$@ $(<F)
| 127 | |
# Remove everything this Makefile generates: the build and target directories.
clean:
	rm -rf $(BUILD_DIR)
	rm -rf $(TARGET_DIR)