| Marc Kupietz | 89a4407 | 2025-12-11 07:14:00 +0100 | [diff] [blame^] | 1 | SRC_DIR ?= I5 |
| 2 | # SRC_DIR (overridable) is globbed once for *.i5.xml corpora. |
| 3 | # BASENAMES holds each match without directory and without the .i5.xml suffix, e.g. I5/foo.i5.xml -> foo. |
| 4 | I5_FILES := $(wildcard $(SRC_DIR)/*.i5.xml) |
| 5 | BASENAMES := $(notdir $(I5_FILES:.i5.xml=)) |
| 6 | # User-tunable settings (override on the command line). Keep comments off assignment lines: whitespace before '#' becomes part of the value. |
| 7 | BUILD_DIR = build |
| 8 | TARGET_DIR ?= target |
| 9 | MAX_THREADS ?= 8 |
| 10 | # A former "MAKE ?= make -j ..." default was dropped here: make always predefines MAKE, so ?= never took effect. Use "make -j N" instead. |
| 11 | KORAPXMLTOOL_HEAP ?= $(shell echo "$$(($(MAX_THREADS) * 2500))") |
| 12 | KORAPXMLTOOL ?= ./bin/korapxmltool |
| 13 | # Ask make to delete a target whose recipe exited with an error. |
| 14 | .DELETE_ON_ERROR: |
| 15 | # Goals that name actions rather than files. |
| 16 | .PHONY: all index test clean |
| 17 | |
| 18 | # Do not auto-delete these files when they are only intermediate steps of a chain or when make is interrupted. |
| 19 | .PRECIOUS: $(addprefix $(BUILD_DIR)/%,.zip .tree_tagger.zip .marmot-malt.zip .spacy.zip .corenlp.zip .opennlp.zip .krill.tar) %.i5.xml |
| 20 | # First regular target in this file, hence the default goal: build the index. |
| 21 | all: index |
| 22 | # Short alias for the index directory target. |
| 23 | index: $(TARGET_DIR)/index |
| 24 | # Run tei2korapxml on an I5 file to produce the base zip, then print "<idsText count in source>\t<data.xml entries in zip>" as a sanity check ($$ leaves the command substitutions to the shell; a single $ would be expanded by make to nothing). |
| 25 | $(BUILD_DIR)/%.zip: $(SRC_DIR)/%.i5.xml |
| 26 | mkdir -p $(BUILD_DIR) |
| 27 | tei2korapxml --progress -l warn -s -tk - < $< > $@ |
| 28 | printf "%s\t%s\n" "$$(grep -c '<idsText ' $<)" "$$(unzip -l $@ | grep data.xml | wc -l)" |
| 29 | |
| 30 | # Run korapxmltool with -T treetagger on the base zip. The recipe never names $@, so the tool presumably writes %.tree_tagger.zip into the -D directory itself (TODO confirm). |
| 31 | $(BUILD_DIR)/%.tree_tagger.zip: $(BUILD_DIR)/%.zip bin/korapxmltool |
| 32 | $(KORAPXMLTOOL) -T treetagger -t zip --force -D $(BUILD_DIR) $< |
| 33 | # $(KORAPXMLTOOL) $< | pv | docker run --rm -i korap/conllu2treetagger -l german | conllu2korapxml > $@ |
| 34 | # Run korapxmltool with -T spacy on the base zip. The recipe never names $@, so the tool presumably writes %.spacy.zip into the -D directory itself (TODO confirm). |
| 35 | $(BUILD_DIR)/%.spacy.zip: $(BUILD_DIR)/%.zip bin/korapxmltool |
| 36 | $(KORAPXMLTOOL) -T spacy -t zip --force -D $(BUILD_DIR) $< |
| 37 | # Download the pinned korapxmltool release and make it executable; -f makes curl fail on HTTP errors instead of saving the error page as the tool, -S still reports the error despite -s. |
| 38 | bin/korapxmltool: |
| 39 | mkdir -p bin |
| 40 | curl -fsSL -o $@ https://github.com/korap/korapxmltool/releases/download/v3.1.0/korapxmltool |
| 41 | chmod +x $@ |
| 42 | # Download the MarMoT model; -f makes curl fail on HTTP errors instead of saving an error page as the model. |
| 43 | models/de.marmot: |
| 44 | mkdir -p models |
| 45 | curl -fsSL -o $@ https://cistern.cis.lmu.de/marmot/models/CURRENT/spmrl/de.marmot |
| 46 | # Download this model from the tools server ($@ supplies the models/... path); -f makes curl fail on HTTP errors instead of saving an error page. |
| 47 | models/german.mco: |
| 48 | mkdir -p models |
| 49 | curl -fsSL -o $@ https://corpora.ids-mannheim.de/tools/$@ |
| 50 | # Download this model from the tools server ($@ supplies the models/... path); -f makes curl fail on HTTP errors instead of saving an error page. |
| 51 | models/dereko_domains_s.classifier: |
| 52 | mkdir -p models |
| 53 | curl -fsSL -o $@ https://corpora.ids-mannheim.de/tools/$@ |
| 54 | # Download this model from the tools server ($@ supplies the models/... path); -f makes curl fail on HTTP errors instead of saving an error page. |
| 55 | models/german-fast.tagger: |
| 56 | mkdir -p models |
| 57 | curl -fsSL -o $@ https://corpora.ids-mannheim.de/tools/$@ |
| 58 | # Download this model from the tools server ($@ supplies the models/... path); -f makes curl fail on HTTP errors instead of saving an error page. |
| 59 | models/germanSR.ser.gz: |
| 60 | mkdir -p models |
| 61 | curl -fsSL -o $@ https://corpora.ids-mannheim.de/tools/$@ |
| 62 | # Download this model from the tools server ($@ supplies the models/... path); -f makes curl fail on HTTP errors instead of saving an error page. |
| 63 | models/de-pos-maxent.bin: |
| 64 | mkdir -p models |
| 65 | curl -fsSL -o $@ https://corpora.ids-mannheim.de/tools/$@ |
| 66 | # Tag with MarMoT (-T marmot:<model>) and parse with Malt (-P malt:<model>); both model files are passed to korapxmltool explicitly. |
| 67 | $(BUILD_DIR)/%.marmot-malt.zip: $(BUILD_DIR)/%.zip models/de.marmot models/german.mco bin/korapxmltool |
| 68 | $(KORAPXMLTOOL) -T marmot:models/de.marmot -P malt:models/german.mco -t zip --force -D $(BUILD_DIR) $< |
| 69 | # Tag and parse with CoreNLP. The two model files are prerequisites only; the recipe does not pass their paths, so korapxmltool presumably locates them itself (TODO confirm). |
| 70 | $(BUILD_DIR)/%.corenlp.zip: $(BUILD_DIR)/%.zip models/german-fast.tagger models/germanSR.ser.gz bin/korapxmltool |
| 71 | $(KORAPXMLTOOL) -T corenlp -P corenlp -t zip --force -D $(BUILD_DIR) $< |
| 72 | # Tag with OpenNLP. The model file is a prerequisite only; the recipe does not pass its path, so korapxmltool presumably locates it itself (TODO confirm). |
| 73 | $(BUILD_DIR)/%.opennlp.zip: $(BUILD_DIR)/%.zip models/de-pos-maxent.bin bin/korapxmltool |
| 74 | $(KORAPXMLTOOL) -T opennlp -t zip --force -D $(BUILD_DIR) $< |
| 75 | |
| 76 | # udpipe target removed as requested; the old rule is kept below, commented out, for reference: |
| 77 | # %.ud.zip: %.zip |
| 78 | # $(KORAPXMLTOOL) $< | pv | ./scripts/udpipe2 | conllu2korapxml > $@ |
| 79 | # Feed the base zip plus every annotation zip of this corpus (<stem>.*.zip) to korapxmltool -t krill; the '.' after the stem keeps zips of other corpora that merely share the name prefix out of the glob. |
| 80 | $(BUILD_DIR)/%.krill.tar: $(BUILD_DIR)/%.zip $(BUILD_DIR)/%.marmot-malt.zip $(BUILD_DIR)/%.tree_tagger.zip $(BUILD_DIR)/%.spacy.zip $(BUILD_DIR)/%.corenlp.zip $(BUILD_DIR)/%.opennlp.zip |
| 81 | K2K_PUBLISHER_STRING=1 K2K_TRANSLATOR_TEXT=1 $(KORAPXMLTOOL) --non-word-tokens -f -t krill -D $(BUILD_DIR) $< $(basename $<).*.zip |
| 82 | |
| 83 | # Pack the index directory into an xz-compressed tarball (xz -T0), entering its parent directory first so the archive holds only the directory's own name. |
| 84 | $(TARGET_DIR)/index.tar.xz: $(TARGET_DIR)/index |
| 85 | tar --use-compress-program='xz -T0' -C $(<D) -cf $@ $(<F) |
| 86 | # Remove the build and target directories; downloaded tools and models are left in place. |
| 87 | clean: |
| 88 | rm -fr $(TARGET_DIR) $(BUILD_DIR) |
| 89 | # Rebuild the index from all per-corpus tars. They are joined with ';' into one quoted -i argument, as the original subst intended; make's $(subst " ",...) looked for literal quote characters and never joined anything. |
| 90 | $(TARGET_DIR)/index: $(foreach base,$(BASENAMES),$(BUILD_DIR)/$(base).krill.tar) |
| 91 | rm -rf $@ |
| 92 | mkdir -p $(TARGET_DIR) |
| 93 | java -jar lib/Krill-Indexer.jar --progress -c lib/krill.conf -i "$$(echo $^ | tr ' ' ';')" -o $@ |