blob: a05b93d4da066ccbe0be6543f33ca89ee8c94f1e [file] [log] [blame]
# Directory containing the I5 corpus sources; override with `make SRC_DIR=...`
# or through the environment.
SRC_DIR ?= I5

# Discover all *.i5.xml files in SRC_DIR
I5_FILES := $(wildcard $(SRC_DIR)/*.i5.xml)
# Corpus base names: source file names without directory and .i5.xml suffix.
BASENAMES := $(patsubst %.i5.xml,%,$(notdir $(I5_FILES)))

# Working directory for intermediate archives and final output directory.
BUILD_DIR = build
TARGET_DIR ?= target

# Default thread budget ($(shell nproc) would be the machine-dependent
# alternative). The remark lives on its own line: a trailing "# comment" on
# the assignment line would leave a blank at the end of the value.
MAX_THREADS ?= 8

# NOTE(review): GNU make predefines MAKE, so this conditional assignment
# normally never takes effect; kept for backward compatibility.
MAKE ?= make -j $(shell nproc)

# Heap budget computed as MAX_THREADS * 2500.
# NOTE(review): neither exported nor referenced by the rules visible here;
# the unit is presumably megabytes -- confirm against korapxmltool.
KORAPXMLTOOL_HEAP ?= $(shell echo "$$(($(MAX_THREADS) * 2500))")

# Path of the korapxmltool executable used by the annotation/conversion rules.
KORAPXMLTOOL ?= ./bin/korapxmltool
13
# Delete a target file when its recipe exits with an error, so a partially
# written archive is never mistaken for an up-to-date one.
.DELETE_ON_ERROR:

# Command-style goals that do not name files.
.PHONY: all clean test index

# Files matching these patterns are never removed automatically by make
# (neither as chained intermediates nor when make is interrupted).
.PRECIOUS: \
  $(BUILD_DIR)/%.zip \
  $(BUILD_DIR)/%.tree_tagger.zip \
  $(BUILD_DIR)/%.marmot-malt.zip \
  $(BUILD_DIR)/%.spacy.zip \
  $(BUILD_DIR)/%.corenlp.zip \
  $(BUILD_DIR)/%.opennlp.zip \
  $(BUILD_DIR)/%.krill.tar \
  %.i5.xml
20
# Default goal: building everything means building the index.
all: index

# Short alias for the index directory below TARGET_DIR.
index: $(TARGET_DIR)/index
24
# Convert one I5 source file into $(BUILD_DIR)/<name>.zip with tei2korapxml.
# After the conversion a sanity line is printed: the number of source lines
# containing "<idsText " and the number of data.xml entries in the archive
# listing, separated by a tab.
# The command substitutions are written as $$( ... ) so that they reach the
# shell; a single-dollar $( ... ) would be expanded by make as an (empty)
# variable reference and the line would always print two empty fields.
$(BUILD_DIR)/%.zip: $(SRC_DIR)/%.i5.xml
	mkdir -p $(BUILD_DIR)
	tei2korapxml --progress -l warn -s -tk - < $< > $@
	printf "%s\t%s\n" "$$(grep -c '<idsText ' $<)" "$$(unzip -l $@ | grep data.xml | wc -l)"
29
30
# TreeTagger annotation archive for one corpus, produced by korapxmltool from
# the base archive and written below BUILD_DIR.
# Former pipeline, kept for reference:
#   $(KORAPXMLTOOL) $< | pv | docker run --rm -i korap/conllu2treetagger -l german | conllu2korapxml > $@
$(BUILD_DIR)/%.tree_tagger.zip: \
  $(BUILD_DIR)/%.zip \
  bin/korapxmltool
	$(KORAPXMLTOOL) \
	  -T treetagger \
	  -t zip --force \
	  -D $(BUILD_DIR) \
	  $<
34
# spaCy annotation archive for one corpus, produced by korapxmltool from the
# base archive and written below BUILD_DIR.
$(BUILD_DIR)/%.spacy.zip: \
  $(BUILD_DIR)/%.zip \
  bin/korapxmltool
	$(KORAPXMLTOOL) \
	  -T spacy \
	  -t zip --force \
	  -D $(BUILD_DIR) \
	  $<
37
# Download the pinned korapxmltool release (v3.1.0) and make it executable.
# curl runs with --fail (-f) so that an HTTP error (e.g. 404) makes the recipe
# fail instead of silently storing the server's error page as the "tool";
# -S keeps the error message visible despite -s.
bin/korapxmltool:
	mkdir -p $(@D)
	curl -fsSL -o $@ https://github.com/korap/korapxmltool/releases/download/v3.1.0/korapxmltool
	chmod +x $@
42
# Download the German MarMoT model.
# curl runs with --fail (-f) so that an HTTP error aborts the recipe instead
# of saving an error page under the model's name; -S keeps the error message
# visible despite -s.
models/de.marmot:
	mkdir -p $(@D)
	curl -fsSL -o $@ https://cistern.cis.lmu.de/marmot/models/CURRENT/spmrl/de.marmot
46
# Model files hosted under https://corpora.ids-mannheim.de/tools/; the remote
# path equals the local target path (models/<file>).
ids_models := \
  models/german.mco \
  models/dereko_domains_s.classifier \
  models/german-fast.tagger \
  models/germanSR.ser.gz \
  models/de-pos-maxent.bin

# One rule header with several targets defines an independent rule per target,
# so each model is still downloaded on its own (and in parallel under -j).
# curl runs with --fail (-f) so that an HTTP error aborts the recipe instead
# of saving an error page under the model's name; -S keeps the error message
# visible despite -s.
$(ids_models):
	mkdir -p $(@D)
	curl -fsSL -o $@ https://corpora.ids-mannheim.de/tools/$@
66
# MarMoT tagging plus Malt parsing for one corpus; both model files must be
# present before korapxmltool runs.
$(BUILD_DIR)/%.marmot-malt.zip: \
  $(BUILD_DIR)/%.zip \
  models/de.marmot \
  models/german.mco \
  bin/korapxmltool
	$(KORAPXMLTOOL) \
	  -T marmot:models/de.marmot \
	  -P malt:models/german.mco \
	  -t zip --force \
	  -D $(BUILD_DIR) \
	  $<
69
# CoreNLP tagging and parsing for one corpus; the tagger and parser model
# files are declared as prerequisites so they are fetched first.
$(BUILD_DIR)/%.corenlp.zip: \
  $(BUILD_DIR)/%.zip \
  models/german-fast.tagger \
  models/germanSR.ser.gz \
  bin/korapxmltool
	$(KORAPXMLTOOL) \
	  -T corenlp \
	  -P corenlp \
	  -t zip --force \
	  -D $(BUILD_DIR) \
	  $<
72
# OpenNLP tagging for one corpus; the POS model is declared as a prerequisite
# so it is fetched first.
$(BUILD_DIR)/%.opennlp.zip: \
  $(BUILD_DIR)/%.zip \
  models/de-pos-maxent.bin \
  bin/korapxmltool
	$(KORAPXMLTOOL) \
	  -T opennlp \
	  -t zip --force \
	  -D $(BUILD_DIR) \
	  $<
75
76# udpipe target removed as requested
77# %.ud.zip: %.zip
78# $(KORAPXMLTOOL) $< | pv | ./scripts/udpipe2 | conllu2korapxml > $@
79
# Merge the base archive and all annotation archives of one corpus into a
# Krill tar file below BUILD_DIR.
# The inputs are passed as $^ (exactly the declared prerequisites, base
# archive first). A shell glob such as "<base>*.zip" would also pick up the
# archives of any other corpus whose name merely starts with the same prefix,
# and would hide the real inputs from make's dependency graph.
$(BUILD_DIR)/%.krill.tar: $(BUILD_DIR)/%.zip $(BUILD_DIR)/%.marmot-malt.zip $(BUILD_DIR)/%.tree_tagger.zip $(BUILD_DIR)/%.spacy.zip $(BUILD_DIR)/%.corenlp.zip $(BUILD_DIR)/%.opennlp.zip
	K2K_PUBLISHER_STRING=1 K2K_TRANSLATOR_TEXT=1 $(KORAPXMLTOOL) --non-word-tokens -f -t krill -D $(BUILD_DIR) $^
82
83
# Pack the finished index directory into an xz-compressed tarball. tar changes
# into the index's parent directory first, so the archive contains only the
# directory's own name; "xz -T0" is used as the compression program.
$(TARGET_DIR)/index.tar.xz: $(TARGET_DIR)/index
	tar -I 'xz -T0' -C $(<D) -c -f $@ $(<F)
86
# Remove everything this Makefile generates: the intermediate build directory
# and the output directory.
clean:
	rm -rf $(BUILD_DIR)
	rm -rf $(TARGET_DIR)
89
# A literal single space for use as a $(subst) argument; leading blanks cannot
# be written directly in a function's first argument.
empty :=
space := $(empty) $(empty)

# Build the index from the Krill tar files of all discovered corpora.
# The tar files are handed to the indexer as ONE semicolon-separated -i
# argument. The list is joined with $(subst $(space),;,...) -- substituting
# the text " " (including the quote characters) never matches anything -- and
# the result is quoted so that the shell does not treat ';' as a command
# separator. The recipe refuses to run, before removing an existing index,
# when no corpus sources were found.
$(TARGET_DIR)/index: $(addsuffix .krill.tar,$(addprefix $(BUILD_DIR)/,$(BASENAMES)))
	@test -n "$(strip $^)" || { echo "no *.i5.xml files found in $(SRC_DIR)" >&2; exit 1; }
	rm -rf $@
	mkdir -p $(TARGET_DIR)
	java -jar lib/Krill-Indexer.jar --progress -c lib/krill.conf -i "$(subst $(space),;,$^)" -o $@