Sanitize Makefile by dropping YY - use YEARS instead
diff --git a/Makefile b/Makefile
index b48b0f7..e1fbb6f 100644
--- a/Makefile
+++ b/Makefile
@@ -1,13 +1,21 @@
+# Set SRC_DIR to the directory containing the DNB EPUB files, e.g. with:
+# make -j 96 target/dnb13.index.tar.xz SRC_DIR=../sample.10000
+
 SRC_DIR ?= test/resources/DNB
+
+# Set YEARS to the two-digit years you want to process, e.g. with:
+# make -j12 i5valid YEARS="18 19"
+
+YEARS ?= $(shell seq -w 1998 2024 | sed 's/^.*\([0-9][0-9]\)/\1/')
+
 BUILD_DIR = build
 TARGET_DIR ?= target
 DEPLOY_HOST ?= compute.ids-mannheim.de
 DEPLOY_USER ?= korap
 DEPLOY_PATH ?= /export/netapp/korap4dnb
 MAX_THREADS ?= $(shell nproc)
-YY ?= 18
 MAKE ?= make -j $(shell nproc)
-KORAPXML2CONLLU_HEAP ?= $(shell echo "$$(($$(nproc) * 1625))")
+KORAPXML2CONLLU_HEAP ?= $(shell echo "$$(($(MAX_THREADS) * 1625))")
 KORAPXML2CONLLU ?= java -Xmx$(KORAPXML2CONLLU_HEAP)m -jar lib/korapxml2conllu.jar
 SAXON ?= java -cp lib/saxon9ee.jar:lib/xml-resolver-1.2.jar net.sf.saxon.Transform -expand:off -catalog:"lib/dtds/xhtml11/xhtmlcatalog.xml;lib/dtds/xhtml/dtd/xhtmlcatalog.xml"
 
@@ -19,41 +27,40 @@
 
 all: index
 
-krill: $(TARGET_DIR)/dnb$(YY).krill.tar
-index: $(TARGET_DIR)/dnb$(YY).index.tar.xz
+krill: $(foreach year,$(YEARS),$(TARGET_DIR)/dnb$(year).krill.tar)
 
-$(TARGET_DIR)/dnb$(YY).i5.xml: $(TARGET_DIR)/dnb$(YY).pre.i5.xml  xslt/pass2.xsl xslt/pass3.xsl
+index: $(foreach year,$(YEARS),$(TARGET_DIR)/dnb$(year).index.tar.xz)
+
+$(TARGET_DIR)/dnb%.i5.xml: $(TARGET_DIR)/dnb%.pre.i5.xml  xslt/pass2.xsl xslt/pass3.xsl
 	$(SAXON) -xsl:xslt/pass2.xsl $< | $(SAXON) -xsl:xslt/pass3.xsl - > $@
 
 
-$(TARGET_DIR)/dnb$(YY).pre.i5.xml: $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*.epub))
-	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*0.epub)) > $(TARGET_DIR)/filelist$(YY).txt
-	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*1.epub)) >> $(TARGET_DIR)/filelist$(YY).txt
-	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*2.epub)) >> $(TARGET_DIR)/filelist$(YY).txt
-	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*3.epub)) >> $(TARGET_DIR)/filelist$(YY).txt
-	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*4.epub)) >> $(TARGET_DIR)/filelist$(YY).txt
-	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*5.epub)) >> $(TARGET_DIR)/filelist$(YY).txt
-	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*6.epub)) >> $(TARGET_DIR)/filelist$(YY).txt
-	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*7.epub)) >> $(TARGET_DIR)/filelist$(YY).txt
-	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*8.epub)) >> $(TARGET_DIR)/filelist$(YY).txt
-	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*9.epub)) >> $(TARGET_DIR)/filelist$(YY).txt
-	sed -i -e 's/ /\n/g; /^$$/d' $(TARGET_DIR)/filelist$(YY).txt
-	head -n -1 xslt/idsCorpus-template.xml | sed -e 's/{YY}/$(YY)/' > $@
+$(TARGET_DIR)/dnb%.pre.i5.xml: $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*.epub))
+	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*0.epub)) > $(TARGET_DIR)/filelist$*.txt
+	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*1.epub)) >> $(TARGET_DIR)/filelist$*.txt
+	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*2.epub)) >> $(TARGET_DIR)/filelist$*.txt
+	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*3.epub)) >> $(TARGET_DIR)/filelist$*.txt
+	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*4.epub)) >> $(TARGET_DIR)/filelist$*.txt
+	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*5.epub)) >> $(TARGET_DIR)/filelist$*.txt
+	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*6.epub)) >> $(TARGET_DIR)/filelist$*.txt
+	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*7.epub)) >> $(TARGET_DIR)/filelist$*.txt
+	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*8.epub)) >> $(TARGET_DIR)/filelist$*.txt
+	@echo $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*9.epub)) >> $(TARGET_DIR)/filelist$*.txt
+	sed -i -e 's/ /\n/g; /^$$/d' $(TARGET_DIR)/filelist$*.txt
+	head -n -1 xslt/idsCorpus-template.xml | sed -e 's/{YY}/$*/' > $@
 	@while IFS= read -r f; do \
-		if head -500 "$$f" | grep -Eq '<pubDate type="year">..$(YY)'; then \
+		if head -500 "$$f" | grep -Eq '<pubDate type="year">..$*'; then \
 			cat "$$f" >> $@; \
 		fi; \
-	done < $(TARGET_DIR)/filelist$(YY).txt
+	done < $(TARGET_DIR)/filelist$*.txt
 	tail -n 1 xslt/idsCorpus-template.xml  >> $@
 
-test: $(TARGET_DIR)/dnb$(YY).i5.xml
-	xmllint --noout --valid $<
+test: i5valid
 
-i5: $(TARGET_DIR)/dnb$(YY).i5.xml
-	xmllint --noout $<
+i5: $(foreach year,$(YEARS),$(TARGET_DIR)/dnb$(year).i5.xml)
 
-i5valid: $(TARGET_DIR)/dnb$(YY).i5.xml
-	xmllint --noout --valid $<
+i5valid: i5
+	xmllint --noout --valid $(foreach year,$(YEARS),$(TARGET_DIR)/dnb$(year).i5.xml)
 
 $(BUILD_DIR)/%: $(SRC_DIR)/%.epub
 	mkdir -p $@
@@ -84,7 +91,7 @@
 	curl -sL -o $@  https://corpora.ids-mannheim.de/tools/$@
 
 %.marmot-malt.zip: %.zip models/de.marmot models/german.mco
-	$(KORAPXML2CONLLU) -T $(MAX_THREADS) -t marmot:models/de.marmot -P malt:models/german.mco $< | tee $(TARGET_DIR)/dnb$(YY).marmot-malt.conllu | conllu2korapxml > $@
+	$(KORAPXML2CONLLU) -T $(MAX_THREADS) -t marmot:models/de.marmot -P malt:models/german.mco $< | conllu2korapxml > $@
 
 %.ud.zip: %.zip
 	$(KORAPXML2CONLLU) $< | pv | ./scripts/udpipe2 | conllu2korapxml > $@
@@ -118,13 +125,3 @@
 
 clean:
 	rm -rf $(BUILD_DIR) $(TARGET_DIR)
-
-alli5: i5
-	for yy in $(shell seq -f "%02.f" 95 99) $(shell seq -f "%02.f" 0 24); do \
-	    $(MAKE) i5 YY=$$yy; \
-	done
-
-allindex: i5
-	for yy in $(shell seq -f "%02.f" 95 99) $(shell seq -f "%02.f" 0 24); do \
-	    $(MAKE) index YY=$$yy & \
-	done
diff --git a/Readme.md b/Readme.md
index b6426e3..35f9eee 100644
--- a/Readme.md
+++ b/Readme.md
@@ -2,18 +2,24 @@
 
 ## Run
 
-### To generate I5 corpus
+### To generate an I5 corpus
 
 ```bash
-make -j $(nproc) target/dnb18.i5.xml YY=18
+make -j $(nproc) target/dnb18.i5.xml
 ```
 
-### To generate the KorAP-XML ZIP
+### To generate all I5 corpora
+
+```bash
+make -j $(nproc) i5
+```
+
+### To generate a KorAP-XML ZIP
 
 Prerequisite: [KorAP-XML-CoNLL-U](https://github.com/KorAP/KorAP-XML-CoNLL-U)
 
 ```bash
-make -j $(nproc) target/dnb23.zip YY=23
+make -j $(nproc) target/dnb23.zip
 ```
 
 ### To generate Annotations
@@ -26,10 +32,10 @@
 docker image inspect korap/conllu2spacy:latest || curl -Ls https://corpora.ids-mannheim.de/tools/conllu2spacy.tar.xz | docker load
 ```
 
-Make annotations:
+Make annotations for dnb20:
 
 ```bash
-make -j $(nproc) target/dnb20.marmot-malt.zip target/dnb20.spacy.zip target/dnb20.tree_tagger.zip YY=20
+make -j $(nproc) target/dnb20.marmot-malt.zip target/dnb20.spacy.zip target/dnb20.tree_tagger.zip
 ```
 
 ### To build KorAP index (also directly)
@@ -37,11 +43,19 @@
 Build KorAP all, up to the ~~deployable~~ index:
 
 ```bash
-make -j $(nproc) all YY=23
+make -j $(nproc) all
 ```
 
 ## News
 
+* 2024-04-15
+  * added pass2 and pass3 to xslt conversion to …
+    * fix div, p, hi, ref … nestings
+    * remove empty elements
+    * join subsequent hi elements
+  * improved korapxml2krill performance by using all cores (-1 does not work here)
+  * sanitized the Makefile and dropped YY variable, use YEARS instead
+
 * 2024-04-10
   * multiple authors (and non-authors) are now correctly handled
   * some more .(x)html files are now dropped (toc, cover, etc.)