Add preliminary support for splitting the corpus into annual volumes
and drop support for building a single, unsplit corpus.
Example: `make test YY=18` builds and validates the 2018 volume.
Splitting does not yet work for the index.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 534c233..732b8b5 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -56,13 +56,13 @@
- apt-cache/
- key:
files:
- - target/dnb.i5.xml
+ - target/dnb18.i5.xml
paths:
- - target/dnb.zip
- - target/dnb.tree_tagger.zip
- - target/dnb.marmot-malt.zip
- - target/dnb.spacy.zip
- - target/dnb.krill.tar
+ - target/dnb18.zip
+ - target/dnb18.tree_tagger.zip
+ - target/dnb18.marmot-malt.zip
+ - target/dnb18.spacy.zip
+ - target/dnb18.krill.tar
before_script:
- source `find .. -name section_helper.sh`
- start_section install_linux_packages "Installing missing Linux packages"
@@ -92,12 +92,12 @@
- touch target/*.zip # ignore timestamps in make
- sleep 1
- touch target/*.*.zip
- - MAX_THREADS=2 make -j $(nproc) target/dnb.index.tar.xz
+ - MAX_THREADS=2 make -j $(nproc) target/dnb18.index.tar.xz
- end_section building_krill
artifacts:
paths:
- target/*.zip
- - target/dnb.index.tar.xz
+ - target/dnb18.index.tar.xz
deploy:
stage: deploy
@@ -118,6 +118,6 @@
- end_section setup_ssh
script:
- start_section korapxmlu "Uploading index to KorAP4DNB instance"
- - touch target/dnb.index.tar.xz
+ - touch target/dnb18.index.tar.xz
- make deploy
- end_section deploy
diff --git a/Makefile b/Makefile
index 8b5e4c4..475a863 100644
--- a/Makefile
+++ b/Makefile
@@ -5,6 +5,7 @@
DEPLOY_USER ?= korap
DEPLOY_PATH ?= /export/netapp/korap4dnb
MAX_THREADS ?= $(shell nproc)
+YY ?= 18
.PHONY: all clean test krill index deploy server-log server-status
@@ -14,17 +15,17 @@
all: index
-krill: $(TARGET_DIR)/dnb.krill.tar
-index: $(TARGET_DIR)/dnb.index.tar.xz
+krill: $(TARGET_DIR)/dnb$(YY).krill.tar
+index: $(TARGET_DIR)/dnb$(YY).index.tar.xz
KORAPXML2CONLLU ?= java -jar lib/korapxml2conllu.jar
-$(TARGET_DIR)/dnb.i5.xml: $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*.epub))
- head -n -1 xslt/idsCorpus-template.xml > $@
- cat $^ >> $@
+$(TARGET_DIR)/dnb$(YY).i5.xml: $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*.epub))
+ head -n -1 xslt/idsCorpus-template.xml | sed -e 's/{YY}/$(YY)/' > $@
+ for f in $^; do if head -500 $$f | grep -Eq '<pubDate type="year">..$(YY)'; then cat $$f >> $@; fi; done
tail -n 1 xslt/idsCorpus-template.xml >> $@
-test: $(TARGET_DIR)/dnb.i5.xml
+test: $(TARGET_DIR)/dnb$(YY).i5.xml
xmllint --noout --valid $<
$(BUILD_DIR)/%: $(SRC_DIR)/%.epub
@@ -55,7 +56,7 @@
curl -sL -o $@ https://corpora.ids-mannheim.de/tools/$@
%.marmot-malt.zip: %.zip models/de.marmot models/german.mco
- $(KORAPXML2CONLLU) -T $(MAX_THREADS) -t marmot:models/de.marmot -P malt:models/german.mco $< | tee $(TARGET_DIR)/dnb.marmot-malt.conllu | conllu2korapxml > $@
+	$(KORAPXML2CONLLU) -T $(MAX_THREADS) -t marmot:models/de.marmot -P malt:models/german.mco $< | tee $*.marmot-malt.conllu | conllu2korapxml > $@
%.ud.zip: %.zip
$(KORAPXML2CONLLU) $< | pv | ./scripts/udpipe2 | conllu2korapxml > $@
diff --git a/Readme.md b/Readme.md
index 390fc8c..b6426e3 100644
--- a/Readme.md
+++ b/Readme.md
@@ -5,7 +5,7 @@
### To generate I5 corpus
```bash
-make -j $(nproc) target/dnb.i5.xml
+make -j $(nproc) target/dnb18.i5.xml YY=18
```
### To generate the KorAP-XML ZIP
@@ -13,12 +13,12 @@
Prerequisite: [KorAP-XML-CoNLL-U](https://github.com/KorAP/KorAP-XML-CoNLL-U)
```bash
-make -j $(nproc) target/dnb.zip
+make -j $(nproc) target/dnb23.zip YY=23
```
### To generate Annotations
-Install prerequisite korap/conllu2treetagger and korap/conllu2spacy docker imeges if not present:
+Install prerequisite korap/conllu2treetagger and korap/conllu2spacy docker images if not present:
```bash
docker image inspect korap/conllu2treetagger:latest || curl -Ls 'https://gitlab.ids-mannheim.de/KorAP/CoNLL-U-Treetagger/-/jobs/artifacts/master/raw/conllu2treetagger.xz?job=build-docker-image' | docker load
@@ -29,19 +29,26 @@
Make annotations:
```bash
-make -j $(nproc) target/dnb.marmot-malt.zip target/dnb.spacy.zip target/dnb.tree_tagger.zip
+make -j $(nproc) target/dnb20.marmot-malt.zip target/dnb20.spacy.zip target/dnb20.tree_tagger.zip YY=20
```
### To build KorAP index (also directly)
-Build KorAP all, up to the deployable index:
+Build KorAP all, up to the ~~deployable~~ index:
```bash
-make -j $(nproc) all
+make -j $(nproc) all YY=23
```
## News
+* 2024-04-10
+ * multiple authors (and non-authors) are now correctly handled
+ * some more .(x)html files are now dropped (toc, cover, etc.)
+ * **PRELIMINARY** support for splitting everything into annual volumes
+ * use `make YY=22` to select 2022
+ * does not yet work for the index!
+
* 2024-03-24
* slow udpipe2 dropped
* added marmot POS and morpho-syntactic annotations
diff --git a/xslt/epub2i5.xsl b/xslt/epub2i5.xsl
index e737f5b..fad5d88 100644
--- a/xslt/epub2i5.xsl
+++ b/xslt/epub2i5.xsl
@@ -162,7 +162,7 @@
<!-- END variables derived from sru request to dnb archive -->
- <xsl:variable name="corpus_sigle" select="'DNB'"/>
+ <xsl:variable name="corpus_sigle" select="concat('DNB', substring($erscheinungsjahr, 3, 2))"/>
<!-- for BOTD: -->
<!-- Dokumentsigle muss zusammen mit Korpussigle (z.B DIV fuer loz-div und loz-div-pub) eindeutig sein -->
diff --git a/xslt/idsCorpus-template.xml b/xslt/idsCorpus-template.xml
index 80608a2..4d042cb 100644
--- a/xslt/idsCorpus-template.xml
+++ b/xslt/idsCorpus-template.xml
@@ -4,7 +4,7 @@
<idsHeader TEIform="teiHeader" pattern="allesaußerZtg/Zschr" status="new" type="corpus" version="1.1">
<fileDesc>
<titleStmt>
- <korpusSigle>DNB</korpusSigle>
+ <korpusSigle>DNB{YY}</korpusSigle>
<c.title>Deutsche Nationalbibliothek: Belletristik</c.title>
</titleStmt>
<publicationStmt>