Add CI pipeline
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000..9a3d737
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,126 @@
+
+stages: # GitLab runs the stages in the order listed here
+ - test
+ - build
+ - deploy
+
+variables:
+ VID: $VID # NOTE(review): self-referential default; build-krill sets VID itself - confirm this pass-through is intended
+ DEBIAN_FRONTEND: noninteractive # stop apt/debconf from prompting during package installs
+ APT_CACHE_DIR: apt-cache # handed to apt-get as dir::cache::archives; same name as the cached apt-cache/ path of the jobs
+
+build-and-test-i5: # runs "make test" and publishes target/*.i5.xml and target/*.zip as artifacts
+ stage: test
+ image: rocker/verse
+ artifacts:
+ paths:
+ - "target/*.i5.xml"
+ - "target/*.zip"
+ cache:
+ - key: DNB4KorAP # same cache key as used by build-krill
+ paths:
+ - apt-cache/
+
+ before_script:
+ - source `find .. -name section_helper.sh` # presumably defines start_section/end_section used below; searched from the parent directory
+ - start_section install_linux_packages "Installing missing Linux packages"
+ - apt-get -o dir::cache::archives="$APT_CACHE_DIR" update # NOTE(review): build-krill runs "mkdir -pv $APT_CACHE_DIR" first, this job does not - confirm apt copes with a missing directory on a cold cache
+ - apt-get -o dir::cache::archives="$APT_CACHE_DIR" install -y libxml2-utils
+ - end_section install_linux_packages
+
+ script:
+ - start_section checking_i5 "Building and testing I5 files"
+ - make -j $(nproc) test # one make job per available processing unit
+ - end_section checking_i5
+
+
+build-krill: # builds the Krill tar with "make krill" (see script)
+ image: perl:5.38
+ stage: build
+ rules:
+ - if: $CI_COMMIT_TAG =~ /.+/ # tag pipelines
+ variables:
+ VID: $CI_COMMIT_TAG # version id is the tag name
+ - when: manual # every other pipeline: the job has to be started by hand
+ variables:
+ VID: $CI_COMMIT_BRANCH-$CI_COMMIT_SHORT_SHA # version id is <branch>-<short sha>
+ PERL_LOCAL_LIB_ROOT: ./perl5 # directory cpanm installs into (passed via -l in before_script)
+ PERL5LIB: ./perl5/lib/perl5 # module search path for perl, pointing into the same ./perl5 tree
+ cache:
+ - key: DNB4KorAP # same cache key as used by build-and-test-i5
+ paths:
+ - $PERL_LOCAL_LIB_ROOT
+ - perl5/ # same directory as $PERL_LOCAL_LIB_ROOT (./perl5)
+ - apt-cache/
+ - key: # second cache: generated archives, keyed on target/dnb.i5.xml
+ files:
+ - target/dnb.i5.xml # NOTE(review): cache:key:files hashes the commits that last changed the file; if this file is not tracked in git the key falls back to "default" - confirm
+ paths:
+ - target/dnb.zip
+ - target/dnb.tree_tagger.zip
+ - target/dnb.ud.zip
+ - target/dnb.cmc.zip
+ - target/dnb.spacy.zip
+ - target/dnb.krill.tar
+ before_script:
+ - source `find .. -name section_helper.sh` # presumably defines start_section/end_section used below; searched from the parent directory
+ - start_section install_linux_packages "Installing missing Linux packages"
+ - mkdir -pv $APT_CACHE_DIR # make sure the apt archive directory exists before apt-get is pointed at it
+ - apt-get -o dir::cache::archives="$APT_CACHE_DIR" update
+ - apt-get -o dir::cache::archives="$APT_CACHE_DIR" install -y npm rsync pv jq curl openjdk-17-jre-headless
+ - end_section install_linux_packages
+
+ - start_section install_npm_packages "Installing missing Node packages"
+ - npm install -g 'git+https://gitlab.ids-mannheim.de/KorAP/korap-conllu-cmc.git' # global install straight from the git repository; presumably provides conllu2cmc used by the Makefile - verify
+ - end_section install_npm_packages
+
+ - start_section install_perl_packages "Installing missing Perl packages"
+ - curl -L https://cpanmin.us | perl - App::cpanminus # bootstrap cpanm by piping its installer script into perl
+ - cpanm -n -l $PERL_LOCAL_LIB_ROOT File::ShareDir::Install https://github.com/KorAP/KorAP-XML-TEI.git # -n skips the modules' tests, -l installs below $PERL_LOCAL_LIB_ROOT
+ - cpanm -n -l $PERL_LOCAL_LIB_ROOT https://github.com/KorAP/KorAP-XML-Krill.git
+ - cpanm -n -l $PERL_LOCAL_LIB_ROOT https://github.com/KorAP/KorAP-XML-CoNLL-U.git
+ - end_section install_perl_packages
+
+ script:
+ - export PATH=$PERL_LOCAL_LIB_ROOT/bin:$PATH # make the scripts cpanm installed below $PERL_LOCAL_LIB_ROOT callable
+ - start_section building_krill "Building Krill"
+ - touch -c target/*.i5.xml target/*.zip target/*.krill.tar # refresh mtimes of restored artifacts/cache files in target/ (sources first, products last) so make treats them as up to date; -c never creates a file for an unmatched glob
+ - make -j $(nproc) krill
+ - ls -l target/*.krill.tar # lists the result; exits non-zero when no Krill tar exists
+ - end_section building_krill
+ artifacts:
+ paths:
+ - "target/*.krill.tar"
+ - "target/*.zip"
+
+# deploy (manual): unpack the Krill tar(s) from build-krill, rsync the JSON to $DEPLOY_SERVER, then re-index and restart the dnb instance there
+deploy:
+ stage: deploy
+ dependencies: # only the artifacts of build-krill are fetched
+ - "build-krill"
+ when: manual
+ image: rocker/verse
+ before_script:
+ - source `find .. -name section_helper.sh` # presumably defines start_section/end_section used below; searched from the parent directory
+ - start_section setup_ssh "Setting up SSH"
+ - apt-get update
+ - apt-get install -y rsync openssh-client
+ - mkdir -p ~/.ssh
+ - chmod 700 ~/.ssh
+ - eval $(ssh-agent -s) # start an agent and import its environment into this shell
+ - chmod 400 $SSH_PRIVATE_KEY # used as a path here, so presumably a file-type CI variable - verify
+ - ssh-add $SSH_PRIVATE_KEY
+ - end_section setup_ssh
+ script:
+ - start_section korapxmlu "Uploading Krill to KorAP instance dnb"
+ - if [ $(ls target/*.krill.tar | wc -l) -lt 1 ]; then echo 'error - less than 1 Krill files found'; false; fi # abort when no Krill tar is present
+ - rm -rf json && mkdir -p json # always start from an empty json/ directory
+ - for f in target/*.krill.tar; do tar -C json -xf $f; done # unpack every Krill tar into json/
+ - rsync -e "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null" -avz --delete json korap@$DEPLOY_SERVER:/opt/korap/instance-dnb/ # NOTE(review): host key checking is disabled for rsync and ssh - consider pinning the server key via known_hosts
+ - end_section korapxmlu
+ - start_section korapxmlr "Indexing data & restarting KorAP instance dnb"
+ - ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null korap@$DEPLOY_SERVER "cd /opt/korap/instance-dnb/ &&
+ rm -rf index && mkdir -p index &&
+ docker run -u root --rm -v /opt/korap/instance-dnb:/data:z korap/kustvakt:latest-full Krill-Indexer.jar -c /kustvakt/kustvakt.conf -i /data/json -o /data/index/ && INDEX=./index docker-compose --profile=full -p kyc-ger restart"
+ - end_section korapxmlr
+ - echo "Deploying $VID"
diff --git a/Makefile b/Makefile
index 7ec798d..e43f4a0 100644
--- a/Makefile
+++ b/Makefile
@@ -2,13 +2,14 @@
BUILD_DIR = build
TARGET_DIR ?= target
+.PHONY: all clean test krill # krill is an alias for the Krill tar, not a file of that name
-
-.PHONY: all clean test
-
+.PRECIOUS: $(TARGET_DIR)/%.zip $(TARGET_DIR)/%.i5.xml $(TARGET_DIR)/%.tar # keep these files even when make built them only as intermediates of a pattern-rule chain
all: $(TARGET_DIR)/dnb.i5.xml
+krill: $(TARGET_DIR)/dnb.krill.tar # entry point used by the build-krill CI job ("make krill")
+
$(TARGET_DIR)/dnb.i5.xml: $(patsubst $(SRC_DIR)/%.epub,$(TARGET_DIR)/%.i5.xml,$(wildcard $(SRC_DIR)/*.epub))
head -n -1 xslt/idsCorpus-template.xml > $@
cat $^ >> $@
@@ -42,8 +43,11 @@
%.cmc.zip: %.zip
korapxml2conllu $< | pv | conllu2cmc -s | conllu2korapxml > $@
-%.krill.tar: %.zip %.ud.zip %.cmc.zip
- korapxml2krill archive --quiet -w -z -cfg krill-kokokom.cfg --non-word-tokens --meta I5 -i $< -i $(word 2,$^) -i $(word 3,$^) -o $(basename $@)
+%.krill.tar: %.zip # for now built from the base KorAP-XML zip only, without annotation layers
+ mkdir -p $(basename $@)
+ korapxml2krill archive --quiet -w -z -cfg krill-korap4dnb.cfg --non-word-tokens --meta I5 -i $< -o $(basename $@)
+# TODO: annotations are not indexed yet; to add them, restore the %.ud.zip and %.cmc.zip prerequisites and switch to the command below
+# korapxml2krill archive --quiet -w -z -cfg krill-korap4dnb.cfg --non-word-tokens --meta I5 -i $< -i $(word 2,$^) -i $(word 3,$^) -o $(basename $@)
json: *.krill.tar
rm -rf json
diff --git a/Readme.md b/Readme.md
index 20fdd7d..e40795a 100644
--- a/Readme.md
+++ b/Readme.md
@@ -21,9 +21,12 @@
```bash
make target/dnb.spacy.zip target/dnb.tree_tagger.zip
```
+
## News
-* 2024-03-16: first working pipeline for EPub ⮕ TEI I5 ⮕ KorAP-XML ⮕ (UDPipe+TreeTagger+Spacy) ⮕ Krill ⮕ KorAP-JSON
+* 2024-03-16
+ * CI/CD pipeline added
+ * first working pipeline for EPub ⮕ TEI I5 ⮕ KorAP-XML ⮕ (UDPipe+TreeTagger+Spacy) ⮕ Krill ⮕ KorAP-JSON
* 2024-03-15: DNB test data added
diff --git a/krill-korap4dnb.cfg b/krill-korap4dnb.cfg
new file mode 100644
index 0000000..929a207
--- /dev/null
+++ b/krill-korap4dnb.cfg
@@ -0,0 +1,15 @@
+overwrite 0
+input-base .
+token Base#tokens
+base-sentences DeReKo#Structure
+base-paragraphs DeReKo#Structure
+base-pagebreaks DeReKo#Structure
+temporary-extract /tmp
+sequential-extraction 1
+to-tar 1
+jobs -1
+meta I5
+gzip 1
+log ERROR
+koral 0.4
+output .