Add KorAP-Tokenizer
Change-Id: Ifb7e615dcec0090bbd563b42677b84e5c843c3bf
diff --git a/Dockerfile b/Dockerfile
index 55e8ba3..53d3173 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -43,11 +43,11 @@
mv opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin ./opennlp/models/ && \
mv opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin ./opennlp/models/
-RUN echo "OpenNLP (1)" && cat example.txt | ./opennlp/bin/opennlp SimpleTokenizer
+RUN echo "OpenNLP (1)\n" && cat example.txt | ./opennlp/bin/opennlp SimpleTokenizer
-RUN echo "OpenNLP (2)" && cat example.txt | ./opennlp/bin/opennlp TokenizerME ./opennlp/models/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin
+RUN echo "OpenNLP (2)\n" && cat example.txt | ./opennlp/bin/opennlp TokenizerME ./opennlp/models/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin
-RUN echo "OpenNLP (3)" && cat example.txt | ./opennlp/bin/opennlp SentenceDetector ./opennlp/models/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin
+RUN echo "OpenNLP (3)\n" && cat example.txt | ./opennlp/bin/opennlp SentenceDetector ./opennlp/models/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin
######################
@@ -59,7 +59,7 @@
tar -xvzf tagger-scripts.tar.gz && \
rm tagger-scripts.tar.gz
-RUN echo "TreeTagger" && cat example.txt | ./treetagger/cmd/utf8-tokenize.perl -a ./treetagger/lib/german-abbreviations
+RUN echo "TreeTagger\n" && cat example.txt | ./treetagger/cmd/utf8-tokenize.perl -a ./treetagger/lib/german-abbreviations
####################
@@ -83,11 +83,11 @@
RUN sed -i 's/from keras.utils import plot_model/from tensorflow.keras.utils import plot_model/' ./deep-eos/eos.py
-RUN echo "deep-eos (1)" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/cnn-de.model --vocab-filename ./deep-eos/cnn-de.vocab --eos-marker "§" tag
+RUN echo "deep-eos (1)\n" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/cnn-de.model --vocab-filename ./deep-eos/cnn-de.vocab --eos-marker "§" tag
-RUN echo "deep-eos (2)" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/bi-lstm-de.model --vocab-filename ./deep-eos/bi-lstm-de.vocab --eos-marker "§" tag
+RUN echo "deep-eos (2)\n" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/bi-lstm-de.model --vocab-filename ./deep-eos/bi-lstm-de.vocab --eos-marker "§" tag
-RUN echo "deep-eos (3)" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/lstm-de.model --vocab-filename ./deep-eos/lstm-de.vocab --eos-marker "§" tag
+RUN echo "deep-eos (3)\n" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/lstm-de.model --vocab-filename ./deep-eos/lstm-de.vocab --eos-marker "§" tag
################
@@ -106,14 +106,14 @@
rm -r JTok-2.1.19 && \
mv jtok-core-2.1.19 JTok
-RUN echo "JTok" && \
+RUN echo "JTok\n" && \
cd ./JTok/bin && \
sh tokenize /euralex/example.txt de
-###################
+#################
# Install Datok #
-###################
+#################
RUN wget https://github.com/KorAP/Datok/archive/refs/tags/v0.1.1.zip && \
unzip v0.1.1.zip && \
@@ -124,6 +124,23 @@
RUN echo "DATOK\n" && cat example.txt | ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.matok -
+
+
+###########################
+# Install KorAP-Tokenizer #
+###########################
+
+# Download the v2.2.2 standalone jar and save it under a version-independent name.
+RUN mkdir KorAP-Tokenizer && \
+ cd KorAP-Tokenizer && \
+ wget https://github.com/KorAP/KorAP-Tokenizer/releases/download/v2.2.2/KorAP-Tokenizer-2.2.2-standalone.jar && \
+ mv KorAP-Tokenizer-2.2.2-standalone.jar KorAP-Tokenizer.jar
+
+# Smoke test: feed example.txt to the tokenizer via stdin redirection, so a missing
+# input file fails the build instead of being masked by a pipe's exit status.
+RUN echo "KorAP-Tokenizer\n" && java -jar KorAP-Tokenizer/KorAP-Tokenizer.jar -l de -s - < example.txt
+
+
ENTRYPOINT [ "sh" ]
LABEL maintainer="korap@ids-mannheim.de"