Init Dockerfile
Change-Id: Ifdd6ffc9ce8acb1eb0463e748817def719e47260
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..54163b0
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+/sandbox
+/Sandbox
+\#*
+*~
+.*
+!.gitignore
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..65b81d1
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,107 @@
+FROM --platform=linux/amd64 debian:bookworm-slim
+
+WORKDIR /euralex
+
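+# Create a small German sample text that all tokenizers below are run on.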
+RUN echo "Dies ist ein Test. Also, nur ein Beispiel." > example.txt
+
+RUN apt-get update && \
+ apt-get install -y git \
+ perl
+
+
+############
+# Check WC #
+############
+RUN echo "WC\n" && wc -w ./example.txt
+
+
+##################
+# Install SoMaJo #
+##################
+RUN apt-get install -y \
+ python3-dev \
+ python3 \
+ python3-pip
+
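+# SoMaJo is installed from PyPI and provides the somajo-tokenizer CLI.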
+# --break-system-packages is required for system-wide pip installs on Debian bookworm
+RUN pip3 install --break-system-packages SoMaJo
+
+RUN echo "SOMAJO\n" && somajo-tokenizer --split_sentences ./example.txt
+
+###################
+# Install Datok #
+###################
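+# Datok is built from the tagged source release with the Go toolchain;
+# the tokenizer model used below comes from its testdata directory.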
+RUN apt-get install -y golang wget unzip && \
+ wget https://github.com/KorAP/Datok/archive/refs/tags/v0.1.1.zip && \
+ unzip v0.1.1.zip && \
+ rm v0.1.1.zip && \
+ mv Datok-0.1.1 Datok && \
+ cd Datok && \
+ go build ./cmd/datok.go
+
+RUN echo "DATOK\n" && cat example.txt | ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.matok -
+
+
+###################
+# Install OpenNLP #
+###################
+# OpenNLP needs a Java runtime; Debian bookworm ships OpenJDK 17
+RUN apt-get install -y openjdk-17-jre
+
+RUN wget https://dlcdn.apache.org/opennlp/opennlp-1.9.4/apache-opennlp-1.9.4-bin.zip && \
+ unzip apache-opennlp-1.9.4-bin.zip -x apache-opennlp-1.9.4/docs/* && \
+ rm apache-opennlp-1.9.4-bin.zip && \
+ mv apache-opennlp-1.9.4 opennlp && \
+ mkdir ./opennlp/models && \
+ wget https://dlcdn.apache.org/opennlp/models/ud-models-1.0/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin && \
+ wget https://dlcdn.apache.org/opennlp/models/ud-models-1.0/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin && \
+ mv opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin ./opennlp/models/ && \
+ mv opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin ./opennlp/models/
+
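+# Three OpenNLP runs: the rule-based SimpleTokenizer, the model-based
+# TokenizerME, and the SentenceDetector, the latter two using the
+# German UD-GSD models downloaded above.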
+RUN echo "OpenNLP (1)" && cat example.txt | ./opennlp/bin/opennlp SimpleTokenizer
+
+RUN echo "OpenNLP (2)" && cat example.txt | ./opennlp/bin/opennlp TokenizerME ./opennlp/models/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin
+
+RUN echo "OpenNLP (3)" && cat example.txt | ./opennlp/bin/opennlp SentenceDetector ./opennlp/models/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin
+
+
+######################
+# Install TreeTagger #
+######################
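+# Only the TreeTagger scripts archive (tokenizer script and abbreviation
+# lists) is needed here, not the tagger binaries; see the license note
+# in Readme.md.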
+RUN mkdir ./treetagger && \
+ cd treetagger && \
+ wget https://cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz && \
+ tar -xvzf tagger-scripts.tar.gz && \
+ rm tagger-scripts.tar.gz
+
+RUN echo "TreeTagger" && cat example.txt | ./treetagger/cmd/utf8-tokenize.perl -a ./treetagger/lib/german-abbreviations
+
+
+####################
+# Install deep-eos #
+####################
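+# Fetch the deep-eos v0.1 sources and the pre-trained German CNN,
+# bi-LSTM and LSTM end-of-sentence models from the GitHub release.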
+RUN wget https://github.com/dbmdz/deep-eos/archive/refs/tags/v0.1.zip && \
+ unzip v0.1.zip && \
+ mv deep-eos-0.1 deep-eos && \
+ cd deep-eos && \
+ wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/cnn-de.model && \
+ wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/cnn-de.vocab && \
+ wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/bi-lstm-de.model && \
+ wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/bi-lstm-de.vocab && \
+ wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/lstm-de.model && \
+ wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/lstm-de.vocab
+
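+# deep-eos needs TensorFlow/Keras; the sed call below rewrites its old
+# standalone Keras import to the tensorflow.keras namespace so that
+# eos.py works with a current TensorFlow installation.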
+RUN pip3 install --break-system-packages --upgrade pip && \
+    pip3 install --break-system-packages --upgrade tensorflow && \
+    pip3 install --break-system-packages keras && \
+    sed -i 's/from keras.utils import plot_model/from tensorflow.keras.utils import plot_model/' ./deep-eos/eos.py
+
+RUN echo "deep-eos (1)" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/cnn-de.model --vocab-filename ./deep-eos/cnn-de.vocab --eos-marker "§" tag
+
+RUN echo "deep-eos (2)" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/bi-lstm-de.model --vocab-filename ./deep-eos/bi-lstm-de.vocab --eos-marker "§" tag
+
+RUN echo "deep-eos (3)" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/lstm-de.model --vocab-filename ./deep-eos/lstm-de.vocab --eos-marker "§" tag
+
+
+ENTRYPOINT [ "sh" ]
+
+LABEL maintainer="korap@ids-mannheim.de"
+LABEL description="Tokenizer evaluation for EURALEX"
\ No newline at end of file
diff --git a/Readme.md b/Readme.md
new file mode 100644
index 0000000..1c3d2ae
--- /dev/null
+++ b/Readme.md
@@ -0,0 +1,35 @@
+# Running the evaluation suite
+
+To build the Docker image, run
+
+```shell
+$ docker build -f Dockerfile -t korap/euralex .
+```
+This will create a Docker image of approximately 4 GB.
+
+The build downloads and installs the following
+tokenizers into the image:
+
+...
+
+To run the evaluation suite ...
+
+...
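+
+Since the entrypoint of the image is `sh`, one way to get an
+interactive shell inside the container (using the `korap/euralex`
+tag from the build step above) is:
+
+```shell
+$ docker run --rm -it korap/euralex
+```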
+
+# Licenses
+
+For TreeTagger:
+Please read the [license terms](https://cis.uni-muenchen.de/~schmid/tools/TreeTagger/Tagger-Licence)
+before you download the software!
+By downloading the software, you agree to the terms stated there.
+
+
+# Caveat
+
+When running this benchmark with Docker, you may need to run
+the container in privileged mode to get
+[meaningful results](https://pythonspeed.com/articles/docker-performance-overhead/).
+
+```shell
+docker run --privileged -v
+```