Init Dockerfile
Change-Id: Ifdd6ffc9ce8acb1eb0463e748817def719e47260
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..54163b0
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+/sandbox
+/Sandbox
+\#*
+*~
+.*
+!.gitignore
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..65b81d1
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,107 @@
+FROM --platform=linux/amd64 debian:bookworm-slim
+
+WORKDIR /euralex
+
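+# Create a small German sample text that all tokenizers below are run on.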
+RUN echo "Dies ist ein Test. Also, nur ein Beispiel." > example.txt
+
+RUN apt-get update && \
+ apt-get install -y git \
+ perl
+
+
+############
+# Check WC #
+############
+RUN echo "WC\n" && wc -w ./example.txt
+
+
+##################
+# Install SoMaJo #
+##################
+RUN apt-get install -y \
+ python3-dev \
+ python3 \
+ python3-pip
+
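+# SoMaJo is installed from PyPI and provides the somajo-tokenizer CLI.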
+# --break-system-packages is required for system-wide pip installs on Debian bookworm
+RUN pip3 install --break-system-packages SoMaJo
+
+RUN echo "SOMAJO\n" && somajo-tokenizer --split_sentences ./example.txt
+
+###################
+# Install Datok #
+###################
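+# Datok is built from the tagged source release with the Go toolchain;
+# the tokenizer model used below comes from its testdata directory.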
+RUN apt-get install -y golang wget unzip && \
+ wget https://github.com/KorAP/Datok/archive/refs/tags/v0.1.1.zip && \
+ unzip v0.1.1.zip && \
+ rm v0.1.1.zip && \
+ mv Datok-0.1.1 Datok && \
+ cd Datok && \
+ go build ./cmd/datok.go
+
+RUN echo "DATOK\n" && cat example.txt | ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.matok -
+
+
+###################
+# Install OpenNLP #
+###################
+# OpenNLP needs a Java runtime; Debian bookworm ships OpenJDK 17
+RUN apt-get install -y openjdk-17-jre
+
+RUN wget https://dlcdn.apache.org/opennlp/opennlp-1.9.4/apache-opennlp-1.9.4-bin.zip && \
+ unzip apache-opennlp-1.9.4-bin.zip -x apache-opennlp-1.9.4/docs/* && \
+ rm apache-opennlp-1.9.4-bin.zip && \
+ mv apache-opennlp-1.9.4 opennlp && \
+ mkdir ./opennlp/models && \
+ wget https://dlcdn.apache.org/opennlp/models/ud-models-1.0/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin && \
+ wget https://dlcdn.apache.org/opennlp/models/ud-models-1.0/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin && \
+ mv opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin ./opennlp/models/ && \
+ mv opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin ./opennlp/models/
+
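+# Three OpenNLP runs: the rule-based SimpleTokenizer, the model-based
+# TokenizerME, and the SentenceDetector, the latter two using the
+# German UD-GSD models downloaded above.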
+RUN echo "OpenNLP (1)" && cat example.txt | ./opennlp/bin/opennlp SimpleTokenizer
+
+RUN echo "OpenNLP (2)" && cat example.txt | ./opennlp/bin/opennlp TokenizerME ./opennlp/models/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin
+
+RUN echo "OpenNLP (3)" && cat example.txt | ./opennlp/bin/opennlp SentenceDetector ./opennlp/models/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin
+
+
+######################
+# Install TreeTagger #
+######################
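+# Only the TreeTagger scripts archive (tokenizer script and abbreviation
+# lists) is needed here, not the tagger binaries; see the license note
+# in Readme.md.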
+RUN mkdir ./treetagger && \
+ cd treetagger && \
+ wget https://cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz && \
+ tar -xvzf tagger-scripts.tar.gz && \
+ rm tagger-scripts.tar.gz
+
+RUN echo "TreeTagger" && cat example.txt | ./treetagger/cmd/utf8-tokenize.perl -a ./treetagger/lib/german-abbreviations
+
+
+####################
+# Install deep-eos #
+####################
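+# Fetch the deep-eos v0.1 sources and the pre-trained German CNN,
+# bi-LSTM and LSTM end-of-sentence models from the GitHub release.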
+RUN wget https://github.com/dbmdz/deep-eos/archive/refs/tags/v0.1.zip && \
+ unzip v0.1.zip && \
+ mv deep-eos-0.1 deep-eos && \
+ cd deep-eos && \
+ wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/cnn-de.model && \
+ wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/cnn-de.vocab && \
+ wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/bi-lstm-de.model && \
+ wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/bi-lstm-de.vocab && \
+ wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/lstm-de.model && \
+ wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/lstm-de.vocab
+
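+# deep-eos needs TensorFlow/Keras; the sed call below rewrites its old
+# standalone Keras import to the tensorflow.keras namespace so that
+# eos.py works with a current TensorFlow installation.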
+RUN pip3 install --break-system-packages --upgrade pip && \
+    pip3 install --break-system-packages --upgrade tensorflow && \
+    pip3 install --break-system-packages keras && \
+    sed -i 's/from keras.utils import plot_model/from tensorflow.keras.utils import plot_model/' ./deep-eos/eos.py
+
+RUN echo "deep-eos (1)" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/cnn-de.model --vocab-filename ./deep-eos/cnn-de.vocab --eos-marker "§" tag
+
+RUN echo "deep-eos (2)" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/bi-lstm-de.model --vocab-filename ./deep-eos/bi-lstm-de.vocab --eos-marker "§" tag
+
+RUN echo "deep-eos (3)" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/lstm-de.model --vocab-filename ./deep-eos/lstm-de.vocab --eos-marker "§" tag
+
+
+ENTRYPOINT [ "sh" ]
+
+LABEL maintainer="korap@ids-mannheim.de"
+LABEL description="Tokenizer evaluation for EURALEX"
\ No newline at end of file
diff --git a/Readme.md b/Readme.md
new file mode 100644
index 0000000..1c3d2ae
--- /dev/null
+++ b/Readme.md
@@ -0,0 +1,35 @@
+# Running the evaluation suite
+
+To build the Docker image, run
+
+```shell
+$ docker build -f Dockerfile -t korap/euralex .
+```
+This will create a Docker image of approximately 4 GB.
+
+The build downloads and installs the following
+tokenizers into the image:
+
+...
+
+To run the evaluation suite ...
+
+...
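+
+Since the entrypoint of the image is `sh`, one way to get an
+interactive shell inside the container (using the `korap/euralex`
+tag from the build step above) is:
+
+```shell
+$ docker run --rm -it korap/euralex
+```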
+
+# Licenses
+
+For TreeTagger:
+Please read the [license terms](https://cis.uni-muenchen.de/~schmid/tools/TreeTagger/Tagger-Licence)
+before you download the software!
+By downloading the software, you agree to the terms stated there.
+
+
+# Caveat
+
+When running this benchmark with Docker, you may need to run
+the container in privileged mode to get
+[meaningful results](https://pythonspeed.com/articles/docker-performance-overhead/).
+
+```shell
+docker run --privileged -v
+```