# blob: 55e8ba3b2a0499131d1805e5aa9dd1805ddab2de
# Base image: Debian bookworm (slim variant), fixed to linux/amd64.
FROM --platform=linux/amd64 debian:bookworm-slim

# Everything below is installed into, and run from, /euralex.
WORKDIR /euralex

# Short German sample text that the checks further down read as input.
RUN echo "Dies ist ein Test. Also, nur ein Beispiel." > example.txt

# Common tooling: download/unpack helpers plus the Perl and Go toolchains.
RUN apt-get update \
 && apt-get install -y \
      git \
      golang \
      perl \
      unzip \
      wget

############
# Check WC #
############
# Print a heading, then the word count of the sample text.
RUN echo "WC\n" \
 && wc -w ./example.txt

##################
# Install SoMaJo #
##################
# The package index is refreshed in the same layer as the install: an index
# cached in an earlier layer can reference package versions that have since
# disappeared from the mirror.
# --break-system-packages: Debian bookworm marks its system Python as
# externally managed (PEP 668), so pip refuses a system-wide install
# without this flag.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN apt-get update && \
    apt-get install -y \
      python3 \
      python3-dev \
      python3-pip && \
    pip3 install --no-cache-dir --break-system-packages SoMaJo

# Check: run the SoMaJo tokenizer with sentence splitting on the sample text.
RUN echo "SOMAJO\n" && somajo-tokenizer --split_sentences ./example.txt

###################
# Install OpenNLP #
###################
# The package index is refreshed in the same layer as the install so that a
# cached, outdated index from an earlier layer cannot break the download.
# NOTE(review): confirm that openjdk-11-jdk is still offered by the base
# image's release; the JTok build sets JAVA_HOME to the Java 11 path.
RUN apt-get update && \
    apt-get install -y openjdk-11-jdk

# Fetch the OpenNLP 1.9.4 binary distribution (unpacked without its docs)
# and the German UD-GSD sentence and tokenizer models.
# The pinned distribution is taken from archive.apache.org, which keeps every
# release permanently; the CDN only carries current releases.
# NOTE(review): the model URLs are unchanged; move them to the archive as
# well if they are ever rotated off the CDN.
# The -x pattern is quoted so that unzip, not the shell, expands the wildcard.
RUN wget https://archive.apache.org/dist/opennlp/opennlp-1.9.4/apache-opennlp-1.9.4-bin.zip && \
    unzip apache-opennlp-1.9.4-bin.zip -x 'apache-opennlp-1.9.4/docs/*' && \
    rm apache-opennlp-1.9.4-bin.zip && \
    mv apache-opennlp-1.9.4 opennlp && \
    mkdir ./opennlp/models && \
    wget -P ./opennlp/models \
      https://dlcdn.apache.org/opennlp/models/ud-models-1.0/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin \
      https://dlcdn.apache.org/opennlp/models/ud-models-1.0/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin

# Checks. The sample is passed via stdin redirection rather than `cat |`, so
# a missing input file fails the step instead of being hidden by the pipe.
RUN echo "OpenNLP (1)" && ./opennlp/bin/opennlp SimpleTokenizer < example.txt

RUN echo "OpenNLP (2)" && ./opennlp/bin/opennlp TokenizerME ./opennlp/models/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin < example.txt

RUN echo "OpenNLP (3)" && ./opennlp/bin/opennlp SentenceDetector ./opennlp/models/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin < example.txt

######################
# Install TreeTagger #
######################
# Download the tagger-scripts archive into ./treetagger, unpack it there and
# delete the tarball within the same layer.
RUN mkdir ./treetagger && \
    wget -P ./treetagger https://cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz && \
    tar -xvzf ./treetagger/tagger-scripts.tar.gz -C ./treetagger && \
    rm ./treetagger/tagger-scripts.tar.gz

# Check: pipe the sample text through the UTF-8 tokenizer script, passing the
# German abbreviations file via -a.
RUN echo "TreeTagger" \
 && cat example.txt | ./treetagger/cmd/utf8-tokenize.perl -a ./treetagger/lib/german-abbreviations

####################
# Install deep-eos #
####################
# Fetch the deep-eos v0.1 sources and the German CNN, Bi-LSTM and LSTM
# model/vocabulary files published with that release.
# The source archive is deleted in the same layer (as the other sections do
# with their archives) so it does not stay in the image.
RUN wget https://github.com/dbmdz/deep-eos/archive/refs/tags/v0.1.zip && \
    unzip v0.1.zip && \
    rm v0.1.zip && \
    mv deep-eos-0.1 deep-eos && \
    wget -P ./deep-eos \
      https://github.com/dbmdz/deep-eos/releases/download/v0.1/cnn-de.model \
      https://github.com/dbmdz/deep-eos/releases/download/v0.1/cnn-de.vocab \
      https://github.com/dbmdz/deep-eos/releases/download/v0.1/bi-lstm-de.model \
      https://github.com/dbmdz/deep-eos/releases/download/v0.1/bi-lstm-de.vocab \
      https://github.com/dbmdz/deep-eos/releases/download/v0.1/lstm-de.model \
      https://github.com/dbmdz/deep-eos/releases/download/v0.1/lstm-de.vocab

# --break-system-packages: Debian bookworm marks its system Python as
# externally managed (PEP 668), so pip refuses a system-wide install
# without this flag.
# --no-cache-dir keeps pip's download cache out of the image layers.
RUN pip3 install --no-cache-dir --break-system-packages --upgrade pip && \
    pip3 install --no-cache-dir --break-system-packages --upgrade tensorflow

RUN pip3 install --no-cache-dir --break-system-packages keras

# Patch eos.py to import plot_model from tensorflow.keras.utils instead of
# keras.utils.
RUN sed -i 's/from keras.utils import plot_model/from tensorflow.keras.utils import plot_model/' ./deep-eos/eos.py

# Checks: tag the sample text with each of the three models, using "§" as
# the end-of-sentence marker.
RUN echo "deep-eos (1)" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/cnn-de.model --vocab-filename ./deep-eos/cnn-de.vocab --eos-marker "§" tag

RUN echo "deep-eos (2)" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/bi-lstm-de.model --vocab-filename ./deep-eos/bi-lstm-de.vocab --eos-marker "§" tag

RUN echo "deep-eos (3)" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/lstm-de.model --vocab-filename ./deep-eos/lstm-de.vocab --eos-marker "§" tag

################
# Install JTok #
################

# The package index is refreshed in the same layer as the install so that a
# cached, outdated index from an earlier layer cannot break the download.
RUN apt-get update && \
    apt-get install -y maven

# Build JTok 2.1.19 from its source archive with Maven, unpack the resulting
# binary distribution as ./JTok and drop the source tree.
# The build runs in a subshell so the working directory is unchanged afterwards.
# Maven's local repository (/root/.m2, as the build runs as root) is only
# needed during the build and is removed in the same layer to keep the
# downloaded dependencies out of the image.
RUN wget https://github.com/DFKI-MLT/JTok/archive/refs/tags/v2.1.19.zip && \
    unzip v2.1.19.zip && \
    rm v2.1.19.zip && \
    ( cd JTok-2.1.19 && \
      JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64 mvn clean package assembly:single ) && \
    unzip ./JTok-2.1.19/target/jtok-core-2.1.19-bin.zip && \
    rm -r JTok-2.1.19 && \
    rm -rf /root/.m2 && \
    mv jtok-core-2.1.19 JTok

# Check: run the bundled tokenize script from its own directory on the
# sample text, with "de" as the language argument.
RUN echo "JTok" && \
    cd ./JTok/bin && \
    sh tokenize /euralex/example.txt de

###################
# Install Datok #
###################

# Fetch the Datok v0.1.1 sources and build the datok binary inside ./Datok.
# The Go build cache and module download cache are only needed while
# compiling, so they are cleared in the same layer to keep them out of the
# image.
RUN wget https://github.com/KorAP/Datok/archive/refs/tags/v0.1.1.zip && \
    unzip v0.1.1.zip && \
    rm v0.1.1.zip && \
    mv Datok-0.1.1 Datok && \
    cd Datok && \
    go build ./cmd/datok.go && \
    go clean -cache -modcache

# Check: tokenize the sample text with the tokenizer file from Datok's
# testdata. Input comes via stdin redirection rather than `cat |`, so a
# missing input file fails the step instead of being hidden by the pipe.
RUN echo "DATOK\n" && ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.matok - < example.txt

# Containers start a plain shell (exec form, no wrapping shell process).
ENTRYPOINT ["sh"]

# Image metadata.
LABEL maintainer="korap@ids-mannheim.de" \
      description="Tokenizer evaluation for EURALEX"