# Source blob: f63d0f7f0cd1e27cc27cf7767a4685e98941907b
# Base image: Debian bookworm (slim variant), fixed to the linux/amd64 platform.
# NOTE(review): `bookworm-slim` is a moving tag; packages installed further down
# (e.g. openjdk-11-jdk, python2) may not exist in every bookworm snapshot - confirm.
FROM --platform=linux/amd64 debian:bookworm-slim

# Everything is downloaded, built and tried out below /euralex.
WORKDIR /euralex

# Two-sentence German sample text that the tool checks below read as input.
RUN echo "Dies ist ein Test. Also, nur ein Beispiel." > example.txt

# Download and build prerequisites. The apt package lists are left in place
# because later steps run `apt-get install` without their own `apt-get update`.
RUN apt-get update && \
    apt-get install -y \
        git \
        golang \
        perl \
        unzip \
        wget

############
# Check WC #
############
# Print a heading, then count the words of the sample text with `wc -w`.
RUN echo "WC\n" && \
    wc -w ./example.txt

##################
# Install SoMaJo #
##################
# Refresh the package index in the same layer as the install, so this step
# never depends on a package index cached by a much older layer.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN apt-get update && \
    apt-get install -y \
        python3 \
        python3-dev \
        python3-pip && \
    pip3 install --no-cache-dir SoMaJo==2.2.0

# Check: run the SoMaJo tokenizer with sentence splitting on the sample text.
RUN echo "SOMAJO\n" && somajo-tokenizer --split_sentences ./example.txt

###################
# Install OpenNLP #
###################
# JDK 11; the JTok build further down points JAVA_HOME at this installation.
# The package index is refreshed in the same layer as the install.
RUN apt-get update && \
    apt-get install -y openjdk-11-jdk

# Fetch the OpenNLP 1.9.4 binary distribution (docs excluded) plus the German
# UD-GSD sentence and token models into ./opennlp.
# archive.apache.org is used instead of dlcdn.apache.org: the CDN only carries
# current releases, so versioned links there break once a release is superseded.
# The exclude pattern is quoted so that unzip, not the shell, expands it.
RUN wget https://archive.apache.org/dist/opennlp/opennlp-1.9.4/apache-opennlp-1.9.4-bin.zip && \
    unzip apache-opennlp-1.9.4-bin.zip -x 'apache-opennlp-1.9.4/docs/*' && \
    rm apache-opennlp-1.9.4-bin.zip && \
    mv apache-opennlp-1.9.4 opennlp && \
    mkdir ./opennlp/models && \
    wget -P ./opennlp/models \
        https://archive.apache.org/dist/opennlp/models/ud-models-1.0/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin \
        https://archive.apache.org/dist/opennlp/models/ud-models-1.0/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin

# Checks. The sample text is passed via input redirection instead of
# `cat ... |`, so a missing input file fails the build instead of being masked
# by the exit status of the last command in the pipe.
RUN echo "OpenNLP (1)\n" && ./opennlp/bin/opennlp SimpleTokenizer < example.txt

RUN echo "OpenNLP (2)\n" && ./opennlp/bin/opennlp TokenizerME ./opennlp/models/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin < example.txt

RUN echo "OpenNLP (3)\n" && ./opennlp/bin/opennlp SentenceDetector ./opennlp/models/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin < example.txt

######################
# Install TreeTagger #
######################
# Only the tagger scripts archive is downloaded; it is unpacked into
# ./treetagger and the tarball is removed in the same layer.
RUN mkdir ./treetagger && \
    cd treetagger && \
    wget https://cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz && \
    tar -xvzf tagger-scripts.tar.gz && \
    rm tagger-scripts.tar.gz

# Check: run the UTF-8 tokenizer script with the German abbreviation list.
# Input redirection (instead of `cat ... |`) makes a missing input file fail
# the build rather than being hidden behind the pipe's last exit status.
RUN echo "TreeTagger\n" && ./treetagger/cmd/utf8-tokenize.perl -a ./treetagger/lib/german-abbreviations < example.txt

####################
# Install deep-eos #
####################
# Unpack the v0.1 sources into ./deep-eos and download the cnn-de, bi-lstm-de
# and lstm-de model/vocab pairs next to them. The loop aborts the build on the
# first failed download.
# v0.1.zip is intentionally left in /euralex here: the cleanup step near the
# end of this file removes it with a plain `rm`, which would fail if the file
# were already gone.
RUN wget https://github.com/dbmdz/deep-eos/archive/refs/tags/v0.1.zip && \
    unzip v0.1.zip && \
    mv deep-eos-0.1 deep-eos && \
    cd deep-eos && \
    for name in cnn-de bi-lstm-de lstm-de; do \
        for ext in model vocab; do \
            wget "https://github.com/dbmdz/deep-eos/releases/download/v0.1/${name}.${ext}" || exit 1; \
        done; \
    done

# Python dependencies in a single layer; --no-cache-dir keeps pip's download
# cache out of the image.
RUN pip3 install --no-cache-dir --upgrade pip && \
    pip3 install --no-cache-dir --upgrade tensorflow && \
    pip3 install --no-cache-dir keras

# Patch eos.py to import plot_model from tensorflow.keras.utils instead of
# keras.utils.
RUN sed -i 's/from keras.utils import plot_model/from tensorflow.keras.utils import plot_model/' ./deep-eos/eos.py

# Checks: tag the sample text once with each downloaded model.
RUN echo "deep-eos (1)\n" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/cnn-de.model --vocab-filename ./deep-eos/cnn-de.vocab --eos-marker "§" tag

RUN echo "deep-eos (2)\n" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/bi-lstm-de.model --vocab-filename ./deep-eos/bi-lstm-de.vocab --eos-marker "§" tag

RUN echo "deep-eos (3)\n" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/lstm-de.model --vocab-filename ./deep-eos/lstm-de.vocab --eos-marker "§" tag

################
# Install JTok #
################

# Maven is only needed to build JTok. The package index is refreshed in the
# same layer as the install.
RUN apt-get update && \
    apt-get install -y maven

# Build JTok 2.1.19 from the release sources with JDK 11, unpack the resulting
# binary distribution as ./JTok, and drop the source tree. Maven's local
# repository (default location ~/.m2 of the building user, here root) is
# removed in the same layer so the downloaded build dependencies do not stay
# in the image.
RUN wget https://github.com/DFKI-MLT/JTok/archive/refs/tags/v2.1.19.zip && \
    unzip v2.1.19.zip && \
    rm v2.1.19.zip && \
    cd JTok-2.1.19 && \
    JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64 mvn clean package assembly:single && \
    cd .. && \
    unzip ./JTok-2.1.19/target/jtok-core-2.1.19-bin.zip && \
    rm -r JTok-2.1.19 && \
    rm -rf /root/.m2 && \
    mv jtok-core-2.1.19 JTok

# Check: the tokenize script is started from inside its own bin directory.
RUN echo "JTok\n" && \
    cd ./JTok/bin && \
    sh tokenize /euralex/example.txt de

##################
# Install Syntok #
##################
# Pinned release; --no-cache-dir keeps pip's download cache out of the layer.
RUN pip3 install --no-cache-dir syntok==1.4.3

# Checks: run the tokenizer module and the segmenter module on the sample text.
RUN echo "Syntok (1)\n" && python3 -m syntok.tokenizer ./example.txt

RUN echo "Syntok (2)\n" && python3 -m syntok.segmenter ./example.txt

#################
# Install Waste #
#################
# Download and unpack the moot 2.0.20-1 sources and the de-dstar-tiger model
# data into ./Waste. Both tarballs are deleted in the same layer, so they do
# not remain in the image.
RUN mkdir Waste && \
    cd Waste && \
    wget https://cudmuncher.de/~moocow/mirror/projects/moot/moot-2.0.20-1.tar.gz && \
    wget https://kaskade.dwds.de/waste/waste-models/waste-data.de-dstar-tiger.tar.gz && \
    tar -xvzf moot-2.0.20-1.tar.gz && \
    tar -xvzf waste-data.de-dstar-tiger.tar.gz && \
    rm moot-2.0.20-1.tar.gz waste-data.de-dstar-tiger.tar.gz

# Build and install moot, refresh the linker cache, then write the rc file that
# points waste at the abbreviation, stopword, conjunction and model files.
# printf writes one entry per line without relying on the shell's `echo`
# interpreting "\n" escapes.
RUN cd ./Waste/moot-2.0.20-1 && \
    ./configure && \
    make && \
    make install && \
    ldconfig && \
    printf '%s\n' \
        'abbrevs /euralex/Waste/de-dstar-dtiger/abbr.lex' \
        'stopwords /euralex/Waste/de-dstar-dtiger/stop.lex' \
        'conjunctions /euralex/Waste/de-dstar-dtiger/conj.lex' \
        'model /euralex/Waste/de-dstar-dtiger/model.hmm' \
        > /euralex/Waste/waste.rc

# Check. Input redirection (instead of `cat ... |`) makes a missing input file
# fail the build rather than being hidden behind the pipe's last exit status.
RUN echo "Waste\n" && waste -N --rcfile=./Waste/waste.rc < ./example.txt

###################
# Install nnsplit #
###################

# Install the Rust toolchain before copying the sources, so a change to the
# benchmark sources does not invalidate the cached toolchain layer.
RUN apt-get update && apt-get install -y cargo

COPY nnsplit_bench /euralex/nnsplit_bench/

# Build the benchmark binary, move it to ./nnsplit and delete the cargo target
# directory in the SAME layer. Deleting it in a later layer would not shrink
# the image, because the build output would already be stored in this one.
RUN cd ./nnsplit_bench && \
    cargo build --release && \
    cd .. && \
    mkdir ./nnsplit && \
    mv ./nnsplit_bench/target/release/nnsplit_bench ./nnsplit/nnsplit_bench && \
    rm -r ./nnsplit_bench/target

# Check: run the built binary on the sample text.
RUN echo "nnsplit\n" && ./nnsplit/nnsplit_bench example.txt

####################
# Install Elephant #
####################

# Install Python 2 and expose it as /usr/bin/python.
# The package index is refreshed in the same layer as the install, and
# `ln -sf` keeps the step from failing if /usr/bin/python already exists.
# NOTE(review): presumably the Elephant tooling calls a plain `python` - confirm.
RUN apt-get update && \
    apt-get install -y python2 && \
    ln -sf /usr/bin/python2 /usr/bin/python

# Clone the wrapper and, inside its third-party directory, the elephant,
# Wapiti and rnnlm repositories.
RUN git clone https://github.com/erwanm/elephant-wrapper.git && \
    cd elephant-wrapper/third-party && \
    git clone https://github.com/ParallelMeaningBank/elephant.git && \
    git clone https://github.com/Jekub/Wapiti.git && \
    git clone https://github.com/mspandit/rnnlm.git

# Build and install, then move the elephant and wapiti binaries onto PATH
# (/usr/local/bin).
RUN cd elephant-wrapper && \
    make && \
    make install && \
    cd .. && \
    mv ./elephant-wrapper/bin/elephant /usr/local/bin/ && \
    mv ./elephant-wrapper/bin/wapiti /usr/local/bin/

# Check: tokenize the sample text with the UD_German argument.
RUN echo "Elephant-Wrapper" && ./elephant-wrapper/bin/tokenize.sh -i example.txt UD_German

#################
# Install SpaCy #
#################

# click is pinned and installed before spaCy 3.2.3.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip3 install --no-cache-dir click==8.0.4 && \
    pip3 install --no-cache-dir -U spacy==3.2.3

COPY spacy /euralex/spacy/

# Check: run the copied tokenizer script on the sample text.
RUN echo "SpaCy" && python3 ./spacy/spacy_tok.py example.txt

# Sentence splitter
RUN python3 -m spacy download de_core_news_sm && \
    python3 -m spacy download de_dep_news_trf

###########################
# Install Stanford parser #
###########################

# Following https://stanfordnlp.github.io/CoreNLP/index.html

# Download the pinned CoreNLP 4.4.0 distribution rather than the moving
# "latest" archive: the steps below hard-code the stanford-corenlp-4.4.0
# directory and the 4.4.0 German models, which only match while "latest" is
# still 4.4.0.
# Download, unpack and delete the zip in one layer so the archive does not
# remain in the image. The models URL is quoted because it contains `?`,
# which the shell would otherwise treat as a glob character.
RUN wget https://nlp.stanford.edu/software/stanford-corenlp-4.4.0.zip && \
    unzip stanford-corenlp-4.4.0.zip && \
    rm stanford-corenlp-4.4.0.zip && \
    wget -O stanford-corenlp-4.4.0/stanford-corenlp-4.4.0-models-german.jar \
        'https://search.maven.org/remotecontent?filepath=edu/stanford/nlp/stanford-corenlp/4.4.0/stanford-corenlp-4.4.0-models-german.jar'

# Run with threads!
# Check: tokenize the sample text with the German tokenizer settings. The
# CLASSPATH wildcard is passed literally (no shell globbing happens in an
# assignment), so it is Java that expands it to the jars in that directory.
RUN echo "StanfordNLP" && \
    CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP \
    -annotators tokenize \
    -tokenize.language=german \
    -file example.txt

##################
# Install Cutter #
##################

# Pinned release; --no-cache-dir keeps pip's download cache out of the layer.
RUN pip3 install --no-cache-dir cutter-ng==2.5

COPY cutter /euralex/cutter/

# Check: run the copied wrapper script in "nosent" mode on the sample text.
RUN echo "Cutter\n" && python3 ./cutter/cutter.py nosent example.txt

#####################
# Install BlingFire #
#####################

# Pinned release; --no-cache-dir keeps pip's download cache out of the layer.
RUN pip3 install --no-cache-dir -U blingfire==0.1.8

COPY blingfire /euralex/blingfire/

# Check: run the copied tokenizer script on the sample text.
RUN echo "BlingFire\n" && python3 ./blingfire/blingfire_tok.py example.txt

#################
# Install Datok #
#################

# Fetch the v0.1.5 sources, unpack them as ./Datok and build the datok
# command inside that directory.
RUN wget https://github.com/KorAP/Datok/archive/refs/tags/v0.1.5.zip && \
    unzip v0.1.5.zip && \
    rm v0.1.5.zip && \
    mv Datok-0.1.5 Datok && \
    cd Datok && \
    go build ./cmd/datok.go

# Check: tokenize the sample text from stdin (`-`) with the tokenizer file
# shipped in testdata. Input redirection (instead of `cat ... |`) makes a
# missing input file fail the build rather than being hidden behind the pipe's
# last exit status.
RUN echo "DATOK\n" && ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.matok - < example.txt

###########################
# Install KorAP-Tokenizer #
###########################

# Download the standalone 2.2.2 jar directly under its version-independent
# name, so no separate rename step is needed.
RUN mkdir KorAP-Tokenizer && \
    wget -O KorAP-Tokenizer/KorAP-Tokenizer.jar \
        https://github.com/KorAP/KorAP-Tokenizer/releases/download/v2.2.2/KorAP-Tokenizer-2.2.2-standalone.jar

# Check: run the jar on the sample text read from stdin (`-`), with the
# `-l de -s` options. Input redirection (instead of `cat ... |`) makes a
# missing input file fail the build rather than being hidden behind the pipe's
# last exit status.
RUN echo "KorAP-Tokenizer\n" && java -jar KorAP-Tokenizer/KorAP-Tokenizer.jar -l de -s - < example.txt

# Unprivileged user with a home directory and bash as its login shell.
RUN useradd --create-home --shell /bin/bash euralex

# Remove the copied nnsplit benchmark sources and the deep-eos source archive.
RUN rm -r ./nnsplit_bench && \
    rm /euralex/v0.1.zip

# Hand the whole working tree over to the unprivileged user.
RUN chown -R euralex:euralex /euralex

# From here on the image runs as that user, starting in /euralex.
USER euralex

WORKDIR /euralex

# The container runs perl; arguments given to `docker run` are passed to it.
ENTRYPOINT ["perl"]

LABEL maintainer="korap@ids-mannheim.de" \
      description="Tokenizer evaluation for EURALEX"
285LABEL description="Tokenizer evaluation for EURALEX"