# Base image for the tokenizer evaluation. The platform is pinned to amd64;
# note that the JTok build step further down hard-codes an amd64 JDK path.
FROM --platform=linux/amd64 debian:bookworm-slim

# All tools are installed below /euralex.
WORKDIR /euralex

# Small German sample text that every installed tool is smoke-tested against.
RUN echo "Dies ist ein Test. Also, nur ein Beispiel." > example.txt

# Shared tooling used by the install steps below (download, unpack, build).
RUN apt-get update \
 && apt-get install -y \
      git \
      golang \
      perl \
      unzip \
      wget

############
# Check WC #
############
# Baseline: plain word count of the sample text.
RUN echo "WC\n" \
 && wc -w ./example.txt
 | 18 |  | 
 | 19 |  | 
 | 20 | ################## | 
 | 21 | # Install SoMaJo # | 
 | 22 | ################## | 
# Install Python 3 with pip and a pinned SoMaJo release.
# The package index is refreshed in the same layer as the install: relying on
# the index from an earlier (possibly cached and stale) layer can make
# apt-get fetch package versions that no longer exist on the mirror.
RUN apt-get update && \
    apt-get install -y python3-dev \
    python3 \
    python3-pip && \
    pip3 install SoMaJo==2.2.0

# Smoke test: tokenize and sentence-split the sample text.
RUN echo "SOMAJO\n" && somajo-tokenizer --split_sentences ./example.txt
 | 29 |  | 
###################
# Install OpenNLP #
###################
# Refresh the package index in the same layer as the install so a stale
# cached index cannot break the package download.
RUN apt-get update && \
    apt-get install -y openjdk-11-jdk

# Download the pinned OpenNLP 1.9.4 distribution (without its docs) and the
# German UD sentence/token models.
# The files are fetched from archive.apache.org, which keeps superseded
# releases; the dlcdn.apache.org mirror only carries current releases, so
# pinned older versions disappear from it over time.
# The exclude pattern is quoted so the shell never expands it itself.
RUN wget https://archive.apache.org/dist/opennlp/opennlp-1.9.4/apache-opennlp-1.9.4-bin.zip && \
    unzip apache-opennlp-1.9.4-bin.zip -x 'apache-opennlp-1.9.4/docs/*' && \
    rm apache-opennlp-1.9.4-bin.zip && \
    mv apache-opennlp-1.9.4 opennlp && \
    mkdir ./opennlp/models && \
    wget -P ./opennlp/models https://archive.apache.org/dist/opennlp/models/ud-models-1.0/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin && \
    wget -P ./opennlp/models https://archive.apache.org/dist/opennlp/models/ud-models-1.0/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin

# Smoke test 1: rule-based tokenizer.
RUN echo "OpenNLP (1)\n" && cat example.txt | ./opennlp/bin/opennlp SimpleTokenizer

# Smoke test 2: model-based tokenizer using the German UD token model.
RUN echo "OpenNLP (2)\n" && cat example.txt | ./opennlp/bin/opennlp TokenizerME ./opennlp/models/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin

# Smoke test 3: sentence detector using the German UD sentence model.
RUN echo "OpenNLP (3)\n" && cat example.txt | ./opennlp/bin/opennlp SentenceDetector ./opennlp/models/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin
######################
# Install TreeTagger #
######################
# Only the tagger scripts archive is fetched; it is unpacked into
# ./treetagger and the archive is removed afterwards.
RUN mkdir ./treetagger \
 && wget -P ./treetagger https://cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz \
 && tar -xvzf ./treetagger/tagger-scripts.tar.gz -C ./treetagger \
 && rm ./treetagger/tagger-scripts.tar.gz

# Smoke test: run the UTF-8 tokenizer script with the German abbreviation list.
RUN echo "TreeTagger\n" \
 && cat example.txt | ./treetagger/cmd/utf8-tokenize.perl -a ./treetagger/lib/german-abbreviations
####################
# Install deep-eos #
####################
# Fetch the v0.1 sources and the three German model/vocab pairs
# (cnn, bi-lstm, lstm) from the matching GitHub release.
# v0.1.zip is intentionally left in /euralex here: the cleanup step near the
# end of this file removes it and would fail if it were already gone.
RUN wget https://github.com/dbmdz/deep-eos/archive/refs/tags/v0.1.zip \
 && unzip v0.1.zip \
 && mv deep-eos-0.1 deep-eos \
 && cd deep-eos \
 && for net in cnn-de bi-lstm-de lstm-de; do \
      for kind in model vocab; do \
        wget "https://github.com/dbmdz/deep-eos/releases/download/v0.1/${net}.${kind}" || exit 1; \
      done; \
    done

# Python dependencies of deep-eos (unpinned, as before).
RUN pip3 install --upgrade pip \
 && pip3 install --upgrade tensorflow \
 && pip3 install keras

# Patch eos.py to import plot_model from tensorflow.keras instead of keras.
RUN sed -i 's/from keras.utils import plot_model/from tensorflow.keras.utils import plot_model/' ./deep-eos/eos.py

# Smoke test 1: CNN model.
RUN echo "deep-eos (1)\n" \
 && python3 ./deep-eos/main.py \
      --input-file example.txt \
      --model-filename ./deep-eos/cnn-de.model \
      --vocab-filename ./deep-eos/cnn-de.vocab \
      --eos-marker "§" \
      tag

# Smoke test 2: Bi-LSTM model.
RUN echo "deep-eos (2)\n" \
 && python3 ./deep-eos/main.py \
      --input-file example.txt \
      --model-filename ./deep-eos/bi-lstm-de.model \
      --vocab-filename ./deep-eos/bi-lstm-de.vocab \
      --eos-marker "§" \
      tag

# Smoke test 3: LSTM model.
RUN echo "deep-eos (3)\n" \
 && python3 ./deep-eos/main.py \
      --input-file example.txt \
      --model-filename ./deep-eos/lstm-de.model \
      --vocab-filename ./deep-eos/lstm-de.vocab \
      --eos-marker "§" \
      tag
| Akron | 5af0abd | 2022-02-27 15:18:32 +0100 | [diff] [blame] | 91 |  | 
 | 92 |  | 
################
# Install JTok #
################

# Refresh the package index in the same layer as the install so a stale
# cached index cannot break the Maven download.
RUN apt-get update && \
    apt-get install -y maven

# Build JTok 2.1.19 from source with Maven, unpack the resulting binary
# distribution as ./JTok and drop the source tree afterwards.
# JAVA_HOME points at the amd64 OpenJDK 11 installation.
RUN wget https://github.com/DFKI-MLT/JTok/archive/refs/tags/v2.1.19.zip && \
    unzip v2.1.19.zip && \
    rm v2.1.19.zip && \
    cd JTok-2.1.19 && \
    JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64 mvn clean package assembly:single && \
    cd .. && \
    unzip ./JTok-2.1.19/target/jtok-core-2.1.19-bin.zip && \
    rm -r JTok-2.1.19 && \
    mv jtok-core-2.1.19 JTok

# Smoke test: the tokenize script is started from its own bin directory,
# hence the absolute path to the sample text.
RUN echo "JTok\n" && \
    cd ./JTok/bin && \
    sh tokenize /euralex/example.txt de
 | 112 |  | 
 | 113 |  | 
##################
# Install Syntok #
##################
# Pinned Syntok release from PyPI.
RUN pip3 install syntok==1.4.3

# Smoke test 1: tokenizer module.
RUN echo "Syntok (1)\n" \
 && python3 -m syntok.tokenizer ./example.txt

# Smoke test 2: sentence segmenter module.
RUN echo "Syntok (2)\n" \
 && python3 -m syntok.segmenter ./example.txt
 | 122 |  | 
 | 123 |  | 
#################
# Install Waste #
#################
# Download and unpack the moot sources and the German Waste model data.
# Both tarballs are removed in the same layer so they do not stay in the
# image; nothing else in this file reads them.
RUN mkdir Waste && \
    cd Waste && \
    wget https://cudmuncher.de/~moocow/mirror/projects/moot/moot-2.0.20-1.tar.gz && \
    wget https://kaskade.dwds.de/waste/waste-models/waste-data.de-dstar-tiger.tar.gz && \
    tar -xvzf moot-2.0.20-1.tar.gz && \
    tar -xvzf waste-data.de-dstar-tiger.tar.gz && \
    rm moot-2.0.20-1.tar.gz waste-data.de-dstar-tiger.tar.gz

# Build and install moot, then write the Waste rc file.
# printf emits one line per argument regardless of which shell runs this
# step; the previous echo relied on the shell's echo interpreting "\n".
# NOTE(review): the rc paths use "de-dstar-dtiger" while the data tarball is
# named "de-dstar-tiger" - presumably the archive's top-level directory; kept
# unchanged, confirm against the tarball contents.
RUN cd ./Waste/moot-2.0.20-1 && \
    ./configure && \
    make && \
    make install && \
    ldconfig && \
    printf '%s\n' \
      "abbrevs /euralex/Waste/de-dstar-dtiger/abbr.lex" \
      "stopwords /euralex/Waste/de-dstar-dtiger/stop.lex" \
      "conjunctions /euralex/Waste/de-dstar-dtiger/conj.lex" \
      "model /euralex/Waste/de-dstar-dtiger/model.hmm" \
      > /euralex/Waste/waste.rc

# Smoke test: run waste on the sample text with the generated rc file.
RUN echo "Waste\n" && cat ./example.txt | waste -N --rcfile=./Waste/waste.rc
 | 142 |  | 
 | 143 |  | 
###################
# Install nnsplit #
###################

# The benchmark crate is provided by the build context.
COPY nnsplit_bench /euralex/nnsplit_bench/

RUN apt-get update && apt-get install -y cargo

# Build the release binary, move it to ./nnsplit and delete the Cargo target
# directory in the SAME layer. Deleting it in a later layer would not shrink
# the image, because the build artifacts would remain in the earlier layer.
# The ./nnsplit_bench source directory itself is kept here; the cleanup step
# near the end of this file removes it.
RUN cd ./nnsplit_bench && \
    cargo build --release && \
    cd .. && \
    mkdir ./nnsplit && \
    mv ./nnsplit_bench/target/release/nnsplit_bench ./nnsplit/nnsplit_bench && \
    rm -r ./nnsplit_bench/target

# Smoke test: run the benchmark binary on the sample text.
RUN echo "nnsplit\n" && ./nnsplit/nnsplit_bench example.txt
 | 160 |  | 
####################
# Install Elephant #
####################

# Refresh the package index in the same layer as the install so a stale
# cached index cannot break the package download.
RUN apt-get update && \
    apt-get install -y python2

# Make "python" resolve to Python 2.
RUN ln -s /usr/bin/python2 /usr/bin/python

# Fetch elephant-wrapper and its third-party components into the
# directory layout its Makefile expects.
RUN git clone https://github.com/erwanm/elephant-wrapper.git && \
    cd elephant-wrapper/third-party && \
    git clone https://github.com/ParallelMeaningBank/elephant.git && \
    git clone https://github.com/Jekub/Wapiti.git && \
    git clone https://github.com/mspandit/rnnlm.git

# Build, then put the elephant and wapiti binaries on the PATH.
RUN cd elephant-wrapper && \
    make && \
    make install && \
    cd .. && \
    mv ./elephant-wrapper/bin/elephant /usr/local/bin/ && \
    mv ./elephant-wrapper/bin/wapiti /usr/local/bin/

# Smoke test: tokenize the sample text with the UD_German model.
RUN echo "Elephant-Wrapper" && ./elephant-wrapper/bin/tokenize.sh -i example.txt UD_German
 | 183 |  | 
| Akron | 2e211f6 | 2022-03-01 15:18:36 +0100 | [diff] [blame] | 184 |  | 
#################
# Install SpaCy #
#################

# click is pinned first, then the pinned spaCy release is installed.
RUN pip3 install click==8.0.4 \
 && pip3 install -U spacy==3.2.3

# Wrapper script from the build context.
COPY spacy /euralex/spacy/

# Smoke test: run the wrapper on the sample text.
RUN echo "SpaCy" \
 && python3 ./spacy/spacy_tok.py example.txt

# Sentence splitter: German pipelines (small and transformer-based).
RUN python3 -m spacy download de_core_news_sm \
 && python3 -m spacy download de_dep_news_trf
 | 199 |  | 
| Akron | 492a3bb | 2022-03-02 10:38:33 +0100 | [diff] [blame] | 200 |  | 
###########################
# Install Stanford parser #
###########################

# Following https://stanfordnlp.github.io/CoreNLP/index.html

# Download the PINNED 4.4.0 distribution instead of the moving "latest"
# archive: every later step hard-codes the stanford-corenlp-4.4.0 directory
# and the 4.4.0 German models, so the build breaks as soon as "latest"
# points at a newer release.
# Download, unpack and archive removal happen in one layer so the zip does
# not remain in the image. The Maven URL is quoted because it contains "?",
# which is a shell glob character.
RUN wget https://nlp.stanford.edu/software/stanford-corenlp-4.4.0.zip && \
    unzip stanford-corenlp-4.4.0.zip && \
    rm stanford-corenlp-4.4.0.zip && \
    wget "https://search.maven.org/remotecontent?filepath=edu/stanford/nlp/stanford-corenlp/4.4.0/stanford-corenlp-4.4.0-models-german.jar" \
         -O stanford-corenlp-4.4.0/stanford-corenlp-4.4.0-models-german.jar

# Run with threads!
# Smoke test: tokenize the sample text with the German tokenizer settings.
# The classpath wildcard is quoted so it reaches Java unexpanded.
RUN echo "StanfordNLP" && \
    CLASSPATH="/euralex/stanford-corenlp-4.4.0/*" java edu.stanford.nlp.pipeline.StanfordCoreNLP \
    -annotators tokenize \
    -tokenize.language=german \
    -file example.txt
 | 221 |  | 
 | 222 |  | 
##################
# Install Cutter #
##################

# Pinned cutter-ng release from PyPI.
RUN pip3 install cutter-ng==2.5

# Wrapper script from the build context.
COPY cutter /euralex/cutter/

# Smoke test: run the wrapper in "nosent" mode on the sample text.
RUN echo "Cutter\n" \
 && python3 ./cutter/cutter.py nosent example.txt
 | 232 |  | 
 | 233 |  | 
#####################
# Install BlingFire #
#####################

# Pinned BlingFire release from PyPI.
RUN pip3 install -U blingfire==0.1.8

# Wrapper script from the build context.
COPY blingfire /euralex/blingfire/

# Smoke test: run the wrapper on the sample text.
RUN echo "BlingFire\n" \
 && python3 ./blingfire/blingfire_tok.py example.txt
 | 243 |  | 
 | 244 |  | 
#################
# Install Datok #
#################

# Fetch the v0.1.5 sources and build the datok binary inside ./Datok.
RUN wget https://github.com/KorAP/Datok/archive/refs/tags/v0.1.5.zip \
 && unzip v0.1.5.zip \
 && rm v0.1.5.zip \
 && mv Datok-0.1.5 Datok \
 && (cd Datok && go build ./cmd/datok.go)

# Smoke test: tokenize the sample text from stdin with the bundled test tokenizer.
RUN echo "DATOK\n" \
 && cat example.txt | ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.matok -
 | 257 |  | 
| Akron | 62f57cd | 2022-03-01 12:05:57 +0100 | [diff] [blame] | 258 |  | 
###########################
# Install KorAP-Tokenizer #
###########################

# Download the standalone 2.2.2 jar directly under its version-less name.
RUN mkdir KorAP-Tokenizer \
 && wget -O KorAP-Tokenizer/KorAP-Tokenizer.jar \
      https://github.com/KorAP/KorAP-Tokenizer/releases/download/v2.2.2/KorAP-Tokenizer-2.2.2-standalone.jar

# Smoke test: tokenize the sample text from stdin with the German settings.
RUN echo "KorAP-Tokenizer\n" \
 && cat example.txt | java -jar KorAP-Tokenizer/KorAP-Tokenizer.jar -l de -s -
 | 269 |  | 
 | 270 |  | 
# Create the unprivileged runtime user, remove leftovers that are no longer
# needed (the nnsplit benchmark sources and the deep-eos source archive) and
# hand the whole working tree over to that user.
RUN useradd --create-home --shell /bin/bash euralex \
 && rm -r ./nnsplit_bench \
 && rm /euralex/v0.1.zip \
 && chown -R euralex:euralex /euralex

# Everything from here on runs as the unprivileged user.
USER euralex

WORKDIR /euralex

# The container runs Perl; arguments are supplied at "docker run" time.
ENTRYPOINT [ "perl" ]

LABEL maintainer="korap@ids-mannheim.de" \
      description="Tokenizer evaluation for EURALEX"