# Base image, pinned to the amd64 platform.
FROM --platform=linux/amd64 debian:bookworm-slim

# All tools are installed and exercised from this directory.
WORKDIR /euralex

# Small German sample text that every tokenizer below is run against.
RUN printf '%s\n' "Dies ist ein Test. Also, nur ein Beispiel." > example.txt
 | 7 | RUN apt-get update && \ | 
 | 8 |     apt-get install -y git \ | 
| Akron | fc376a6 | 2022-02-28 16:55:29 +0100 | [diff] [blame] | 9 |     wget \ | 
 | 10 |     unzip \ | 
 | 11 |     perl \ | 
 | 12 |     golang | 
| Akron | 5af0abd | 2022-02-27 15:18:32 +0100 | [diff] [blame] | 13 |  | 
 | 14 | ############ | 
 | 15 | # Check WC # | 
 | 16 | ############ | 
 | 17 | RUN echo "WC\n" && wc -w ./example.txt | 
 | 18 |  | 
 | 19 |  | 
 | 20 | ################## | 
 | 21 | # Install SoMaJo # | 
 | 22 | ################## | 
| Akron | fc376a6 | 2022-02-28 16:55:29 +0100 | [diff] [blame] | 23 | RUN apt-get install -y python3-dev \ | 
| Akron | 5af0abd | 2022-02-27 15:18:32 +0100 | [diff] [blame] | 24 |     python3 \ | 
| Akron | fc376a6 | 2022-02-28 16:55:29 +0100 | [diff] [blame] | 25 |     python3-pip && \ | 
 | 26 |     pip3 install SoMaJo | 
| Akron | 5af0abd | 2022-02-27 15:18:32 +0100 | [diff] [blame] | 27 |  | 
 | 28 | RUN echo "SOMAJO\n" && somajo-tokenizer --split_sentences ./example.txt | 
 | 29 |  | 
| Akron | 5af0abd | 2022-02-27 15:18:32 +0100 | [diff] [blame] | 30 |  | 
 | 31 | ################### | 
 | 32 | # Install OpenNLP # | 
 | 33 | ################### | 
| Akron | 3a9b7fc | 2022-03-01 11:47:57 +0100 | [diff] [blame^] | 34 | RUN apt-get install -y openjdk-11-jdk | 
| Akron | 5af0abd | 2022-02-27 15:18:32 +0100 | [diff] [blame] | 35 |  | 
 | 36 | RUN wget https://dlcdn.apache.org/opennlp/opennlp-1.9.4/apache-opennlp-1.9.4-bin.zip && \ | 
 | 37 |     unzip apache-opennlp-1.9.4-bin.zip -x apache-opennlp-1.9.4/docs/* && \ | 
 | 38 |     rm apache-opennlp-1.9.4-bin.zip && \ | 
 | 39 |     mv apache-opennlp-1.9.4 opennlp && \ | 
 | 40 |     mkdir ./opennlp/models && \ | 
 | 41 |     wget https://dlcdn.apache.org/opennlp/models/ud-models-1.0/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin && \ | 
 | 42 |     wget https://dlcdn.apache.org/opennlp/models/ud-models-1.0/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin && \ | 
 | 43 |     mv opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin ./opennlp/models/ && \ | 
 | 44 |     mv opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin ./opennlp/models/ | 
 | 45 |  | 
 | 46 | RUN echo "OpenNLP (1)" && cat example.txt | ./opennlp/bin/opennlp SimpleTokenizer  | 
 | 47 |  | 
 | 48 | RUN echo "OpenNLP (2)" && cat example.txt | ./opennlp/bin/opennlp TokenizerME ./opennlp/models/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin | 
 | 49 |  | 
 | 50 | RUN echo "OpenNLP (3)" && cat example.txt | ./opennlp/bin/opennlp SentenceDetector ./opennlp/models/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin | 
 | 51 |  | 
 | 52 |  | 
 | 53 | ###################### | 
 | 54 | # Install TreeTagger # | 
 | 55 | ###################### | 
 | 56 | RUN mkdir ./treetagger && \ | 
 | 57 |     cd treetagger && \ | 
 | 58 |     wget https://cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz && \ | 
 | 59 |     tar -xvzf tagger-scripts.tar.gz && \ | 
 | 60 |     rm tagger-scripts.tar.gz | 
 | 61 |  | 
 | 62 | RUN echo "TreeTagger" && cat example.txt | ./treetagger/cmd/utf8-tokenize.perl -a ./treetagger/lib/german-abbreviations | 
 | 63 |  | 
 | 64 |  | 
 | 65 | #################### | 
 | 66 | # Install deep-eos # | 
 | 67 | #################### | 
 | 68 | RUN wget https://github.com/dbmdz/deep-eos/archive/refs/tags/v0.1.zip && \ | 
 | 69 |     unzip v0.1.zip && \ | 
 | 70 |     mv deep-eos-0.1 deep-eos && \ | 
 | 71 |     cd deep-eos && \ | 
 | 72 |     wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/cnn-de.model && \ | 
 | 73 |     wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/cnn-de.vocab && \ | 
 | 74 |     wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/bi-lstm-de.model && \ | 
 | 75 |     wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/bi-lstm-de.vocab && \ | 
 | 76 |     wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/lstm-de.model && \ | 
 | 77 |     wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/lstm-de.vocab | 
 | 78 |  | 
 | 79 | RUN pip3 install --upgrade pip && \ | 
| Akron | fc376a6 | 2022-02-28 16:55:29 +0100 | [diff] [blame] | 80 |     pip3 install --upgrade tensorflow | 
 | 81 |  | 
 | 82 | RUN pip3 install keras | 
 | 83 |  | 
 | 84 | RUN sed -i 's/from keras.utils import plot_model/from tensorflow.keras.utils import plot_model/' ./deep-eos/eos.py | 
| Akron | 5af0abd | 2022-02-27 15:18:32 +0100 | [diff] [blame] | 85 |  | 
 | 86 | RUN echo "deep-eos (1)" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/cnn-de.model --vocab-filename ./deep-eos/cnn-de.vocab --eos-marker "§" tag | 
 | 87 |  | 
 | 88 | RUN echo "deep-eos (2)" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/bi-lstm-de.model --vocab-filename ./deep-eos/bi-lstm-de.vocab --eos-marker "§" tag | 
 | 89 |  | 
 | 90 | RUN echo "deep-eos (3)" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/lstm-de.model --vocab-filename ./deep-eos/lstm-de.vocab --eos-marker "§" tag | 
 | 91 |  | 
 | 92 |  | 
################
# Install JTok #
################
# Refresh the package index in the same layer as the install so that a cached
# `apt-get update` layer from earlier in the build cannot go stale.
RUN apt-get update && \
    apt-get install -y maven

# Build JTok 2.1.19 from source with Maven, unpack the resulting binary
# distribution as ./JTok and drop the source tree again.
# JAVA_HOME is set for the Maven invocation only and points at the Java 11 JDK path.
RUN wget https://github.com/DFKI-MLT/JTok/archive/refs/tags/v2.1.19.zip && \
    unzip v2.1.19.zip && \
    rm v2.1.19.zip && \
    cd JTok-2.1.19 && \
    JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64 mvn clean package assembly:single && \
    cd .. && \
    unzip ./JTok-2.1.19/target/jtok-core-2.1.19-bin.zip && \
    rm -r JTok-2.1.19 && \
    mv jtok-core-2.1.19 JTok

# Smoke test: the tokenize script is started from inside ./JTok/bin, with the
# sample text given by absolute path and "de" as the language argument.
RUN echo "JTok" && \
    cd ./JTok/bin && \
    sh tokenize /euralex/example.txt de
#################
# Install Datok #
#################
# Fetch the Datok v0.1.1 sources, unpack them as ./Datok and build the command
# line binary inside that directory.
RUN wget https://github.com/KorAP/Datok/archive/refs/tags/v0.1.1.zip \
 && unzip v0.1.1.zip \
 && rm v0.1.1.zip \
 && mv Datok-0.1.1 Datok \
 && ( cd Datok && go build ./cmd/datok.go )

# Smoke test: tokenize the sample text from stdin using the tokenizer file
# shipped in the repository's testdata directory.
RUN echo "DATOK\n" \
 && ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.matok - < example.txt
# Containers start in a plain shell.
ENTRYPOINT ["sh"]

# Image metadata.
LABEL maintainer="korap@ids-mannheim.de" \
      description="Tokenizer evaluation for EURALEX"