# Pin linux/amd64 so the image builds identically on non-x86 hosts
# (e.g. Apple Silicon).
FROM --platform=linux/amd64 debian:bookworm-slim

# All tools and test data live under /euralex.
WORKDIR /euralex

# Small German sample sentence used below to smoke-test every tokenizer.
RUN echo "Dies ist ein Test. Also, nur ein Beispiel." > example.txt
 | 6 |  | 
# Shared build/runtime dependencies for the tools installed below.
# NOTE(review): apt lists are deliberately NOT cleaned here
# (no `rm -rf /var/lib/apt/lists/*`) because later layers run further
# `apt-get install` calls without a fresh `apt-get update`.
RUN apt-get update && \
    apt-get install -y git \
    wget \
    unzip \
    perl \
    golang

############
# Check WC #
############
# Baseline: plain whitespace word count of the sample text.
RUN echo "WC\n" && wc -w ./example.txt
 | 18 |  | 
 | 19 |  | 
##################
# Install SoMaJo #
##################
# SoMaJo: rule-based tokenizer and sentence splitter for German,
# installed from PyPI (pulls in the Python 3 toolchain first).
RUN apt-get install -y python3-dev \
    python3 \
    python3-pip && \
    pip3 install SoMaJo

RUN echo "SOMAJO\n" && somajo-tokenizer --split_sentences ./example.txt
 | 29 |  | 
| Akron | 5af0abd | 2022-02-27 15:18:32 +0100 | [diff] [blame] | 30 |  | 
 | 31 | ################### | 
 | 32 | # Install OpenNLP # | 
 | 33 | ################### | 
 | 34 | RUN apt-get install -y openjdk-11-jre | 
 | 35 |  | 
 | 36 | RUN wget https://dlcdn.apache.org/opennlp/opennlp-1.9.4/apache-opennlp-1.9.4-bin.zip && \ | 
 | 37 |     unzip apache-opennlp-1.9.4-bin.zip -x apache-opennlp-1.9.4/docs/* && \ | 
 | 38 |     rm apache-opennlp-1.9.4-bin.zip && \ | 
 | 39 |     mv apache-opennlp-1.9.4 opennlp && \ | 
 | 40 |     mkdir ./opennlp/models && \ | 
 | 41 |     wget https://dlcdn.apache.org/opennlp/models/ud-models-1.0/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin && \ | 
 | 42 |     wget https://dlcdn.apache.org/opennlp/models/ud-models-1.0/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin && \ | 
 | 43 |     mv opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin ./opennlp/models/ && \ | 
 | 44 |     mv opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin ./opennlp/models/ | 
 | 45 |  | 
 | 46 | RUN echo "OpenNLP (1)" && cat example.txt | ./opennlp/bin/opennlp SimpleTokenizer  | 
 | 47 |  | 
 | 48 | RUN echo "OpenNLP (2)" && cat example.txt | ./opennlp/bin/opennlp TokenizerME ./opennlp/models/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin | 
 | 49 |  | 
 | 50 | RUN echo "OpenNLP (3)" && cat example.txt | ./opennlp/bin/opennlp SentenceDetector ./opennlp/models/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin | 
 | 51 |  | 
 | 52 |  | 
######################
# Install TreeTagger #
######################
# Only the TreeTagger tokenizer scripts are needed here, not the tagger
# binaries themselves.
RUN mkdir ./treetagger && \
    cd treetagger && \
    wget https://cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz && \
    tar -xvzf tagger-scripts.tar.gz && \
    rm tagger-scripts.tar.gz

# Tokenize with the UTF-8 Perl tokenizer and its German abbreviation list.
RUN echo "TreeTagger" && cat example.txt | ./treetagger/cmd/utf8-tokenize.perl -a ./treetagger/lib/german-abbreviations
 | 63 |  | 
 | 64 |  | 
 | 65 | #################### | 
 | 66 | # Install deep-eos # | 
 | 67 | #################### | 
 | 68 | RUN wget https://github.com/dbmdz/deep-eos/archive/refs/tags/v0.1.zip && \ | 
 | 69 |     unzip v0.1.zip && \ | 
 | 70 |     mv deep-eos-0.1 deep-eos && \ | 
 | 71 |     cd deep-eos && \ | 
 | 72 |     wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/cnn-de.model && \ | 
 | 73 |     wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/cnn-de.vocab && \ | 
 | 74 |     wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/bi-lstm-de.model && \ | 
 | 75 |     wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/bi-lstm-de.vocab && \ | 
 | 76 |     wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/lstm-de.model && \ | 
 | 77 |     wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/lstm-de.vocab | 
 | 78 |  | 
 | 79 | RUN pip3 install --upgrade pip && \ | 
| Akron | fc376a6 | 2022-02-28 16:55:29 +0100 | [diff] [blame^] | 80 |     pip3 install --upgrade tensorflow | 
 | 81 |  | 
 | 82 | RUN pip3 install keras | 
 | 83 |  | 
 | 84 | RUN sed -i 's/from keras.utils import plot_model/from tensorflow.keras.utils import plot_model/' ./deep-eos/eos.py | 
| Akron | 5af0abd | 2022-02-27 15:18:32 +0100 | [diff] [blame] | 85 |  | 
 | 86 | RUN echo "deep-eos (1)" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/cnn-de.model --vocab-filename ./deep-eos/cnn-de.vocab --eos-marker "§" tag | 
 | 87 |  | 
 | 88 | RUN echo "deep-eos (2)" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/bi-lstm-de.model --vocab-filename ./deep-eos/bi-lstm-de.vocab --eos-marker "§" tag | 
 | 89 |  | 
 | 90 | RUN echo "deep-eos (3)" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/lstm-de.model --vocab-filename ./deep-eos/lstm-de.vocab --eos-marker "§" tag | 
 | 91 |  | 
 | 92 |  | 
#################
# Install Datok #
#################
# Datok: finite-state tokenizer/sentence splitter, built from source
# with the Go toolchain installed above.
RUN wget https://github.com/KorAP/Datok/archive/refs/tags/v0.1.1.zip && \
    unzip v0.1.1.zip && \
    rm v0.1.1.zip && \
    mv Datok-0.1.1 Datok && \
    cd Datok && \
    go build ./cmd/datok.go

# Run the bundled test transducer over the sample text (reads stdin, "-").
RUN echo "DATOK\n" && cat example.txt | ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.matok -
 | 105 |  | 
# All evaluation commands above run at build time; the container itself
# just drops into a shell for interactive inspection.
ENTRYPOINT [ "sh" ]

LABEL maintainer="korap@ids-mannheim.de"
LABEL description="Tokenizer evaluation for EURALEX"