FROM --platform=linux/amd64 debian:bookworm-slim

WORKDIR /euralex

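# Sample input shared by all tokenizer runs below: two short German sentences
# ("Dies ist ein Test. Also, nur ein Beispiel.", i.e. "This is a test. So, just an example.").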
RUN echo "Dies ist ein Test. Also, nur ein Beispiel." > example.txt

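# Base tooling; perl is needed later by TreeTagger's tokenizer script.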
RUN apt-get update && \
    apt-get install -y git \
    perl


############
# Check WC #
############
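# Naive baseline: count whitespace-separated words in the sample text.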
RUN echo "WC\n" && wc -w ./example.txt


##################
# Install SoMaJo #
##################
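# SoMaJo is a rule-based tokenizer and sentence splitter for German (and English)
# text, installed from PyPI; --split_sentences additionally marks sentence
# boundaries in the output.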
RUN apt-get install -y \
    python3-dev \
    python3 \
    python3-pip

RUN pip3 install SoMaJo

RUN echo "SOMAJO\n" && somajo-tokenizer --split_sentences ./example.txt

#################
# Install Datok #
#################
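# Datok is KorAP's finite-state tokenizer written in Go, built here from the
# v0.1.1 source archive; "datok tokenize -t <model> -" reads text from stdin and
# tokenizes it with the precompiled tokenizer model from the repository's testdata.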
RUN apt-get install -y golang wget unzip && \
    wget https://github.com/KorAP/Datok/archive/refs/tags/v0.1.1.zip && \
    unzip v0.1.1.zip && \
    rm v0.1.1.zip && \
    mv Datok-0.1.1 Datok && \
    cd Datok && \
    go build ./cmd/datok.go

RUN echo "DATOK\n" && cat example.txt | ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.matok -


###################
# Install OpenNLP #
###################
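# Apache OpenNLP requires a Java runtime. Three variants are run below: the
# rule-based SimpleTokenizer, the maxent TokenizerME with the German UD-GSD token
# model, and the SentenceDetector with the matching UD-GSD sentence model.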
RUN apt-get install -y openjdk-11-jre

RUN wget https://dlcdn.apache.org/opennlp/opennlp-1.9.4/apache-opennlp-1.9.4-bin.zip && \
    unzip apache-opennlp-1.9.4-bin.zip -x apache-opennlp-1.9.4/docs/* && \
    rm apache-opennlp-1.9.4-bin.zip && \
    mv apache-opennlp-1.9.4 opennlp && \
    mkdir ./opennlp/models && \
    wget https://dlcdn.apache.org/opennlp/models/ud-models-1.0/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin && \
    wget https://dlcdn.apache.org/opennlp/models/ud-models-1.0/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin && \
    mv opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin ./opennlp/models/ && \
    mv opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin ./opennlp/models/

RUN echo "OpenNLP (1)" && cat example.txt | ./opennlp/bin/opennlp SimpleTokenizer

RUN echo "OpenNLP (2)" && cat example.txt | ./opennlp/bin/opennlp TokenizerME ./opennlp/models/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin

RUN echo "OpenNLP (3)" && cat example.txt | ./opennlp/bin/opennlp SentenceDetector ./opennlp/models/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin


######################
# Install TreeTagger #
######################
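# Only TreeTagger's tokenizer scripts are needed, not the tagger itself;
# utf8-tokenize.perl takes a list of abbreviations via -a so that abbreviations
# are not split at their trailing period.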
RUN mkdir ./treetagger && \
    cd treetagger && \
    wget https://cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz && \
    tar -xvzf tagger-scripts.tar.gz && \
    rm tagger-scripts.tar.gz

RUN echo "TreeTagger" && cat example.txt | ./treetagger/cmd/utf8-tokenize.perl -a ./treetagger/lib/german-abbreviations


####################
# Install deep-eos #
####################
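# deep-eos performs neural end-of-sentence detection. Three pretrained German
# models (CNN, bi-LSTM, LSTM) are fetched from the v0.1 release; in "tag" mode
# the tool marks detected sentence boundaries with the --eos-marker character.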
RUN wget https://github.com/dbmdz/deep-eos/archive/refs/tags/v0.1.zip && \
    unzip v0.1.zip && \
    mv deep-eos-0.1 deep-eos && \
    cd deep-eos && \
    wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/cnn-de.model && \
    wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/cnn-de.vocab && \
    wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/bi-lstm-de.model && \
    wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/bi-lstm-de.vocab && \
    wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/lstm-de.model && \
    wget https://github.com/dbmdz/deep-eos/releases/download/v0.1/lstm-de.vocab

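# deep-eos imports plot_model from standalone Keras; the sed call below rewrites
# that import to tensorflow.keras so the script runs with a current TensorFlow.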
RUN pip3 install --upgrade pip && \
    pip3 install --upgrade tensorflow && \
    pip3 install keras && \
    sed -i 's/from keras.utils import plot_model/from tensorflow.keras.utils import plot_model/' ./deep-eos/eos.py

RUN echo "deep-eos (1)" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/cnn-de.model --vocab-filename ./deep-eos/cnn-de.vocab --eos-marker "§" tag

RUN echo "deep-eos (2)" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/bi-lstm-de.model --vocab-filename ./deep-eos/bi-lstm-de.vocab --eos-marker "§" tag

RUN echo "deep-eos (3)" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/lstm-de.model --vocab-filename ./deep-eos/lstm-de.vocab --eos-marker "§" tag


ENTRYPOINT [ "sh" ]

LABEL maintainer="korap@ids-mannheim.de"
LABEL description="Tokenizer evaluation for EURALEX"