blob: 65b81d10ee7ec6b2860b96a74957aa325ef78143 [file] [log] [blame]
# Base image for the tokenizer comparison.
# NOTE(review): the platform is pinned to linux/amd64, presumably because some
# of the evaluated tools are only usable on that architecture - confirm.
FROM --platform=linux/amd64 debian:bookworm-slim

# All tools and the sample text live below this directory.
WORKDIR /euralex

# Small German sample text that every tokenizer below is run against.
RUN echo "Dies ist ein Test. Also, nur ein Beispiel." > example.txt

# Install only the named packages (no recommended extras) to keep the layer
# small. The apt package lists are deliberately NOT removed here: later steps
# in this file run `apt-get install` and may rely on them being present.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        git \
        perl
10
11
########################
# Baseline: word count #
########################
# Print a "WC" heading, then the word count of the sample text.
RUN echo "WC\n" \
    && wc -w ./example.txt
16
17
##################
# Install SoMaJo #
##################
# Refresh the package index in the same layer as the install, so that a cached
# and outdated index from an earlier layer can never be used.
RUN apt-get update && \
    apt-get install -y \
        python3 \
        python3-dev \
        python3-pip

# Debian bookworm marks the system Python as externally managed (PEP 668), so
# pip refuses to install into it unless explicitly told to. This image is a
# single-purpose evaluation environment, so overriding the guard is acceptable.
# --no-cache-dir keeps pip's download cache out of the layer.
RUN pip3 install --no-cache-dir --break-system-packages SoMaJo

# Smoke test: tokenize and sentence-split the sample text.
RUN echo "SOMAJO\n" && somajo-tokenizer --split_sentences ./example.txt
29
#################
# Install Datok #
#################
# Install the Go toolchain plus the download helpers, then fetch the tagged
# Datok source archive and build the command-line binary inside ./Datok.
# The package index is refreshed in this layer so the install never depends on
# a stale index cached by an earlier layer.
RUN apt-get update && \
    apt-get install -y \
        golang \
        unzip \
        wget && \
    wget https://github.com/KorAP/Datok/archive/refs/tags/v0.1.1.zip && \
    unzip v0.1.1.zip && \
    rm v0.1.1.zip && \
    mv Datok-0.1.1 Datok && \
    cd Datok && \
    go build ./cmd/datok.go

# Smoke test: feed the sample text on stdin. A redirect is used instead of
# `cat ... |` so that an unreadable input file fails the build step.
RUN echo "DATOK\n" && ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.matok - < example.txt
42
43
###################
# Install OpenNLP #
###################
# Debian bookworm does not package openjdk-11; default-jre-headless resolves to
# the JRE supported by the release. The headless variant is sufficient because
# only the OpenNLP command-line tools are used below.
RUN apt-get update && \
    apt-get install -y default-jre-headless

# Fetch the OpenNLP 1.9.4 distribution (without its docs) and the German
# sentence/token models. archive.apache.org is used because dlcdn.apache.org
# only carries current releases, while the archive keeps every release.
# The -x pattern is quoted so the shell never expands it before unzip sees it.
RUN wget https://archive.apache.org/dist/opennlp/opennlp-1.9.4/apache-opennlp-1.9.4-bin.zip && \
    unzip apache-opennlp-1.9.4-bin.zip -x 'apache-opennlp-1.9.4/docs/*' && \
    rm apache-opennlp-1.9.4-bin.zip && \
    mv apache-opennlp-1.9.4 opennlp && \
    mkdir ./opennlp/models && \
    wget -P ./opennlp/models \
        https://archive.apache.org/dist/opennlp/models/ud-models-1.0/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin \
        https://archive.apache.org/dist/opennlp/models/ud-models-1.0/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin

# Smoke tests: each tool reads the sample text from stdin. Redirects are used
# instead of `cat ... |` so that an unreadable input file fails the build step.
RUN echo "OpenNLP (1)" && ./opennlp/bin/opennlp SimpleTokenizer < example.txt

RUN echo "OpenNLP (2)" && ./opennlp/bin/opennlp TokenizerME ./opennlp/models/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin < example.txt

RUN echo "OpenNLP (3)" && ./opennlp/bin/opennlp SentenceDetector ./opennlp/models/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin < example.txt
64
65
######################
# Install TreeTagger #
######################
# Download the TreeTagger script bundle into ./treetagger, unpack it there and
# drop the archive again within the same layer.
RUN mkdir ./treetagger \
    && wget -P ./treetagger https://cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/tagger-scripts.tar.gz \
    && tar -xvzf ./treetagger/tagger-scripts.tar.gz -C ./treetagger \
    && rm ./treetagger/tagger-scripts.tar.gz

# Smoke test: run the UTF-8 tokenizer script with the German abbreviation list.
RUN echo "TreeTagger" \
    && cat example.txt | ./treetagger/cmd/utf8-tokenize.perl -a ./treetagger/lib/german-abbreviations
76
77
####################
# Install deep-eos #
####################
# Fetch the tagged deep-eos sources and the three pre-trained German models
# (CNN, Bi-LSTM, LSTM) with their vocabularies. The source archive is removed
# in the same layer so it does not remain in the image.
RUN wget https://github.com/dbmdz/deep-eos/archive/refs/tags/v0.1.zip && \
    unzip v0.1.zip && \
    rm v0.1.zip && \
    mv deep-eos-0.1 deep-eos && \
    wget -P ./deep-eos \
        https://github.com/dbmdz/deep-eos/releases/download/v0.1/cnn-de.model \
        https://github.com/dbmdz/deep-eos/releases/download/v0.1/cnn-de.vocab \
        https://github.com/dbmdz/deep-eos/releases/download/v0.1/bi-lstm-de.model \
        https://github.com/dbmdz/deep-eos/releases/download/v0.1/bi-lstm-de.vocab \
        https://github.com/dbmdz/deep-eos/releases/download/v0.1/lstm-de.model \
        https://github.com/dbmdz/deep-eos/releases/download/v0.1/lstm-de.vocab

# Debian bookworm marks the system Python as externally managed (PEP 668), so
# every pip call needs --break-system-packages to install into it.
# The sed call rewrites the plot_model import in eos.py to come from
# tensorflow.keras instead of keras.
RUN pip3 install --no-cache-dir --break-system-packages --upgrade pip && \
    pip3 install --no-cache-dir --break-system-packages --upgrade tensorflow && \
    pip3 install --no-cache-dir --break-system-packages keras && \
    sed -i 's/from keras.utils import plot_model/from tensorflow.keras.utils import plot_model/' ./deep-eos/eos.py

# Smoke tests: tag sentence boundaries in the sample text with each model.
RUN echo "deep-eos (1)" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/cnn-de.model --vocab-filename ./deep-eos/cnn-de.vocab --eos-marker "§" tag

RUN echo "deep-eos (2)" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/bi-lstm-de.model --vocab-filename ./deep-eos/bi-lstm-de.vocab --eos-marker "§" tag

RUN echo "deep-eos (3)" && python3 ./deep-eos/main.py --input-file example.txt --model-filename ./deep-eos/lstm-de.model --vocab-filename ./deep-eos/lstm-de.vocab --eos-marker "§" tag
102
103
# Containers started from this image run `sh` as their entry point.
ENTRYPOINT ["sh"]

# Image metadata.
LABEL maintainer="korap@ids-mannheim.de" \
      description="Tokenizer evaluation for EURALEX"