#!/usr/bin/env perl
use strict;
use warnings;
use Benchmark qw!:hireswallclock :all!;
use Data::Dumper;
use POSIX 'round';

our @ARGV;

# The first parameter is the number of iterations.

my $FILE = 'effi-1x-utf8.txt';
system 'gzip -dkf ./corpus/' . $FILE . '.gz';
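# gzip -dkf: decompress, keep the original .gz file, and force overwriting an
# existing decompressed copy.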
my $iter = 1;

if ($ARGV[0]) {
  $iter = $ARGV[0] + 0;
};

my $temp_dir = '/euralex/batches/';
mkdir($temp_dir);

# Read file
my $single = '';
if (open(my $f, '<', './corpus/' . $FILE)) {
  while (!eof($f)) {
    $single .= <$f>;
  };
  close($f);
};

warn "Concatenate file\n";

my $count = 300;

# Concatenate the text $count times
my $data = '';
foreach (1..$count) {
  $data .= $single;
};

# Save the concatenated file to the batch directory inside the Docker container
open(X, '>', $temp_dir . 'effi-' . $count . 'x-utf8.txt') or die $!;
print X $data;
close(X);

warn "Calculate offsets\n";

# Calculate offsets with datok (just because) for segmenting
my $offsets = `cat ${temp_dir}effi-${count}x-utf8.txt | ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.matok --no-tokens --no-sentences -p -`;
my @offsets = split / /, $offsets;
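# Assumption based on the usage below: the offset output is a space separated
# list of start/end character offsets per token, so $offsets[2 * $i + 1] is
# read as the end offset of the $i-th token.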

unlink $temp_dir . 'effi-' . $count . 'x-utf8.txt';

$FILE = undef;

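# Each entry names a tool and wraps the shell call that runs it on the current
# batch file ($temp_dir . $FILE); output goes to /dev/null so mainly the
# processing time is measured by timethese() below.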
my $models = {
  'wc' => sub {
    system 'wc -w '.$temp_dir.$FILE.' > /dev/null';
  },
  'SoMaJo' => sub {
    system 'somajo-tokenizer '.$temp_dir.$FILE.' --split_sentences > /dev/null';
  },
  'SoMaJo_p2' => sub {
    system 'somajo-tokenizer '.$temp_dir.$FILE.' --parallel=2 --split_sentences > /dev/null';
  },
  'SoMaJo_p4' => sub {
    system 'somajo-tokenizer '.$temp_dir.$FILE.' --parallel=4 --split_sentences > /dev/null';
  },
  'SoMaJo_p8' => sub {
    system 'somajo-tokenizer '.$temp_dir.$FILE.' --parallel=8 --split_sentences > /dev/null';
  },
  'Datok_matok' => sub {
    system 'cat '.$temp_dir.$FILE.' | ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.matok - > /dev/null'
  },
  'Datok_datok' => sub {
    system 'cat '.$temp_dir.$FILE.' | ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.datok - > /dev/null'
  },
  'OpenNLP_Simple' => sub {
    system 'cat '.$temp_dir.$FILE.' | ./opennlp/bin/opennlp SimpleTokenizer > /dev/null';
  },
  'OpenNLP_Tokenizer_de-ud-gsd' => sub {
    system 'cat '.$temp_dir.$FILE.' | ./opennlp/bin/opennlp TokenizerME ./opennlp/models/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin > /dev/null';
  },
  'OpenNLP_Sentence_de-ud-gsd' => sub {
    system 'cat '.$temp_dir.$FILE.' | ./opennlp/bin/opennlp SentenceDetector ./opennlp/models/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin > /dev/null';
  },
  'TreeTagger' => sub {
    system 'cat '.$temp_dir.$FILE.' | perl ./treetagger/cmd/utf8-tokenize.perl -a ./treetagger/lib/german-abbreviations > /dev/null';
  },
  'deep-eos_bi-lstm-de' => sub {
    system 'python3 ./deep-eos/main.py --input-file '.$temp_dir.$FILE.' --model-filename ./deep-eos/bi-lstm-de.model --vocab-filename ./deep-eos/bi-lstm-de.vocab --eos-marker "§" tag > /dev/null';
  },
  'deep-eos_cnn-de' => sub {
    system 'python3 ./deep-eos/main.py --input-file '.$temp_dir.$FILE.' --model-filename ./deep-eos/cnn-de.model --vocab-filename ./deep-eos/cnn-de.vocab --eos-marker "§" tag > /dev/null';
  },
  'deep-eos_lstm-de' => sub {
    system 'python3 ./deep-eos/main.py --input-file '.$temp_dir.$FILE.' --model-filename ./deep-eos/lstm-de.model --vocab-filename ./deep-eos/lstm-de.vocab --eos-marker "§" tag > /dev/null';
  },
  'JTok' => sub {
    chdir '/euralex/JTok/bin';
    system 'sh tokenize '.$temp_dir.$FILE.' de > /dev/null';
    chdir '/euralex';
  },
  'KorAP-Tokenizer' => sub {
    system 'cat '.$temp_dir.$FILE.' | java -jar ./KorAP-Tokenizer/KorAP-Tokenizer.jar -l de -s > /dev/null'
  },
  Syntok_tokenizer => sub {
    system 'python3 -m syntok.tokenizer '.$temp_dir.$FILE.' > /dev/null';
  },
  Syntok_segmenter => sub {
    system 'python3 -m syntok.segmenter '.$temp_dir.$FILE.' > /dev/null';
  },
  Waste => sub {
    system 'cat '.$temp_dir.$FILE.' | waste -N -v0 --rcfile=./Waste/waste.rc > /dev/null';
  },
  nnsplit => sub {
    system './nnsplit/nnsplit_bench '.$temp_dir.$FILE.' > /dev/null'
  },
  elephant => sub {
    system './elephant-wrapper/bin/tokenize.sh -i '.$temp_dir.$FILE.' UD_German > /dev/null'
  },
  cutter => sub {
    system 'python3 ./cutter/cutter.py nosent '.$temp_dir.$FILE.' > /dev/null'
  },
  blingfire_tok => sub {
    system 'python3 ./blingfire/blingfire_tok.py '.$temp_dir.$FILE.' > /dev/null'
  },
  blingfire_sent => sub {
    system 'python3 ./blingfire/blingfire_sent.py '.$temp_dir.$FILE.' > /dev/null'
  },
  spacy_tok => sub {
    system 'python3 ./spacy/spacy_tok.py '.$temp_dir.$FILE.' > /dev/null'
  },
  spacy_dep => sub {
    system 'python3 ./spacy/spacy_sent.py dep '.$temp_dir.$FILE.' > /dev/null'
  },
  spacy_stat => sub {
    system 'python3 ./spacy/spacy_sent.py stat '.$temp_dir.$FILE.' > /dev/null'
  },
  spacy_sentencizer => sub {
    system 'python3 ./spacy/spacy_sent.py sentencizer '.$temp_dir.$FILE.' > /dev/null'
  },
  Stanford => sub {
    system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
      '-props german -annotators tokenize,ssplit,mwt -tokenize.language=german -file '.$temp_dir . $FILE
  },
  Stanford_t2 => sub {
    system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
      '-props german -annotators tokenize,ssplit,mwt -tokenize.language=german -threads=2 -file '.$temp_dir . $FILE
  },
  Stanford_t4 => sub {
    system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
      '-props german -annotators tokenize,ssplit,mwt -tokenize.language=german -threads=4 -file '.$temp_dir . $FILE
  },
  Stanford_t8 => sub {
    system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
      '-props german -annotators tokenize,ssplit,mwt -tokenize.language=german -threads=8 -file '.$temp_dir . $FILE
  },
  Stanford_tokonly => sub {
    system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
      '-props german -annotators tokenize -tokenize.language=german -file '.$temp_dir . $FILE;
    unlink $temp_dir . $FILE . '.out';
  },
};

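# Uncomment a line to exclude that tool from the benchmark run.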
#delete $models->{'wc'};
#delete $models->{'SoMaJo'};
#delete $models->{'SoMaJo_p2'};
#delete $models->{'SoMaJo_p4'};
#delete $models->{'SoMaJo_p8'};
#delete $models->{'Datok_matok'};
#delete $models->{'Datok_datok'};
#delete $models->{'OpenNLP_Simple'};
#delete $models->{'OpenNLP_Tokenizer_de-ud-gsd'};
#delete $models->{'OpenNLP_Sentence_de-ud-gsd'};
#delete $models->{'TreeTagger'};
#delete $models->{'deep-eos_bi-lstm-de'};
#delete $models->{'deep-eos_cnn-de'};
#delete $models->{'deep-eos_lstm-de'};
#delete $models->{'JTok'};
#delete $models->{'KorAP-Tokenizer'};
#delete $models->{'Syntok_tokenizer'};
#delete $models->{'Syntok_segmenter'};
#delete $models->{'Waste'};
#delete $models->{'nnsplit'};
#delete $models->{'elephant'};
#delete $models->{'Stanford'};
#delete $models->{'Stanford_t2'};
#delete $models->{'Stanford_t4'};
#delete $models->{'Stanford_t8'};
#delete $models->{'Stanford_tokonly'};
#delete $models->{'cutter'};
#delete $models->{'spacy_tok'};
#delete $models->{'spacy_sentencizer'};
#delete $models->{'spacy_dep'};
#delete $models->{'spacy_stat'};
#delete $models->{'blingfire_tok'};
#delete $models->{'blingfire_sent'};

my $t0 = Benchmark->new;

# Get some batch files with
# 1000 tokens
# 2000 tokens
# 4000 tokens
# 8000 tokens
# ...
# 8192000 tokens
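# Each batch is cut from $data at the end offset of its last token, so batch
# boundaries fall on token boundaries rather than in the middle of a word.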
for (my $x = 0; $x < 100; $x++) {
  my $i = (2**$x) * 1000;

  warn "Create batch with $i tokens\n";

  last if ($i*2)+1 > scalar(@offsets);

  my $off = $offsets[($i*2)+1];
  last unless $off;

  $FILE = 'effi-batch-' . $i . '.txt';

  open(Y, '>', $temp_dir . $FILE) or die $!;
  print Y substr($data, 0, $off);
  close(Y);

  my $cmp = timethese($iter => $models);

  print "\n----------------------------------\n";

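  # A Benchmark object is an array reference: element 0 is the elapsed
  # wallclock time (high resolution because of :hireswallclock) and element 5
  # is the number of iterations, so their ratio is the time per run in seconds.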
  foreach my $tool (sort keys %$cmp) {
    my $seconds_per_run = $cmp->{$tool}->[0] / $cmp->{$tool}->[5];
    my $tokens_per_msecond = ($i / $seconds_per_run) / 1000;
    print $tool, "\t", $i, "\t", $seconds_per_run, "\t", sprintf("%.2f", $seconds_per_run), "\t", $tokens_per_msecond, "\t", sprintf("%.2f", $tokens_per_msecond), "\n";
  };

  print "\n----------------------------------\n";

  unlink $temp_dir . $FILE;
};


print "Benchmarking took: ", timestr(timediff(Benchmark->new, $t0)), "\n";