blob: 9dcc1702a23fa5e17f6b609730c526515c804e20 [file] [log] [blame]
#!/usr/bin/env perl
use strict;
use warnings;
use Benchmark qw!:hireswallclock :all!;
use Data::Dumper;
use POSIX 'round';

# Usage: benchmark.pl [BATCH] [ITER]
#   BATCH - how often the base text is concatenated to form the corpus file
#   ITER  - number of benchmark iterations passed to timethese()

# Base corpus file; -k keeps the .gz, -f overwrites a stale decompressed copy.
my $FILE = 'effi-1x-utf8.txt';
system 'gzip -dkf ./corpus/' . $FILE . '.gz';
my $iter = 1;

if ($ARGV[0]) {
  my $number = $ARGV[0] + 0;
  my $out = 'effi-'.$number.'x-utf8.txt';

  # Remove any leftover output of a previous run first: the loop below
  # appends ('>>'), so a stale file would silently grow the corpus and
  # skew all throughput numbers.
  unlink './corpus/' . $out;

  for (my $i = 1; $i <= $number; $i++) {
    system 'cat ./corpus/' . $FILE . ' >> ./corpus/' . $out;
  };
  $FILE = $out;
};

if ($ARGV[1]) {
  $iter = $ARGV[1] + 0;
};
29
# Reference token count of the corpus, as reported by wc -w.
# Used below to express each tool's speed in tokens per millisecond.
my $effi_wc = `wc -w ./corpus/$FILE`;

# Keep only the leading count. BSD/macOS wc left-pads the number with
# spaces, so tolerate optional leading whitespace before the digits.
$effi_wc =~ s/^\s*(\d+)\s.*$/$1/;
33
34
35my $models = {
36 'wc' => sub {
37 system 'wc -w ./corpus/'.$FILE.' > /dev/null';
38 },
39 'SoMaJo' => sub {
40 system 'somajo-tokenizer ./corpus/'.$FILE.' --split_sentences > /dev/null';
41 },
42 'SoMaJo_p2' => sub {
43 system 'somajo-tokenizer ./corpus/'.$FILE.' --parallel=2 --split_sentences > /dev/null';
44 },
Akron9127d4f2022-03-11 10:54:46 +010045 'SoMaJo_p4' => sub {
46 system 'somajo-tokenizer ./corpus/'.$FILE.' --parallel=4 --split_sentences > /dev/null';
47 },
48 'SoMaJo_p8' => sub {
49 system 'somajo-tokenizer ./corpus/'.$FILE.' --parallel=8 --split_sentences > /dev/null';
50 },
Akron43cc5c92022-03-02 14:25:30 +010051 'Datok_matok' => sub {
52 system 'cat ./corpus/'.$FILE.' | ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.matok - > /dev/null'
53 },
54 'Datok_datok' => sub {
55 system 'cat ./corpus/'.$FILE.' | ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.datok - > /dev/null'
56 },
57 'OpenNLP_Simple' => sub {
58 system 'cat ./corpus/'.$FILE.' | ./opennlp/bin/opennlp SimpleTokenizer > /dev/null';
59 },
60 'OpenNLP_Tokenizer_de-ud-gsd' => sub {
61 system 'cat ./corpus/'.$FILE.' | ./opennlp/bin/opennlp TokenizerME ./opennlp/models/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin > /dev/null';
62 },
63 'OpenNLP_Sentence_de-ud-gsd' => sub {
64 system 'cat ./corpus/'.$FILE.' | ./opennlp/bin/opennlp SentenceDetector ./opennlp/models/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin > /dev/null';
65 },
66 'TreeTagger' => sub {
67 system 'cat ./corpus/'.$FILE.' | perl ./treetagger/cmd/utf8-tokenize.perl -a ./treetagger/lib/german-abbreviations > /dev/null';
68 },
69 'deep-eos_bi-lstm-de' => sub {
70 system 'python3 ./deep-eos/main.py --input-file ./corpus/'.$FILE.' --model-filename ./deep-eos/bi-lstm-de.model --vocab-filename ./deep-eos/bi-lstm-de.vocab --eos-marker "§" tag > /dev/null';
71 },
72 'deep-eos_cnn-de' => sub {
73 system 'python3 ./deep-eos/main.py --input-file ./corpus/'.$FILE.' --model-filename ./deep-eos/cnn-de.model --vocab-filename ./deep-eos/cnn-de.vocab --eos-marker "§" tag > /dev/null';
74 },
75 'deep-eos_lstm-de' => sub {
76 system 'python3 ./deep-eos/main.py --input-file ./corpus/'.$FILE.' --model-filename ./deep-eos/lstm-de.model --vocab-filename ./deep-eos/lstm-de.vocab --eos-marker "§" tag > /dev/null';
77 },
78 'JTok' => sub {
79 chdir '/euralex/JTok/bin';
80 system 'sh tokenize ../../corpus/'.$FILE.' de > /dev/null';
81 chdir '/euralex';
82 },
83 'KorAP-Tokenizer' => sub {
84 system 'cat ./corpus/'.$FILE.' | java -jar ./KorAP-Tokenizer/KorAP-Tokenizer.jar -l de -s > /dev/null'
85 },
86 Syntok_tokenizer => sub {
87 system 'python3 -m syntok.tokenizer ./corpus/'.$FILE.' > /dev/null';
88 },
89 Syntok_segmenter => sub {
90 system 'python3 -m syntok.segmenter ./corpus/'.$FILE.' > /dev/null';
91 },
92 Waste => sub {
Akroneb590da2022-03-02 18:31:34 +010093 system 'cat ./corpus/'.$FILE.' | waste -N -v0 --rcfile=./Waste/waste.rc > /dev/null';
Akron43cc5c92022-03-02 14:25:30 +010094 },
95 nnsplit => sub {
96 system './nnsplit/nnsplit_bench ./corpus/'.$FILE.' > /dev/null'
97 },
98 elephant => sub {
99 system './elephant-wrapper/bin/tokenize.sh -i ./corpus/'.$FILE.' UD_German > /dev/null'
100 },
Akronb6efc732022-03-17 15:03:56 +0100101 cutter => sub {
102 system 'python3 ./cutter/cutter.py nosent ./corpus/'.$FILE.' > /dev/null'
103 },
104 spacy_tok => sub {
Akron43cc5c92022-03-02 14:25:30 +0100105 system 'python3 ./spacy/spacy_tok.py ./corpus/'.$FILE.' > /dev/null'
Akronc2616422022-03-07 09:19:38 +0100106 },
Akronb6efc732022-03-17 15:03:56 +0100107 spacy_dep => sub {
108 system 'python3 ./spacy/spacy_sent.py dep ./corpus/'.$FILE.' > /dev/null'
109 },
110 spacy_stat => sub {
111 system 'python3 ./spacy/spacy_sent.py stat ./corpus/'.$FILE.' > /dev/null'
112 },
113 spacy_sentencizer => sub {
114 system 'python3 ./spacy/spacy_sent.py sentencizer ./corpus/'.$FILE.' > /dev/null'
115 },
Akronc2616422022-03-07 09:19:38 +0100116 Stanford => sub {
117 system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
118 '-props german -annotators tokenize,ssplit,mwt -tokenize.language=german -file ./corpus/' . $FILE
119 },
Akron9127d4f2022-03-11 10:54:46 +0100120 Stanford_t2 => sub {
121 system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
122 '-props german -annotators tokenize,ssplit,mwt -tokenize.language=german -threads=2 -file ./corpus/' . $FILE
123 },
Akronc2616422022-03-07 09:19:38 +0100124 Stanford_t4 => sub {
125 system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
126 '-props german -annotators tokenize,ssplit,mwt -tokenize.language=german -threads=4 -file ./corpus/' . $FILE
Akron9127d4f2022-03-11 10:54:46 +0100127 },
128 Stanford_t8 => sub {
129 system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
130 '-props german -annotators tokenize,ssplit,mwt -tokenize.language=german -threads=8 -file ./corpus/' . $FILE
Akronb6efc732022-03-17 15:03:56 +0100131 },
132 Stanford_tokonly => sub {
133 system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
134 '-props german -annotators tokenize -tokenize.language=german -file ./corpus/' . $FILE
135 },
Akron43cc5c92022-03-02 14:25:30 +0100136};
137
138#delete $models->{'SoMaJo'};
139#delete $models->{'SoMaJo_p2'};
Akron9127d4f2022-03-11 10:54:46 +0100140#delete $models->{'SoMaJo_p4'};
141#delete $models->{'SoMaJo_p8'};
Akron43cc5c92022-03-02 14:25:30 +0100142#delete $models->{'Datok_matok'};
143#delete $models->{'Datok_datok'};
144#delete $models->{'OpenNLP_Simple'};
145#delete $models->{'OpenNLP_Tokenizer_de-ud-gsd'};
146#delete $models->{'OpenNLP_Sentence_de-ud-gsd'};
147#delete $models->{'TreeTagger'};
148#delete $models->{'deep-eos_bi-lstm-de'};
149#delete $models->{'deep-eos_cnn-de'};
150#delete $models->{'deep-eos_lstm-de'};
151#delete $models->{'JTok'};
152#delete $models->{'KorAP-Tokenizer'};
153#delete $models->{'Syntok_tokenizer'};
154#delete $models->{'Syntok_segmenter'};
155#delete $models->{'Waste'};
156#delete $models->{'nnsplit'};
157#delete $models->{'elephant'};
Akronc2616422022-03-07 09:19:38 +0100158#delete $models->{'Stanford'};
Akron9127d4f2022-03-11 10:54:46 +0100159#delete $models->{'Stanford_t2'};
Akronc2616422022-03-07 09:19:38 +0100160#delete $models->{'Stanford_t4'};
Akron9127d4f2022-03-11 10:54:46 +0100161#delete $models->{'Stanford_t8'};
Akronb6efc732022-03-17 15:03:56 +0100162delete $models->{'Stanford_tokonly'};
163delete $models->{'cutter'};
164delete $models->{'spacy_tok'};
165delete $models->{'spacy_sentencizer'};
166delete $models->{'spacy_dep'};
167delete $models->{'spacy_stat'};
168
Akron43cc5c92022-03-02 14:25:30 +0100169
170
171my $t0 = Benchmark->new;
172my $cmp = timethese($iter => $models);
173
174print "\n----------------------------------\n";
175
176foreach my $tool (sort keys %$cmp) {
177 my $seconds_per_run = $cmp->{$tool}->[0] / $cmp->{$tool}->[5];
178 my $tokens_per_msecond = ($effi_wc / $seconds_per_run) / 1000;
179 print $tool, "\t", $seconds_per_run, "\t", $tokens_per_msecond, "\t", sprintf("%.2f", $tokens_per_msecond), "\n";
180};
181
182print "\n----------------------------------\n";
183
184cmpthese($cmp);
185
186print "Benchmarking took: ", timestr(timediff(Benchmark->new, $t0)), "\n";
187