#!/usr/bin/env perl
use strict;
use warnings;
use Benchmark qw!:hireswallclock :all!;
use Data::Dumper;
use POSIX 'round';

my $FILE = 'effi-1x-utf8.txt';
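
# Decompress the corpus (-k keeps the .gz archive, -f forces overwrite)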
system 'gzip -dkf ./corpus/' . $FILE . '.gz';

# Number of benchmark iterations per tool
my $iter = 1;

# Reference token count of the corpus, as reported by wc -w
# (tolerate the leading padding that some wc implementations emit)
my $effi_wc = `wc -w ./corpus/$FILE`;
$effi_wc =~ s/^\s*(\d+)\s.*$/$1/;

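# Each entry maps a tool name to a closure that runs the tool over the
# decompressed corpus with output discarded, so only runtime is measured.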
my $models = {
  'wc' => sub {
    system 'wc -w ./corpus/'.$FILE.' > /dev/null';
  },
  'SoMaJo' => sub {
    system 'somajo-tokenizer ./corpus/'.$FILE.' --split_sentences > /dev/null';
  },
  'SoMaJo_p2' => sub {
    system 'somajo-tokenizer ./corpus/'.$FILE.' --parallel=2 --split_sentences > /dev/null';
  },
  'Datok_matok' => sub {
    system 'cat ./corpus/'.$FILE.' | ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.matok - > /dev/null';
  },
  'Datok_datok' => sub {
    system 'cat ./corpus/'.$FILE.' | ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.datok - > /dev/null';
  },
  'OpenNLP_Simple' => sub {
    system 'cat ./corpus/'.$FILE.' | ./opennlp/bin/opennlp SimpleTokenizer > /dev/null';
  },
  'OpenNLP_Tokenizer_de-ud-gsd' => sub {
    system 'cat ./corpus/'.$FILE.' | ./opennlp/bin/opennlp TokenizerME ./opennlp/models/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin > /dev/null';
  },
  'OpenNLP_Sentence_de-ud-gsd' => sub {
    system 'cat ./corpus/'.$FILE.' | ./opennlp/bin/opennlp SentenceDetector ./opennlp/models/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin > /dev/null';
  },
  'TreeTagger' => sub {
    system 'cat ./corpus/'.$FILE.' | perl ./treetagger/cmd/utf8-tokenize.perl -a ./treetagger/lib/german-abbreviations > /dev/null';
  },
  'deep-eos_bi-lstm-de' => sub {
    system 'python3 ./deep-eos/main.py --input-file ./corpus/'.$FILE.' --model-filename ./deep-eos/bi-lstm-de.model --vocab-filename ./deep-eos/bi-lstm-de.vocab --eos-marker "§" tag > /dev/null';
  },
  'deep-eos_cnn-de' => sub {
    system 'python3 ./deep-eos/main.py --input-file ./corpus/'.$FILE.' --model-filename ./deep-eos/cnn-de.model --vocab-filename ./deep-eos/cnn-de.vocab --eos-marker "§" tag > /dev/null';
  },
  'deep-eos_lstm-de' => sub {
    system 'python3 ./deep-eos/main.py --input-file ./corpus/'.$FILE.' --model-filename ./deep-eos/lstm-de.model --vocab-filename ./deep-eos/lstm-de.vocab --eos-marker "§" tag > /dev/null';
  },
  'JTok' => sub {
    chdir '/euralex/JTok/bin';
    system 'sh tokenize ../../corpus/'.$FILE.' de > /dev/null';
    chdir '/euralex';
  },
  'KorAP-Tokenizer' => sub {
    system 'cat ./corpus/'.$FILE.' | java -jar ./KorAP-Tokenizer/KorAP-Tokenizer.jar -l de -s > /dev/null';
  },
  Syntok_tokenizer => sub {
    system 'python3 -m syntok.tokenizer ./corpus/'.$FILE.' > /dev/null';
  },
  Syntok_segmenter => sub {
    system 'python3 -m syntok.segmenter ./corpus/'.$FILE.' > /dev/null';
  },
  Waste => sub {
    system 'cat ./corpus/'.$FILE.' | waste -N -v0 --rcfile=./Waste/waste.rc > /dev/null';
  },
  nnsplit => sub {
    system './nnsplit/nnsplit_bench ./corpus/'.$FILE.' > /dev/null';
  },
  elephant => sub {
    system './elephant-wrapper/bin/tokenize.sh -i ./corpus/'.$FILE.' UD_German > /dev/null';
  },
  SpaCy => sub {
    system 'python3 ./spacy/spacy_tok.py ./corpus/'.$FILE.' > /dev/null';
  },
  Stanford => sub {
    system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
      '-props german -annotators tokenize,ssplit,mwt -tokenize.language=german -file ./corpus/' . $FILE;
  },
  Stanford_t4 => sub {
    system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
      '-props german -annotators tokenize,ssplit,mwt -tokenize.language=german -threads=4 -file ./corpus/' . $FILE;
  }
};

# Uncomment individual lines to exclude a tool from this run:
#delete $models->{'SoMaJo'};
#delete $models->{'SoMaJo_p2'};
#delete $models->{'Datok_matok'};
#delete $models->{'Datok_datok'};
#delete $models->{'OpenNLP_Simple'};
#delete $models->{'OpenNLP_Tokenizer_de-ud-gsd'};
#delete $models->{'OpenNLP_Sentence_de-ud-gsd'};
#delete $models->{'TreeTagger'};
#delete $models->{'deep-eos_bi-lstm-de'};
#delete $models->{'deep-eos_cnn-de'};
#delete $models->{'deep-eos_lstm-de'};
#delete $models->{'JTok'};
#delete $models->{'KorAP-Tokenizer'};
#delete $models->{'Syntok_tokenizer'};
#delete $models->{'Syntok_segmenter'};
#delete $models->{'Waste'};
#delete $models->{'nnsplit'};
#delete $models->{'elephant'};
#delete $models->{'SpaCy'};
#delete $models->{'Stanford'};
#delete $models->{'Stanford_t4'};


my $t0 = Benchmark->new;
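
# Run each tool $iter times; timethese() returns a hash reference
# mapping tool names to Benchmark result objects.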
my $cmp = timethese($iter => $models);

print "\n----------------------------------\n";

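# A Benchmark object is an array reference of the form
# [real, user, system, children_user, children_system, iters];
# with :hireswallclock, slot 0 holds hi-res wallclock time in seconds.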
foreach my $tool (sort keys %$cmp) {
  my $seconds_per_run = $cmp->{$tool}->[0] / $cmp->{$tool}->[5];
  my $tokens_per_msecond = ($effi_wc / $seconds_per_run) / 1000;
  print $tool, "\t", $seconds_per_run, "\t", $tokens_per_msecond, "\t", sprintf("%.2f", $tokens_per_msecond), "\n";
}
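
# Example with illustrative numbers: 200,000 tokens processed in 4 s
# yields (200000 / 4) / 1000 = 50 tokens per millisecond.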

print "\n----------------------------------\n";

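# Print Benchmark's comparison chart of rates and relative speed differences.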
cmpthese($cmp);

print "Benchmarking took: ", timestr(timediff(Benchmark->new, $t0)), "\n";
131