#!/usr/bin/env perl
#
# Benchmark driver for a collection of tokenizers and sentence splitters.
#
# Every tool is run through the shell on the same corpus file and timed with
# the Benchmark module. Afterwards a per-tool table (seconds per run, words
# per millisecond) and Benchmark's comparison chart are printed.
#
# Usage: <script> [batch_size [iterations]]
#
#   batch_size  How many copies of the base corpus are concatenated into the
#               benchmarked file (positive integer, default: the base corpus
#               itself).
#   iterations  Passed to Benchmark's timethese() as the count (default: 1).
#
# NOTE(review): all relative paths (./corpus, ./Datok, ...) are resolved
# against the current working directory; the JTok entry assumes that
# directory is /euralex - confirm against the deployment.
use strict;
use warnings;
use Benchmark qw!:hireswallclock :all!;
use Data::Dumper;
use POSIX 'round';

our @ARGV;

my $FILE = 'effi-1x-utf8.txt';

# Unpack the base corpus (-k keeps the archive, -f overwrites an older copy).
# A gzip failure is only fatal if no readable corpus file is left behind.
if (system('gzip -dkf ./corpus/' . $FILE . '.gz') != 0) {
  warn "gzip could not decompress ./corpus/$FILE.gz (wait status $?)\n";
}
die "Corpus file ./corpus/$FILE is missing or unreadable\n"
  unless -r './corpus/' . $FILE;

my $iter = 1;

# First parameter: batch size.
if ($ARGV[0]) {
  die "Batch size must be a positive integer, got '$ARGV[0]'\n"
    unless $ARGV[0] =~ /\A[0-9]+\z/ && $ARGV[0] > 0;

  my $number = $ARGV[0] + 0;

  # A batch size of 1 is the base corpus itself; concatenating would make
  # cat read from and append to the very same file.
  if ($number > 1) {
    my $out = 'effi-' . $number . 'x-utf8.txt';

    for my $copy (1 .. $number) {
      # The first copy truncates the target, so a file left over from an
      # earlier run with the same batch size does not keep growing.
      my $redirect = $copy == 1 ? ' > ' : ' >> ';
      system('cat ./corpus/' . $FILE . $redirect . './corpus/' . $out) == 0
        or die "Unable to build ./corpus/$out (wait status $?)\n";
    }

    $FILE = $out;
  }
}

# Second parameter: number of iterations.
if ($ARGV[1]) {
  $iter = $ARGV[1] + 0;
}

# Word count of the benchmarked file as reported by wc -w; used below to
# turn run times into words per millisecond.
my $wc_output = `wc -w ./corpus/$FILE`;
my ($effi_wc) = defined $wc_output ? $wc_output =~ /^\s*(\d+)/ : ();
die "Unable to determine the word count of ./corpus/$FILE\n"
  unless defined $effi_wc;

# Tool name => code reference running the tool once on the corpus file.
# NOTE: the source deliberately has no "use utf8", so the "§" marker in the
# deep-eos commands is handed to the shell as the raw bytes of this file.
my $models = {
  'wc' => sub {
    system 'wc -w ./corpus/'.$FILE.' > /dev/null';
  },
  'SoMaJo' => sub {
    system 'somajo-tokenizer ./corpus/'.$FILE.' --split_sentences > /dev/null';
  },
  'SoMaJo_p2' => sub {
    system 'somajo-tokenizer ./corpus/'.$FILE.' --parallel=2 --split_sentences > /dev/null';
  },
  'SoMaJo_p4' => sub {
    system 'somajo-tokenizer ./corpus/'.$FILE.' --parallel=4 --split_sentences > /dev/null';
  },
  'SoMaJo_p8' => sub {
    system 'somajo-tokenizer ./corpus/'.$FILE.' --parallel=8 --split_sentences > /dev/null';
  },
  'Datok_matok' => sub {
    system 'cat ./corpus/'.$FILE.' | ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.matok - > /dev/null';
  },
  'Datok_datok' => sub {
    system 'cat ./corpus/'.$FILE.' | ./Datok/datok tokenize -t ./Datok/testdata/tokenizer.datok - > /dev/null';
  },
  'OpenNLP_Simple' => sub {
    system 'cat ./corpus/'.$FILE.' | ./opennlp/bin/opennlp SimpleTokenizer > /dev/null';
  },
  'OpenNLP_Tokenizer_de-ud-gsd' => sub {
    system 'cat ./corpus/'.$FILE.' | ./opennlp/bin/opennlp TokenizerME ./opennlp/models/opennlp-de-ud-gsd-tokens-1.0-1.9.3.bin > /dev/null';
  },
  'OpenNLP_Sentence_de-ud-gsd' => sub {
    system 'cat ./corpus/'.$FILE.' | ./opennlp/bin/opennlp SentenceDetector ./opennlp/models/opennlp-de-ud-gsd-sentence-1.0-1.9.3.bin > /dev/null';
  },
  'TreeTagger' => sub {
    system 'cat ./corpus/'.$FILE.' | perl ./treetagger/cmd/utf8-tokenize.perl -a ./treetagger/lib/german-abbreviations > /dev/null';
  },
  'deep-eos_bi-lstm-de' => sub {
    system 'python3 ./deep-eos/main.py --input-file ./corpus/'.$FILE.' --model-filename ./deep-eos/bi-lstm-de.model --vocab-filename ./deep-eos/bi-lstm-de.vocab --eos-marker "§" tag > /dev/null';
  },
  'deep-eos_cnn-de' => sub {
    system 'python3 ./deep-eos/main.py --input-file ./corpus/'.$FILE.' --model-filename ./deep-eos/cnn-de.model --vocab-filename ./deep-eos/cnn-de.vocab --eos-marker "§" tag > /dev/null';
  },
  'deep-eos_lstm-de' => sub {
    system 'python3 ./deep-eos/main.py --input-file ./corpus/'.$FILE.' --model-filename ./deep-eos/lstm-de.model --vocab-filename ./deep-eos/lstm-de.vocab --eos-marker "§" tag > /dev/null';
  },
  'JTok' => sub {
    # The tokenize script is invoked from inside JTok's bin directory, so the
    # corpus path is given relative to that directory.
    unless (chdir '/euralex/JTok/bin') {
      warn "[JTok] cannot chdir to /euralex/JTok/bin: $!\n";
      return;
    }
    system 'sh tokenize ../../corpus/'.$FILE.' de > /dev/null';
    # Every other entry uses paths relative to the start directory, so
    # carrying on from the wrong place would corrupt all later timings.
    chdir '/euralex'
      or die "[JTok] cannot chdir back to /euralex: $!\n";
  },
  'KorAP-Tokenizer' => sub {
    system 'cat ./corpus/'.$FILE.' | java -jar ./KorAP-Tokenizer/KorAP-Tokenizer.jar -l de -s > /dev/null';
  },
  'Syntok_tokenizer' => sub {
    system 'python3 -m syntok.tokenizer ./corpus/'.$FILE.' > /dev/null';
  },
  'Syntok_segmenter' => sub {
    system 'python3 -m syntok.segmenter ./corpus/'.$FILE.' > /dev/null';
  },
  'Waste' => sub {
    system 'cat ./corpus/'.$FILE.' | waste -N -v0 --rcfile=./Waste/waste.rc > /dev/null';
  },
  'nnsplit' => sub {
    system './nnsplit/nnsplit_bench ./corpus/'.$FILE.' > /dev/null';
  },
  'elephant' => sub {
    system './elephant-wrapper/bin/tokenize.sh -i ./corpus/'.$FILE.' UD_German > /dev/null';
  },
  'cutter' => sub {
    system 'python3 ./cutter/cutter.py nosent ./corpus/'.$FILE.' > /dev/null';
  },
  'blingfire_tok' => sub {
    system 'python3 ./blingfire/blingfire_tok.py ./corpus/'.$FILE.' > /dev/null';
  },
  'blingfire_sent' => sub {
    system 'python3 ./blingfire/blingfire_sent.py ./corpus/'.$FILE.' > /dev/null';
  },
  'spacy_tok' => sub {
    system 'python3 ./spacy/spacy_tok.py ./corpus/'.$FILE.' > /dev/null';
  },
  'spacy_dep' => sub {
    system 'python3 ./spacy/spacy_sent.py dep ./corpus/'.$FILE.' > /dev/null';
  },
  'spacy_stat' => sub {
    system 'python3 ./spacy/spacy_sent.py stat ./corpus/'.$FILE.' > /dev/null';
  },
  'spacy_sentencizer' => sub {
    system 'python3 ./spacy/spacy_sent.py sentencizer ./corpus/'.$FILE.' > /dev/null';
  },
  # NOTE(review): unlike the entries above, the Stanford commands do not
  # redirect their output to /dev/null - confirm this is intended.
  'Stanford' => sub {
    system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
      '-props german -annotators tokenize,ssplit,mwt -tokenize.language=german -file ./corpus/' . $FILE;
  },
  'Stanford_t2' => sub {
    system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
      '-props german -annotators tokenize,ssplit,mwt -tokenize.language=german -threads=2 -file ./corpus/' . $FILE;
  },
  'Stanford_t4' => sub {
    system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
      '-props german -annotators tokenize,ssplit,mwt -tokenize.language=german -threads=4 -file ./corpus/' . $FILE;
  },
  'Stanford_t8' => sub {
    system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
      '-props german -annotators tokenize,ssplit,mwt -tokenize.language=german -threads=8 -file ./corpus/' . $FILE;
  },
  'Stanford_tokonly' => sub {
    system 'CLASSPATH=/euralex/stanford-corenlp-4.4.0/* java edu.stanford.nlp.pipeline.StanfordCoreNLP ' .
      '-props german -annotators tokenize -tokenize.language=german -file ./corpus/' . $FILE;
  },
};

# Uncomment a line to exclude the corresponding tool from the run.
#delete $models->{'SoMaJo'};
#delete $models->{'SoMaJo_p2'};
#delete $models->{'SoMaJo_p4'};
#delete $models->{'SoMaJo_p8'};
#delete $models->{'Datok_matok'};
#delete $models->{'Datok_datok'};
#delete $models->{'OpenNLP_Simple'};
#delete $models->{'OpenNLP_Tokenizer_de-ud-gsd'};
#delete $models->{'OpenNLP_Sentence_de-ud-gsd'};
#delete $models->{'TreeTagger'};
#delete $models->{'deep-eos_bi-lstm-de'};
#delete $models->{'deep-eos_cnn-de'};
#delete $models->{'deep-eos_lstm-de'};
#delete $models->{'JTok'};
#delete $models->{'KorAP-Tokenizer'};
#delete $models->{'Syntok_tokenizer'};
#delete $models->{'Syntok_segmenter'};
#delete $models->{'Waste'};
#delete $models->{'nnsplit'};
#delete $models->{'elephant'};
#delete $models->{'Stanford'};
#delete $models->{'Stanford_t2'};
#delete $models->{'Stanford_t4'};
#delete $models->{'Stanford_t8'};
#delete $models->{'Stanford_tokonly'};
#delete $models->{'cutter'};
#delete $models->{'spacy_tok'};
#delete $models->{'spacy_sentencizer'};
#delete $models->{'spacy_dep'};
#delete $models->{'spacy_stat'};
#delete $models->{'blingfire_tok'};
#delete $models->{'blingfire_sent'};

# Wrap every remaining entry so that a tool which is missing or exits with an
# error is reported: such a run returns almost immediately and would
# otherwise show up as an implausibly fast result.
foreach my $tool (keys %$models) {
  my $run = $models->{$tool};
  $models->{$tool} = sub {
    $? = 0;
    $run->();
    warn "[$tool] command failed (wait status $?), its timing is not meaningful\n"
      if $? != 0;
  };
}

my $t0 = Benchmark->new;
my $cmp = timethese($iter => $models);

print "\n----------------------------------\n";

# One line per tool: name, wallclock seconds per run, words per millisecond
# (raw and rounded to two decimals).
foreach my $tool (sort keys %$cmp) {
  my $real  = $cmp->{$tool}->real;
  my $iters = $cmp->{$tool}->iters;

  # Without a positive run count and duration no rate can be computed.
  unless ($iters && $real > 0) {
    print $tool, "\t", 'n/a', "\t", 'n/a', "\t", 'n/a', "\n";
    next;
  }

  my $seconds_per_run = $real / $iters;
  my $tokens_per_msecond = ($effi_wc / $seconds_per_run) / 1000;
  print $tool, "\t", $seconds_per_run, "\t", $tokens_per_msecond, "\t", sprintf("%.2f", $tokens_per_msecond), "\n";
}

print "\n----------------------------------\n";

cmpthese($cmp);

print "Benchmarking took: ", timestr(timediff(Benchmark->new, $t0)), "\n";
| 195 | |