blob: f618717127ef0d4ee5bc6cbf2ea95a8363f62693 [file] [log] [blame]
Akronaa229a22020-02-18 13:44:25 +01001#!/usr/bin/env perl
2use strict;
3use warnings;
4use Dumbbench;
5use File::Basename 'dirname';
6use File::Spec::Functions qw/catfile rel2abs/;
Akron2d547bc2020-07-04 10:34:35 +02007use File::Temp 'tempfile';
Peter Harders994aff72020-07-25 09:53:35 +02008use Encode qw!decode!;
Akronaa229a22020-02-18 13:44:25 +01009use FindBin;
10use Getopt::Long;
11
# Put the distribution's own lib directory (relative to this script's
# location) at the front of the module search path at compile time.
BEGIN {
  my $lib_dir = "$FindBin::Bin/../lib";
  unshift(@INC, $lib_dir);
};
15
Peter Harders42e18a62020-07-21 02:43:26 +020016use Test::KorAP::XML::TEI qw!korap_tempfile!;
Akron95bc98a2020-07-11 12:00:12 +020017use KorAP::XML::TEI 'remove_xml_comments';
Akrond9627472020-07-09 16:53:09 +020018use KorAP::XML::TEI::Tokenizer::Aggressive;
19use KorAP::XML::TEI::Tokenizer::Conservative;
Akrona10ad592020-08-03 11:20:23 +020020use KorAP::XML::TEI::Data;
Akron4f67cd42020-07-02 12:27:58 +020021
# Command line flags:
#   $columns   - set by --columns|-c, selects the tab separated one-row output
#   $no_header - set by --no-header|-n, suppresses the benchmark names
my $columns = 0;
my $no_header = 0;

# Option summary, shared by --help and by the error path below
my $usage =
  "--columns|-c Print instances in columns\n" .
  "--no-header|-n Dismiss benchmark names\n" .
  "--help|-h Print this page\n\n";

# GetOptions returns false when it meets an unknown or malformed option.
# Stop in that case instead of silently running the whole benchmark
# with a command line that was not understood.
GetOptions(
  'columns|c' => \$columns,
  'no-header|n' => \$no_header,
  'help|h' => sub {
    print $usage;
    exit(0);
  }
) or do {
  print STDERR $usage;
  exit(1);
};
34
# Name of the converter script that the conversion benchmarks invoke
our $SCRIPT_NAME = 'tei2korapxml';

# Every path below is resolved relative to the directory of this file
my $f = dirname(__FILE__);
my $script = rel2abs(catfile($f, '..', 'script', $SCRIPT_NAME));

# Load example files
my ($file, $goe_tagged) = map {
  rel2abs(catfile($f, '..', 't', 'data', $_))
} ('goe_sample.i5.xml', 'goe_sample_tagged.i5.xml');
Akronaa229a22020-02-18 13:44:25 +010043
# Benchmark runner, created with verbosity set to 0
my $bench = Dumbbench->new(verbosity => 0);

# Shared slot the benchmark subs assign their return values to
my $result;
Akron510a88c2020-07-07 10:16:50 +020050
# Fixture for the 'delHTMLcom-long' benchmark: a temporary file whose
# handle is passed to remove_xml_comments() together with a line that
# opens a comment. NOTE(review): presumably the function reads the
# following lines from the handle until the comment is closed - confirm
# against KorAP::XML::TEI.
my ($fh, $filename) = korap_tempfile('benchmark');

print {$fh} <<'HTML';
mehrzeiliger
Kommentar
 --><!-- Versuch
-->ist <!-- a --><!-- b --> ein Test
HTML
60
# Data for Tokenization
# Test data: the content of t/data/wikipedia.txt is read without any
# decoding into $t_data. Dies if the file cannot be opened.
my $t_dataf = catfile(dirname(__FILE__), '..', 't', 'data', 'wikipedia.txt');
my $t_data = '';
{
  # Three-argument open with a lexical handle: the file name can never be
  # interpreted as part of the mode, and no package-global handle leaks.
  open(my $in, '<', $t_dataf)
    or die "Unable to load $t_dataf: $!";
  binmode($in);

  # Slurp the whole file at once; $/ is only undefined inside this block
  local $/;
  my $content = <$in>;
  $t_data = $content if defined $content;

  close($in);
};
75
# Variants of the test data: decoded from UTF-8 into a character string,
# and split on whitespace into a list of chunks
my $t_data_utf_8 = decode('utf-8', $t_data);
my @t_data_split = split ' ', $t_data;

# One instance of each tokenizer, reused by all tokenizer benchmarks
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new();
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new();

# Data object used by the 'Data-Collect with serialization' benchmark
my $data = KorAP::XML::TEI::Data->new();
Akron4f67cd42020-07-02 12:27:58 +020083
# Add benchmark instances
# Every instance is a named code reference that Dumbbench times.
# Return values of the measured calls are stored in the shared $result.
$bench->add_instances(

  # Run the converter script on the plain sample file via the shell
  Dumbbench::Instance::PerlSub->new(
    name => 'SimpleConversion',
    code => sub {
      `cat '$file' | perl '$script' -ti > /dev/null 2>&1`
    }
  ),

  # Run the converter script on the tagged sample file with
  # KORAPXMLTEI_INLINE=1 set in its environment
  Dumbbench::Instance::PerlSub->new(
    name => 'Conversion-with-inline-annotations',
    code => sub {
      `cat '$goe_tagged' | KORAPXMLTEI_INLINE=1 perl '$script' > /dev/null 2>&1`
    }
  ),

  # Comment removal on a single line that contains a complete comment
  Dumbbench::Instance::PerlSub->new(
    name => 'delHTMLcom',
    code => sub {
      for (1..100_000) {
        $result = remove_xml_comments(
          \*STDIN,
          "This <!-- comment --> is a test " . $_
        );
      };
    }
  ),

  # Comment removal on a line that only opens a comment, with the
  # temporary fixture file passed as the handle
  Dumbbench::Instance::PerlSub->new(
    name => 'delHTMLcom-long',
    code => sub {
      for (1..10_000) {

        # Rewind BEFORE each call. Rewinding only afterwards left the
        # very first call reading from the end-of-file position that the
        # fixture write had left behind.
        seek($fh, 0, 0);
        $result = remove_xml_comments(
          $fh,
          "This <!--" . $_
        );
      };
    }
  ),

  # Conservative tokenizer on the undecoded test data
  Dumbbench::Instance::PerlSub->new(
    name => 'Tokenizer-conservative',
    code => sub {
      $result = $cons_tok->reset->tokenize($t_data);
      $result = 0;
    }
  ),

  # Conservative tokenizer on the UTF-8 decoded test data
  Dumbbench::Instance::PerlSub->new(
    name => 'Tokenizer-conservative-utf-8',
    code => sub {
      $result = $cons_tok->reset->tokenize($t_data_utf_8);
      $result = 0;
    }
  ),

  # Aggressive tokenizer on the undecoded test data
  Dumbbench::Instance::PerlSub->new(
    name => 'Tokenizer-aggressive',
    code => sub {
      $result = $aggr_tok->reset->tokenize($t_data);
      $result = 0;
    }
  ),

  # Aggressive tokenizer on the UTF-8 decoded test data
  Dumbbench::Instance::PerlSub->new(
    name => 'Tokenizer-aggressive-utf-8',
    code => sub {
      $result = $aggr_tok->reset->tokenize($t_data_utf_8);
      $result = 0;
    }
  ),

  # Append every chunk of the split test data, then serialize
  Dumbbench::Instance::PerlSub->new(
    name => 'Data-Collect with serialization',
    code => sub {

      # Reset once per run. As a statement modifier, 'foreach' repeated
      # the whole 'reset->append' chain, so every chunk wiped the one
      # before it and only the last chunk was ever serialized.
      $data->reset;
      $data->append($_) foreach @t_data_split;
      $result = $data->to_string;
    }
  )
);
157
# Run benchmarks
$bench->run;

# Clean up
close($fh);

if ($columns) {

  # Output in a single row: an optional tab separated header line with
  # the instance names, then one tab separated line with the raw numbers
  print join("\t", map { $_->name } $bench->instances), "\n"
    unless $no_header;
  print join("\t", map { $_->result->raw_number } $bench->instances), "\n";
}
else {

  # Output simple timings for comparison: one line per instance,
  # prefixed with the instance name unless headers are dismissed
  for my $inst ($bench->instances) {
    print $inst->name, ': ' unless $no_header;
    print $inst->result->raw_number, "\n";
  };
};

exit(0);
182
183__END__