blob: 7caa1dc7997e5d0eed0f35d8cd2395404c2c5ba6 [file] [log] [blame]
Akronaa229a22020-02-18 13:44:25 +01001#!/usr/bin/env perl
2use strict;
3use warnings;
4use Dumbbench;
5use File::Basename 'dirname';
6use File::Spec::Functions qw/catfile rel2abs/;
Akron2d547bc2020-07-04 10:34:35 +02007use File::Temp 'tempfile';
Peter Harders994aff72020-07-25 09:53:35 +02008use Encode qw!decode!;
Akronaa229a22020-02-18 13:44:25 +01009use FindBin;
10use Getopt::Long;
11
12BEGIN {
13 unshift @INC, "$FindBin::Bin/../lib";
14};
15
Peter Harders42e18a62020-07-21 02:43:26 +020016use Test::KorAP::XML::TEI qw!korap_tempfile!;
Akron95bc98a2020-07-11 12:00:12 +020017use KorAP::XML::TEI 'remove_xml_comments';
Akrond9627472020-07-09 16:53:09 +020018use KorAP::XML::TEI::Tokenizer::Aggressive;
19use KorAP::XML::TEI::Tokenizer::Conservative;
Akrona10ad592020-08-03 11:20:23 +020020use KorAP::XML::TEI::Data;
Akron4f67cd42020-07-02 12:27:58 +020021
# Command line options:
#   $columns   - when true, all results are printed tab-separated in one row
#   $no_header - when true, the benchmark names are left out of the output
my $columns = 0;
my $no_header = 0;

# GetOptions returns false when it rejects the command line (it has already
# warned on STDERR by then). Abort in that case instead of silently running
# the whole benchmark suite with settings the caller did not ask for.
GetOptions(
  'columns|c' => \$columns,
  'no-header|n' => \$no_header,
  'help|h' => sub {
    print "--columns|-c Print instances in columns\n";
    print "--no-header|-n Dismiss benchmark names\n";
    print "--help|-h Print this page\n\n";
    exit(0);
  }
) or die "Invalid command line options, see --help\n";
34
# Name of the converter script exercised by the conversion benchmarks
our $SCRIPT_NAME = 'tei2korapxml';

# Directory of this benchmark file; every path below is built relative to it
my $f = dirname(__FILE__);

# Absolute path of the converter in the sibling script directory
my $script = rel2abs(catfile($f, '..', 'script', $SCRIPT_NAME));

# Load example files: sample corpora below t/data, as absolute paths
my ($file, $goe_tagged, $icc_german) = map {
  rel2abs(catfile($f, '..', 't', 'data', $_))
} qw(
  goe_sample.i5.xml
  goe_sample_tagged.i5.xml
  icc_german_sample.p5.xml
);
Akronaa229a22020-02-18 13:44:25 +010044
# Set up the benchmark runner that collects all instances (verbosity set to 0)
my %bench_options = (verbosity => 0);
my $bench = Dumbbench->new(%bench_options);

# Sink for the values produced inside the benchmark closures
my $result;
Akron510a88c2020-07-07 10:16:50 +020051
# Data for delHTMLcom-long: a temporary file holding the remainder of a
# multi-line comment, followed by further comments and plain text
my ($fh, $filename) = korap_tempfile('benchmark');

print $fh <<'HTML';
mehrzeiliger
Kommentar
 --><!-- Versuch
-->ist <!-- a --><!-- b --> ein Test
HTML

# Rewind before the handle is read for the first time. The benchmark only
# seeks back *after* each read, so without this the very first read would
# start at the end of the freshly written data. A seek is also what perl
# requires when switching from writing to reading on the same handle.
seek($fh, 0, 0) or die "Unable to rewind $filename: $!";
61
# Data for Tokenization
# Test data: t/data/wikipedia.txt next to this directory, read as raw bytes
my $t_dataf = catfile(dirname(__FILE__), '..', 't', 'data', 'wikipedia.txt');

# Three-argument open with a lexical handle, so the file name can never be
# taken for a mode and no global bareword handle is left behind; the OS
# error is included in the message to make failures diagnosable
open(my $t_fh, '<:raw', $t_dataf)
  or die "Unable to load $t_dataf: $!";

# Slurp the whole file in one read instead of concatenating line by line
my $t_data = do { local $/; <$t_fh> };
close($t_fh);

# A failed read yields undef; keep the documented "empty string" default
$t_data = '' unless defined $t_data;

# The same text once decoded from UTF-8 and once split on whitespace
my $t_data_utf_8 = decode('utf-8',$t_data);
my @t_data_split = split(' ', $t_data);
Peter Harders994aff72020-07-25 09:53:35 +020079
# One conservative and one aggressive tokenizer, constructed in that order
my ($cons_tok, $aggr_tok) = map { scalar $_->new } (
  'KorAP::XML::TEI::Tokenizer::Conservative',
  'KorAP::XML::TEI::Tokenizer::Aggressive'
);

# Data object used by the collect-and-serialize benchmark
my $data = 'KorAP::XML::TEI::Data'->new;
Akron4f67cd42020-07-02 12:27:58 +020084
# Add benchmark instances
$bench->add_instances(

  # Conversion of the plain sample through the script, as a shell pipeline
  Dumbbench::Instance::PerlSub->new(
    name => 'SimpleConversion',
    code => sub {
      `cat '$file' | perl '$script' -ti > /dev/null 2>&1`
    }
  ),

  # Conversion of the tagged sample with KORAPXMLTEI_INLINE=1 in the
  # environment of the script
  Dumbbench::Instance::PerlSub->new(
    name => 'Conversion-with-inline-annotations',
    code => sub {
      `cat '$goe_tagged' | KORAPXMLTEI_INLINE=1 perl '$script' > /dev/null 2>&1`
    }
  ),

  # Conversion of the ICC German sample
  Dumbbench::Instance::PerlSub->new(
    name => 'Conversion of standard TEI',
    code => sub {
      `cat '$icc_german' | perl '$script' > /dev/null 2>&1`
    }
  ),

  # Comment removal where the whole comment is contained in the string
  Dumbbench::Instance::PerlSub->new(
    name => 'delHTMLcom',
    code => sub {
      for (1..100_000) {
        $result = remove_xml_comments(
          \*STDIN,
          "This <!-- comment --> is a test " . $_
        );
      };
    }
  ),

  # Comment removal where the string ends inside an open comment and the
  # temporary file handle is passed along; the handle is rewound after
  # every call so each iteration sees the same data
  Dumbbench::Instance::PerlSub->new(
    name => 'delHTMLcom-long',
    code => sub {
      for (1..10_000) {
        $result = remove_xml_comments(
          $fh,
          "This <!--" . $_
        );
        seek($fh, 0, 0);
      };
    }
  ),

  # Tokenizer runs on the raw bytes and on the decoded text; $result is
  # overwritten with 0 right after each run
  Dumbbench::Instance::PerlSub->new(
    name => 'Tokenizer-conservative',
    code => sub {
      $result = $cons_tok->reset->tokenize($t_data);
      $result = 0;
    }
  ),
  Dumbbench::Instance::PerlSub->new(
    name => 'Tokenizer-conservative-utf-8',
    code => sub {
      $result = $cons_tok->reset->tokenize($t_data_utf_8);
      $result = 0;
    }
  ),
  Dumbbench::Instance::PerlSub->new(
    name => 'Tokenizer-aggressive',
    code => sub {
      $result = $aggr_tok->reset->tokenize($t_data);
      $result = 0;
    }
  ),
  Dumbbench::Instance::PerlSub->new(
    name => 'Tokenizer-aggressive-utf-8',
    code => sub {
      $result = $aggr_tok->reset->tokenize($t_data_utf_8);
      $result = 0;
    }
  ),

  # Collect all whitespace-separated tokens, then serialize them
  Dumbbench::Instance::PerlSub->new(
    name => 'Data-Collect with serialization',
    code => sub {
      # Reset once per run, then append every token. The former
      # reset-per-token chain would (assuming reset clears the collected
      # data - TODO confirm) leave only the last token to serialize.
      $data->reset;
      $data->append($_) foreach @t_data_split;
      $result = $data->to_string;
    }
  )
);
164
# Execute all registered benchmark instances
$bench->run();

# The temporary comment data is not needed beyond this point
close $fh;
170
# Report the raw timing of every instance, then leave with status 0
my @instances = $bench->instances;

if ($columns) {

  # Single-row layout: an optional row of names, then one row of timings,
  # each joined by tabs
  my @rows;
  push @rows, [ map { $_->name } @instances ] if !$no_header;
  push @rows, [ map { $_->result->raw_number } @instances ];
  print join("\t", @$_), "\n" for @rows;
}
else {

  # One line per instance for simple comparison: "name: timing", or just
  # the timing when headers are dismissed
  for my $instance (@instances) {
    my $label = $no_header ? '' : $instance->name . ': ';
    print $label, $instance->result->raw_number, "\n";
  }
}

exit(0);
189
190__END__