blob: 71dbe542d05790faf485a85b27282565a3d7beba [file] [log] [blame]
Akronaa229a22020-02-18 13:44:25 +01001#!/usr/bin/env perl
2use strict;
3use warnings;
4use Dumbbench;
5use File::Basename 'dirname';
6use File::Spec::Functions qw/catfile rel2abs/;
Akron2d547bc2020-07-04 10:34:35 +02007use File::Temp 'tempfile';
Peter Harders994aff72020-07-25 09:53:35 +02008use Encode qw!decode!;
Akronaa229a22020-02-18 13:44:25 +01009use FindBin;
10use Getopt::Long;
11
12BEGIN {
13 unshift @INC, "$FindBin::Bin/../lib";
14};
15
Peter Harders42e18a62020-07-21 02:43:26 +020016use Test::KorAP::XML::TEI qw!korap_tempfile!;
Akron95bc98a2020-07-11 12:00:12 +020017use KorAP::XML::TEI 'remove_xml_comments';
Akrond9627472020-07-09 16:53:09 +020018use KorAP::XML::TEI::Tokenizer::Aggressive;
19use KorAP::XML::TEI::Tokenizer::Conservative;
Akron4f67cd42020-07-02 12:27:58 +020020
# Command line switches; both stay disabled unless passed explicitly.
my ($columns, $no_header) = (0, 0);

# Handler for --help: prints the list of supported switches
# and terminates the script successfully.
my $show_help = sub {
  print
    "--columns|-c Print instances in columns\n",
    "--no-header|-n Dismiss benchmark names\n",
    "--help|-h Print this page\n\n";
  exit(0);
};

GetOptions(
  'columns|c'   => \$columns,
  'no-header|n' => \$no_header,
  'help|h'      => $show_help
);
33
# Name of the converter script exercised by the conversion benchmarks
our $SCRIPT_NAME = 'tei2korapxml';

# Directory of this benchmark file; all other paths are built
# relative to it.
my $f = dirname(__FILE__);

# Absolute path of the converter script
my $script = rel2abs(catfile($f, '..', 'script', $SCRIPT_NAME));

# Absolute paths of the example files
my @data_dir   = ($f, '..', 't', 'data');
my $file       = rel2abs(catfile(@data_dir, 'goe_sample.i5.xml'));
my $goe_tagged = rel2abs(catfile(@data_dir, 'goe_sample_tagged.i5.xml'));
Akronaa229a22020-02-18 13:44:25 +010042
# Create a new benchmark object
my $bench = Dumbbench->new(
  verbosity => 0
);

# Receives the return value of the benchmarked calls
my $result;

# Data for delHTMLcom-long:
# korap_tempfile() yields a handle and a file name
# (presumably a read/write temporary file - verify against
# Test::KorAP::XML::TEI). The handle is filled with the remainder
# of a multi-line comment and passed to remove_xml_comments()
# by the 'delHTMLcom-long' benchmark.
my ($fh, $filename) = korap_tempfile('benchmark');

print $fh <<'HTML';
mehrzeiliger
Kommentar
 --><!-- Versuch
-->ist <!-- a --><!-- b --> ein Test
HTML

# Rewind the handle after writing. The benchmark only seeks back
# AFTER each call, so without this the very first call would start
# at the end of the data just written.
seek($fh, 0, 0)
  or die "Unable to rewind $filename: $!";
59
# Data for Tokenization
# Test data
my $t_dataf = catfile(dirname(__FILE__), '..', 't', 'data', 'wikipedia.txt');

# Read the whole file as raw bytes. A lexical handle and the
# three-argument open keep the file name from being interpreted
# as part of the open mode; the system error is reported on failure.
my $t_data = do {
  open(my $in, '<', $t_dataf)
    or die "Unable to load $t_dataf: $!";
  binmode($in);

  # Undefined record separator: a single read returns everything
  local $/;
  my $content = <$in>;
  close($in);

  defined $content ? $content : '';
};

# Decoded (character string) variant of the same data, used by
# the *-utf-8 benchmarks
my $t_data_utf_8 = decode('utf-8', $t_data);

# Tokenizer objects shared by all tokenizer benchmarks
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
Akron510a88c2020-07-07 10:16:50 +020079
Akron4f67cd42020-07-02 12:27:58 +020080
# Add benchmark instances

# Wraps a benchmark name and the code to be timed
# into a Dumbbench instance.
my $instance = sub {
  my ($name, $code) = @_;
  return Dumbbench::Instance::PerlSub->new(
    name => $name,
    code => $code
  );
};

$bench->add_instances(

  # Full conversion runs of the script in a child process
  $instance->(
    'SimpleConversion' => sub {
      `cat '$file' | perl '$script' -ti > /dev/null 2>&1`
    }
  ),
  $instance->(
    'Conversion-with-inline-annotations' => sub {
      `cat '$goe_tagged' | KORAPXMLTEI_INLINE=1 perl '$script' > /dev/null 2>&1`
    }
  ),

  # Comment removal with a comment closed in the passed string
  $instance->(
    'delHTMLcom' => sub {
      $result = remove_xml_comments(
        \*STDIN,
        "This <!-- comment --> is a test " . $_
      ) for 1..100_000;
    }
  ),

  # Comment removal with a comment left open in the passed string;
  # the temporary file handle is rewound after every call
  $instance->(
    'delHTMLcom-long' => sub {
      foreach (1..10_000) {
        $result = remove_xml_comments($fh, "This <!--" . $_);
        seek($fh, 0, 0);
      }
    }
  ),

  # Tokenization of the raw and the decoded test data
  $instance->(
    'Tokenizer-conservative' => sub {
      $result = $cons_tok->reset->tokenize($t_data);
      $result = 0;
    }
  ),
  $instance->(
    'Tokenizer-conservative-utf-8' => sub {
      $result = $cons_tok->reset->tokenize($t_data_utf_8);
      $result = 0;
    }
  ),
  $instance->(
    'Tokenizer-aggressive' => sub {
      $result = $aggr_tok->reset->tokenize($t_data);
      $result = 0;
    }
  ),
  $instance->(
    'Tokenizer-aggressive-utf-8' => sub {
      $result = $aggr_tok->reset->tokenize($t_data_utf_8);
      $result = 0;
    }
  )
);
147
# Run benchmarks
$bench->run;

# Clean up
close($fh);

# All instances in the order they were added
my @instances = $bench->instances;

# Output in a single row: tab separated names (unless suppressed)
# followed by the tab separated raw timings
if ($columns) {
  my @rows;
  push @rows, [ map { $_->name } @instances ] unless $no_header;
  push @rows, [ map { $_->result->raw_number } @instances ];

  print join("\t", @$_), "\n" for @rows;
  exit(0);
}

# Output simple timings for comparison, one instance per line
for my $inst (@instances) {
  my $label = $no_header ? '' : $inst->name . ': ';
  print $label, $inst->result->raw_number, "\n";
}

exit(0);
172
173__END__