blob: 34074513aeb3c11b8bfd31808be187265ec7f18f [file] [log] [blame]
Akronaa229a22020-02-18 13:44:25 +01001#!/usr/bin/env perl
2use strict;
3use warnings;
4use Dumbbench;
5use File::Basename 'dirname';
6use File::Spec::Functions qw/catfile rel2abs/;
Akron2d547bc2020-07-04 10:34:35 +02007use File::Temp 'tempfile';
Peter Harders994aff72020-07-25 09:53:35 +02008use Encode qw!decode!;
Akronaa229a22020-02-18 13:44:25 +01009use FindBin;
10use Getopt::Long;
11
# Put the lib/ directory next to this script's parent directory at the
# front of @INC, so the KorAP::XML::TEI modules loaded below are looked
# up there first.
BEGIN {
  unshift(@INC, $FindBin::Bin . '/../lib');
}
15
Akron95bc98a2020-07-11 12:00:12 +020016use KorAP::XML::TEI 'remove_xml_comments';
Akrond9627472020-07-09 16:53:09 +020017use KorAP::XML::TEI::Tokenizer::Aggressive;
18use KorAP::XML::TEI::Tokenizer::Conservative;
Akron4f67cd42020-07-02 12:27:58 +020019
# Command line switches
my $columns   = 0;  # --columns:   print all results in one tab separated row
my $no_header = 0;  # --no-header: leave out the benchmark names

# Usage text, printed for --help and after an option parsing error
my @usage = (
  "--columns|-c Print instances in columns\n",
  "--no-header|-n Dismiss benchmark names\n",
  "--help|-h Print this page\n\n",
);

# GetOptions() returns false when it met an unknown or malformed option.
# Stop right away in that case instead of running the whole benchmark
# with settings the caller did not ask for.
GetOptions(
  'columns|c'   => \$columns,
  'no-header|n' => \$no_header,
  'help|h'      => sub {
    print @usage;
    exit(0);
  }
) or do {
  print STDERR @usage;
  exit(1);
};
32
# Name of the converter script that is benchmarked
our $SCRIPT_NAME = 'tei2korapxml';

# All paths are resolved relative to the directory of this file
my $bench_dir = dirname(__FILE__);

# Absolute path of the converter script
my $script = rel2abs(
  catfile($bench_dir, '..', 'script', $SCRIPT_NAME)
);

# Absolute path of the example file fed to the converter
my $file = rel2abs(
  catfile($bench_dir, '..', 't', 'data', 'goe_sample.i5.xml')
);

# The benchmark object collecting all instances
my $bench = Dumbbench->new(verbosity => 0);

# Shared variable the benchmarked subs assign their return values to
my $result;
Akron510a88c2020-07-07 10:16:50 +020047
# Data for delHTMLcom-long
#
# When the file name is requested as well, File::Temp keeps the
# temporary file on disk by default. UNLINK => 1 has it removed when
# the program exits, so benchmark runs do not leave files behind.
my ($fh, $filename) = tempfile(UNLINK => 1);

print {$fh} <<'HTML';
mehrzeiliger
Kommentar
 --><!-- Versuch
-->ist <!-- a --><!-- b --> ein Test
HTML

# Flush the written data and move back to the start of the file, so
# that the very first read from the handle already sees the content
# instead of starting at the end of the file.
seek($fh, 0, 0) or die "Unable to rewind $filename: $!";
57
# Data for Tokenization
# Test data
my $t_dataf = catfile(dirname(__FILE__), '..', 't', 'data', 'wikipedia.txt');

# Read the whole file as raw octets. The three argument form of open()
# keeps the file name from being interpreted as part of the mode, and
# the lexical handle is not shared with any other code.
open(my $t_fh, '<', $t_dataf)
  or die "Unable to load $t_dataf: $!";
binmode($t_fh);
my $t_data = do { local $/; <$t_fh> };
$t_data = '' unless defined $t_data;
close($t_fh);

# The same text decoded from UTF-8 into a character string
my $t_data_utf_8 = decode('utf-8',$t_data);

# Tokenizer objects reused by the tokenizer benchmarks
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
Akron510a88c2020-07-07 10:16:50 +020077
Akron4f67cd42020-07-02 12:27:58 +020078
# Tokenizer benchmarks: each tokenizer is run once on the raw octets and
# once on the decoded text. The texts are passed by reference, so the
# subs hand the very same scalars to tokenize().
my @tokenizer_instances = map {
  my ($name, $tokenizer, $text_ref) = @$_;
  Dumbbench::Instance::PerlSub->new(
    name => $name,
    code => sub {
      $result = $tokenizer->reset->tokenize($$text_ref);
      $result = 0;
    }
  );
} (
  ['Tokenizer-conservative',       $cons_tok, \$t_data],
  ['Tokenizer-conservative-utf-8', $cons_tok, \$t_data_utf_8],
  ['Tokenizer-aggressive',         $aggr_tok, \$t_data],
  ['Tokenizer-aggressive-utf-8',   $aggr_tok, \$t_data_utf_8],
);

# Add benchmark instances
$bench->add_instances(

  # Run the converter script on the example file in a separate process
  Dumbbench::Instance::PerlSub->new(
    name => 'SimpleConversion',
    code => sub {
      qx{cat '$file' | perl '$script' -ti > /dev/null 2>&1}
    }
  ),

  # Comment removal where the comment is closed on the passed line
  Dumbbench::Instance::PerlSub->new(
    name => 'delHTMLcom',
    code => sub {
      for my $i (1 .. 100_000) {
        $result = remove_xml_comments(
          \*STDIN,
          "This <!-- comment --> is a test " . $i
        );
      }
    }
  ),

  # Comment removal where the passed line leaves the comment open and
  # the temporary file handle is passed along; the handle is rewound
  # after every call
  Dumbbench::Instance::PerlSub->new(
    name => 'delHTMLcom-long',
    code => sub {
      for my $i (1 .. 10_000) {
        $result = remove_xml_comments(
          $fh,
          "This <!--" . $i
        );
        seek($fh, 0, 0);
      }
    }
  ),

  @tokenizer_instances
);
139
# Run benchmarks
$bench->run;

# Clean up
close($fh);

# All instances in the order they were added
my @instances = $bench->instances;

if ($columns) {

  # Output in a single row: an optional row of names, then the timings
  print join("\t", map { $_->name } @instances), "\n"
    unless $no_header;
  print join("\t", map { $_->result->raw_number } @instances), "\n";
}
else {

  # Output simple timings for comparison, one instance per line
  for my $instance (@instances) {
    my $label = $no_header ? '' : $instance->name . ': ';
    print $label, $instance->result->raw_number, "\n";
  }
}

exit(0);
164
165__END__