blob: 163b85b5344bf659924b3f4e0c964d8d9f1b3c29 [file] [log] [blame]
Akronaa229a22020-02-18 13:44:25 +01001#!/usr/bin/env perl
2use strict;
3use warnings;
4use Dumbbench;
5use File::Basename 'dirname';
6use File::Spec::Functions qw/catfile rel2abs/;
Akron2d547bc2020-07-04 10:34:35 +02007use File::Temp 'tempfile';
Peter Harders994aff72020-07-25 09:53:35 +02008use Encode qw!decode!;
Akronaa229a22020-02-18 13:44:25 +01009use FindBin;
10use Getopt::Long;
11
12BEGIN {
13 unshift @INC, "$FindBin::Bin/../lib";
14};
15
Peter Harders42e18a62020-07-21 02:43:26 +020016use Test::KorAP::XML::TEI qw!korap_tempfile!;
Akron95bc98a2020-07-11 12:00:12 +020017use KorAP::XML::TEI 'remove_xml_comments';
Akrond9627472020-07-09 16:53:09 +020018use KorAP::XML::TEI::Tokenizer::Aggressive;
19use KorAP::XML::TEI::Tokenizer::Conservative;
Akron4f67cd42020-07-02 12:27:58 +020020
# Command line switches; both are off unless requested.
my ($columns, $no_header) = (0, 0);

GetOptions(
  'columns|c'   => \$columns,
  'no-header|n' => \$no_header,

  # Print the usage summary and leave without benchmarking
  'help|h' => sub {
    print
      "--columns|-c Print instances in columns\n",
      "--no-header|-n Dismiss benchmark names\n",
      "--help|-h Print this page\n\n";
    exit(0);
  }
);

# Name of the script that is benchmarked
our $SCRIPT_NAME = 'tei2korapxml';

# Directory this benchmark file is located in
my $f = dirname(__FILE__);

# Absolute path to the script, resolved relative to this file
my $script = rel2abs(catfile($f, qw/.. script/, $SCRIPT_NAME));

# Absolute path to the example file that is piped into the script
my $file = rel2abs(catfile($f, qw/.. t data goe_sample.i5.xml/));

# The benchmark object all instances are registered with
my $bench = Dumbbench->new(verbosity => 0);

# Receives the return values of the benchmarked calls
my $result;

# Data for delHTMLcom-long:
# The temporary file is handed to remove_xml_comments() together with
# a string that opens a comment, so these lines are presumably read as
# the continuation of that comment (verify against remove_xml_comments).
my ($fh, $filename) = korap_tempfile('benchmark');

print {$fh} <<'HTML';
mehrzeiliger
Kommentar
 --><!-- Versuch
-->ist <!-- a --><!-- b --> ein Test
HTML

# Rewind after writing: the handle is positioned at the end of the file
# at this point, so without the seek the very first benchmarked read
# would start at EOF. A seek is also required on some systems whenever
# a handle switches between writing and reading.
seek($fh, 0, 0)
  or die "Unable to rewind benchmark data: $!";

# Data for Tokenization
# Test data
my $t_dataf = catfile(dirname(__FILE__), '..', 't', 'data', 'wikipedia.txt');

# Read the file as raw bytes using a lexical handle and the
# three-argument form of open, so the path can never be
# interpreted as part of the open mode.
open(my $t_data_fh, '<:raw', $t_dataf)
  or die "Unable to load $t_dataf: $!";

# Slurp the whole file at once
my $t_data = do { local $/; <$t_data_fh> };
close($t_data_fh);

# Keep the data a defined string, even if nothing could be read
$t_data = '' unless defined $t_data;

# Decoded copy of the same data, so the tokenizers can be
# benchmarked on raw bytes as well as on a character string
my $t_data_utf_8 = decode('utf-8', $t_data);

# Tokenizer objects shared by the tokenizer benchmarks
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;

# Create a benchmark instance with the given name, that resets
# the given tokenizer and tokenizes the given text on each run.
my $tokenizer_instance = sub {
  my ($name, $tokenizer, $text) = @_;

  return Dumbbench::Instance::PerlSub->new(
    name => $name,
    code => sub {
      $result = $tokenizer->reset->tokenize($text);
      $result = 0;
    }
  );
};

# Add benchmark instances
$bench->add_instances(

  # Run the whole script on the example file in a subshell
  Dumbbench::Instance::PerlSub->new(
    name => 'SimpleConversion',
    code => sub {
      qx{cat '$file' | perl '$script' -ti > /dev/null 2>&1}
    }
  ),

  # Comment that is opened and closed in the passed string
  Dumbbench::Instance::PerlSub->new(
    name => 'delHTMLcom',
    code => sub {
      for (1..100_000) {
        $result = remove_xml_comments(
          \*STDIN, "This <!-- comment --> is a test $_"
        );
      };
    }
  ),

  # Comment that is opened in the passed string only;
  # the temporary file is rewound after each call
  Dumbbench::Instance::PerlSub->new(
    name => 'delHTMLcom-long',
    code => sub {
      for (1..10_000) {
        $result = remove_xml_comments($fh, "This <!--$_");
        seek($fh, 0, 0);
      };
    }
  ),

  # Both tokenizers, on the raw and on the decoded data
  $tokenizer_instance->('Tokenizer-conservative', $cons_tok, $t_data),
  $tokenizer_instance->('Tokenizer-conservative-utf-8', $cons_tok, $t_data_utf_8),
  $tokenizer_instance->('Tokenizer-aggressive', $aggr_tok, $t_data),
  $tokenizer_instance->('Tokenizer-aggressive-utf-8', $aggr_tok, $t_data_utf_8)
);

# Run benchmarks
$bench->run;

# Clean up
close($fh);

# All instances, in the order they were added
my @finished = $bench->instances;

# Output in a single row: one tab separated line of names
# (unless dismissed), followed by one line of raw numbers
if ($columns) {
  print join("\t", map { $_->name } @finished), "\n" unless $no_header;
  print join("\t", map { $_->result->raw_number } @finished), "\n";
  exit(0);
};

# Output simple timings for comparison, one instance per line
foreach my $inst (@finished) {
  print $inst->name, ': ' unless $no_header;
  print $inst->result->raw_number, "\n";
};

exit(0);

__END__