blob: b27acd6ee7e52de7e078e41b525bdcb20cf49400 [file] [log] [blame]
Akronaa229a22020-02-18 13:44:25 +01001#!/usr/bin/env perl
2use strict;
3use warnings;
4use Dumbbench;
5use File::Basename 'dirname';
6use File::Spec::Functions qw/catfile rel2abs/;
Akron2d547bc2020-07-04 10:34:35 +02007use File::Temp 'tempfile';
Akronaa229a22020-02-18 13:44:25 +01008use FindBin;
9use Getopt::Long;
10
# Put the lib directory next to this script's parent at the front of the
# module search path, before the modules below are loaded.
BEGIN {
  unshift(@INC, join('/', $FindBin::Bin, '..', 'lib'));
}
14
Akron4f67cd42020-07-02 12:27:58 +020015use KorAP::XML::TEI;
Akrond9627472020-07-09 16:53:09 +020016use KorAP::XML::TEI::Tokenizer::Aggressive;
17use KorAP::XML::TEI::Tokenizer::Conservative;
Akron4f67cd42020-07-02 12:27:58 +020018
# Command line switches
my $columns   = 0;  # Print all timings in a single tab separated row
my $no_header = 0;  # Leave out the benchmark names

# GetOptions returns false when it meets an unknown or malformed option.
# Abort in that case instead of benchmarking with a misread command line.
GetOptions(
  'columns|c'   => \$columns,
  'no-header|n' => \$no_header,
  'help|h'      => sub {
    print "--columns|-c Print instances in columns\n";
    print "--no-header|-n Dismiss benchmark names\n";
    print "--help|-h Print this page\n\n";
    exit(0);
  }
) or die "Invalid command line arguments, see --help\n";
31
our $SCRIPT_NAME = 'tei2korapxml';

# Turn path segments given relative to the parent of this file's
# directory into an absolute path
my $bench_dir = dirname(__FILE__);
my $from_parent = sub {
  return rel2abs(catfile($bench_dir, '..', @_));
};

# Script that is run by the conversion benchmark
my $script = $from_parent->('script', $SCRIPT_NAME);

# Load example file
my $file = $from_parent->('t', 'data', 'goe_sample.i5.xml');
39
# Benchmark runner that collects all instances registered further down
my %bench_options = (verbosity => 0);
my $bench = Dumbbench->new(%bench_options);

# Sink for the values returned by the benchmarked calls
my $result;
Akron510a88c2020-07-07 10:16:50 +020046
# Data for delHTMLcom-long: the continuation lines of a multi-line comment,
# stored in a temporary file so the benchmark can read them from a handle.
# In list context tempfile() keeps the file on disk by default; UNLINK asks
# File::Temp to remove it when the program exits.
my ($fh, $filename) = tempfile(UNLINK => 1);

print {$fh} <<'HTML' or die "Unable to write to $filename: $!";
mehrzeiliger
Kommentar
 --><!-- Versuch
-->ist <!-- a --><!-- b --> ein Test
HTML

# Writing leaves the handle positioned after the data. Rewind (which also
# flushes the buffer) so that the very first read starts at the first line.
seek($fh, 0, 0) or die "Unable to rewind $filename: $!";
56
# Data for Tokenization
# Test data
my $t_dataf = catfile(dirname(__FILE__), '..', 't', 'data', 'wikipedia.txt');

# Three-argument open with a lexical handle: the mode can not be influenced
# by the file name and the handle does not live in a package global.
open(my $t_fh, '<', $t_dataf)
  or die "Unable to load $t_dataf: $!";

# Read the whole file at once; an empty file yields an empty string
my $t_data = do { local $/; <$t_fh> };
$t_data = '' unless defined $t_data;
close($t_fh);

# One tokenizer object per strategy, reused by the benchmark runs
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
Akron510a88c2020-07-07 10:16:50 +020073
Akron4f67cd42020-07-02 12:27:58 +020074
# Add benchmark instances

# Wrap a named code reference in a PerlSub benchmark instance
my $instance = sub {
  my ($name, $code) = @_;
  return Dumbbench::Instance::PerlSub->new(
    name => $name,
    code => $code
  );
};

my @benchmarks = (

  # Pipe the example file through the conversion script in a shell
  $instance->('SimpleConversion', sub {
    qx{cat '$file' | perl '$script' > /dev/null 2>&1}
  }),

  # Comment that opens and closes inside the passed string
  $instance->('delHTMLcom', sub {
    for my $round (1 .. 100_000) {
      $result = KorAP::XML::TEI::delHTMLcom(
        \*STDIN,
        "This <!-- comment --> is a test " . $round
      );
    }
  }),

  # Comment that is opened but not closed inside the passed string;
  # presumably the rest is read from the temporary file - confirm
  # against delHTMLcom
  $instance->('delHTMLcom-long', sub {
    for my $round (1 .. 10_000) {
      $result = KorAP::XML::TEI::delHTMLcom(
        $fh,
        "This <!--" . $round
      );

      # Rewind so the next call reads the file from its first line
      seek($fh, 0, 0);
    }
  }),

  # Tokenize the test data with the conservative tokenizer
  $instance->('Tokenizer-conservative', sub {
    $result = $cons_tok->reset->tokenize($t_data, 0);
    $result = 0;
  }),

  # Tokenize the test data with the aggressive tokenizer
  $instance->('Tokenizer-aggressive', sub {
    $result = $aggr_tok->reset->tokenize($t_data, 0);
    $result = 0;
  }),
);

$bench->add_instances(@benchmarks);
121
# Run benchmarks
$bench->run();

# Clean up
close $fh;

my @instances = $bench->instances;

# Output in a single row: tab separated timings, preceded by a row of
# benchmark names unless the header is switched off
if ($columns) {
  print join("\t", map { $_->name } @instances), "\n" unless $no_header;
  print join("\t", map { $_->result->raw_number } @instances), "\n";
  exit(0);
}

# Output simple timings for comparison, one benchmark per line
for my $instance (@instances) {
  print $instance->name, ': ' unless $no_header;
  print $instance->result->raw_number, "\n";
}

exit(0);
146
147__END__