blob: 85effd8d62e1401860cc32a051f2d95735e442c1 [file] [log] [blame]
Akronaa229a22020-02-18 13:44:25 +01001#!/usr/bin/env perl
2use strict;
3use warnings;
4use Dumbbench;
5use File::Basename 'dirname';
6use File::Spec::Functions qw/catfile rel2abs/;
Akron2d547bc2020-07-04 10:34:35 +02007use File::Temp 'tempfile';
Akronaa229a22020-02-18 13:44:25 +01008use FindBin;
9use Getopt::Long;
10
11BEGIN {
12 unshift @INC, "$FindBin::Bin/../lib";
13};
14
Akron4f67cd42020-07-02 12:27:58 +020015use KorAP::XML::TEI;
Akron510a88c2020-07-07 10:16:50 +020016use KorAP::XML::TEI::Tokenization;
Akron4f67cd42020-07-02 12:27:58 +020017
# Command line flags, both off by default.
#   $columns   - print all results in one tab-separated row
#   $no_header - leave out the benchmark names
my $columns = 0;
my $no_header = 0;

# Usage text, shared by --help and by the error path below.
my $usage = "--columns|-c Print instances in columns\n"
  . "--no-header|-n Dismiss benchmark names\n"
  . "--help|-h Print this page\n\n";

# GetOptions returns false when the command line contains an unknown or
# malformed option (it has already warned on STDERR at that point).
# Stop right there instead of running the whole benchmark with flags
# the caller did not intend.
GetOptions(
  'columns|c' => \$columns,
  'no-header|n' => \$no_header,
  'help|h' => sub {
    print $usage;
    exit(0);
  }
) or do {
  print STDERR $usage;
  exit(1);
};
30
# Name of the converter script that gets benchmarked
our $SCRIPT_NAME = 'tei2korapxml';

# Directory of this benchmark file; the paths below are resolved
# relative to it.
my $f = dirname(__FILE__);

# Absolute paths of the converter script and of the example file
# that is fed to it.
my ($script, $file) = map { rel2abs(catfile($f, '..', @$_)) } (
  ['script', $SCRIPT_NAME],
  ['t', 'data', 'goe_sample.i5.xml'],
);
38
# Benchmark runner, constructed with verbosity 0; the timings are
# printed by this script itself once all instances have run.
my $bench = Dumbbench->new(verbosity => 0);
43
# Receives the return value of the benchmarked library calls
my $result;

# Data for delHTMLcom-long
# The lines below are written to a temporary file whose handle is later
# passed to delHTMLcom. In list context tempfile() does not remove the
# file on its own, so UNLINK => 1 is requested to have File::Temp delete
# it when the program exits instead of leaving one stray file per run.
my ($fh, $filename) = tempfile(UNLINK => 1);

print {$fh} <<'HTML';
mehrzeiliger
Kommentar
 --><!-- Versuch
-->ist <!-- a --><!-- b --> ein Test
HTML
55
# Data for Tokenization
# Path of the text file that both tokenizer benchmarks run over
my $t_dataf = catfile(dirname(__FILE__), '..', 't', 'data', 'wikipedia.txt');

# Read the whole file into $t_data.
# A lexical handle with three-argument open keeps the handle scoped to
# this block and prevents the file name from being parsed as part of
# the open mode; $! is included so a failure says why it failed.
my $t_data = do {
  open(my $in, '<', $t_dataf)
    or die "Unable to load $t_dataf: $!";

  # Slurp mode for this block only
  local $/;
  my $content = <$in>;
  close($in);

  # Fall back to the empty string if nothing could be read
  defined $content ? $content : '';
};
69
Akron4f67cd42020-07-02 12:27:58 +020070
# Add benchmark instances
$bench->add_instances(

  # Pipe the example file through the converter script in a child
  # process; all output of the pipeline is discarded.
  Dumbbench::Instance::PerlSub->new(
    name => 'SimpleConversion',
    code => sub {
      `cat '$file' | perl '$script' > /dev/null 2>&1`
    }
  ),

  # 100,000 calls of delHTMLcom on a single line that contains a
  # complete comment. STDIN is only handed over as the file handle
  # argument.
  # NOTE(review): presumably the handle is never read for this input -
  # confirm against delHTMLcom.
  Dumbbench::Instance::PerlSub->new(
    name => 'delHTMLcom',
    code => sub {
      for my $i (1..100_000) {
        $result = KorAP::XML::TEI::delHTMLcom(
          \*STDIN,
          "This <!-- comment --> is a test " . $i
        );
      }
    }
  ),

  # 10,000 calls of delHTMLcom on a line that ends inside an opened
  # comment, with the temporary fixture file as the file handle.
  # NOTE(review): presumably delHTMLcom continues reading from the
  # handle in this case - confirm against its implementation.
  Dumbbench::Instance::PerlSub->new(
    name => 'delHTMLcom-long',
    code => sub {
      for my $i (1..10_000) {

        # Rewind BEFORE the call: after the fixture has been written the
        # handle is positioned at the end of the file, so rewinding only
        # after the call would let the first iteration start at
        # end-of-file. The number of seeks per run stays the same.
        seek($fh, 0, 0);

        $result = KorAP::XML::TEI::delHTMLcom(
          $fh,
          "This <!--" . $i
        );
      }
    }
  ),

  # One pass of each tokenizer over the loaded text. $result is reset
  # afterwards so the returned value is not kept around.
  Dumbbench::Instance::PerlSub->new(
    name => 'Tokenization-conservative',
    code => sub {
      $result = KorAP::XML::TEI::Tokenization::conservative($t_data, 0);
      $result = 0;
    }
  ),
  Dumbbench::Instance::PerlSub->new(
    name => 'Tokenization-aggressive',
    code => sub {
      $result = KorAP::XML::TEI::Tokenization::aggressive($t_data, 0);
      $result = 0;
    }
  ),
);
117
# Run all registered benchmarks
$bench->run();

# The fixture handle is no longer needed once the benchmarks are done
close $fh;

my @instances = $bench->instances;

if ($columns) {

  # Column mode: one tab-separated row with the benchmark names
  # (unless suppressed), followed by one row with the raw timings.
  print join("\t", map { $_->name } @instances), "\n" unless $no_header;
  print join("\t", map { $_->result->raw_number } @instances), "\n";
}
else {

  # Default mode: one line per benchmark for simple comparison of
  # timings, prefixed with the benchmark name unless suppressed.
  for my $instance (@instances) {
    my $label = $no_header ? '' : $instance->name . ': ';
    print $label, $instance->result->raw_number, "\n";
  }
}

exit(0);
142
143__END__