blob: f618717127ef0d4ee5bc6cbf2ea95a8363f62693 [file] [log] [blame]
#!/usr/bin/env perl
use strict;
use warnings;
use Dumbbench;
use File::Basename 'dirname';
use File::Spec::Functions qw/catfile rel2abs/;
use File::Temp 'tempfile';
use Encode qw!decode!;
use FindBin;
use Getopt::Long;
BEGIN {
unshift @INC, "$FindBin::Bin/../lib";
};
use Test::KorAP::XML::TEI qw!korap_tempfile!;
use KorAP::XML::TEI 'remove_xml_comments';
use KorAP::XML::TEI::Tokenizer::Aggressive;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Data;
# Command line switches; both stay off unless given on the command line.
my ($columns, $no_header) = (0, 0);

# The return value of GetOptions is not inspected here.
GetOptions(
  # --help: print the option summary and leave before any benchmark runs
  'help|h' => sub {
    print
      "--columns|-c Print instances in columns\n",
      "--no-header|-n Dismiss benchmark names\n",
      "--help|-h Print this page\n\n";
    exit(0);
  },

  # Boolean flags, stored in the lexicals declared above
  'no-header|n' => \$no_header,
  'columns|c'   => \$columns,
);
# Name of the converter script executed by the conversion benchmarks
our $SCRIPT_NAME = 'tei2korapxml';

# Directory of this benchmark file; every path below is resolved
# against its parent directory
my $f = dirname(__FILE__);

# Turn path segments below the parent directory into an absolute path
my $abs_path = sub { rel2abs(catfile($f, '..', @_)) };

my $script = $abs_path->('script', $SCRIPT_NAME);

# Load example files
my $file       = $abs_path->('t', 'data', 'goe_sample.i5.xml');
my $goe_tagged = $abs_path->('t', 'data', 'goe_sample_tagged.i5.xml');
# Benchmark runner; verbosity 0 is handed to the Dumbbench constructor
my $bench = Dumbbench->new(verbosity => 0);

# Shared sink the benchmark closures assign their return values to
my $result;
# Data for delHTMLcom-long: a temporary file holding the continuation of
# a comment that the benchmark opens in the string it passes along with
# this handle.
# NOTE(review): korap_tempfile presumably returns an open read/write
# handle plus its file name, like File::Temp's tempfile - confirm.
my ($fh, $filename) = korap_tempfile('benchmark');

print $fh <<'HTML';
mehrzeiliger
Kommentar
--><!-- Versuch
-->ist <!-- a --><!-- b --> ein Test
HTML

# Rewind after writing. The benchmark only seeks AFTER each read, so
# without this the very first read would start at end-of-file, right
# behind the data just printed.
seek($fh, 0, 0)
  or die "Unable to rewind benchmark tempfile: $!";
# Data for Tokenization
# Test data
my $t_dataf = catfile(dirname(__FILE__), '..', 't', 'data', 'wikipedia.txt');

# Slurp the raw bytes of the test file. The three-argument open keeps
# the file name from ever being parsed as part of the mode, and the
# lexical handle does not leak a global FH into the script.
my $t_data = do {
  open(my $t_fh, '<', $t_dataf)
    or die "Unable to load $t_dataf: $!";
  binmode($t_fh);

  # Undefined record separator: read the whole file in one go
  local $/;
  my $content = <$t_fh>;
  close($t_fh);

  # Fall back to the empty string if nothing could be read
  defined $content ? $content : '';
};

# The same text decoded from UTF-8
my $t_data_utf_8 = decode('utf-8', $t_data);

# Whitespace-separated pieces of the raw text
my @t_data_split = split(' ', $t_data);

# Objects reused by the tokenizer and data benchmarks
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $data = KorAP::XML::TEI::Data->new;
# Benchmark definitions as [ name, code ] pairs, in registration order
my @benchmarks = (

  # Pipe the plain sample through the converter script
  [ 'SimpleConversion' => sub {
      `cat '$file' | perl '$script' -ti > /dev/null 2>&1`
    } ],

  # Same pipeline on the tagged sample with KORAPXMLTEI_INLINE=1 set
  [ 'Conversion-with-inline-annotations' => sub {
      `cat '$goe_tagged' | KORAPXMLTEI_INLINE=1 perl '$script' > /dev/null 2>&1`
    } ],

  # Comment removal on a string whose comment is closed in the string
  [ 'delHTMLcom' => sub {
      for (1..100_000) {
        $result = remove_xml_comments(
          \*STDIN,
          "This <!-- comment --> is a test " . $_
        );
      };
    } ],

  # Comment removal on a string that ends inside an open comment;
  # the handle is rewound after every call
  [ 'delHTMLcom-long' => sub {
      for (1..10_000) {
        $result = remove_xml_comments(
          $fh,
          "This <!--" . $_
        );
        seek($fh, 0, 0);
      };
    } ],

  # Tokenizers on the raw and on the decoded text; the result is
  # overwritten with 0 right after each call
  [ 'Tokenizer-conservative' => sub {
      $result = $cons_tok->reset->tokenize($t_data);
      $result = 0;
    } ],
  [ 'Tokenizer-conservative-utf-8' => sub {
      $result = $cons_tok->reset->tokenize($t_data_utf_8);
      $result = 0;
    } ],
  [ 'Tokenizer-aggressive' => sub {
      $result = $aggr_tok->reset->tokenize($t_data);
      $result = 0;
    } ],
  [ 'Tokenizer-aggressive-utf-8' => sub {
      $result = $aggr_tok->reset->tokenize($t_data_utf_8);
      $result = 0;
    } ],

  # Append every piece of the split text, then serialize.
  # NOTE(review): reset is called once per piece; if reset empties the
  # object, only the last piece is serialized - confirm this is intended.
  [ 'Data-Collect with serialization' => sub {
      foreach my $piece (@t_data_split) {
        $data->reset->append($piece);
      }
      $result = $data->to_string;
    } ],
);

# Add benchmark instances
$bench->add_instances(
  map {
    my ($name, $code) = @$_;
    Dumbbench::Instance::PerlSub->new(name => $name, code => $code);
  } @benchmarks
);
# Run benchmarks
$bench->run;

# Clean up
close($fh);

if ($columns) {
  # Column mode: a tab-separated row of names (unless suppressed),
  # followed by a tab-separated row of timings
  print join("\t", map { $_->name } $bench->instances), "\n"
    unless $no_header;
  print join("\t", map { $_->result->raw_number } $bench->instances), "\n";
}
else {
  # Default mode: one line per instance, for simple comparison
  foreach my $inst ($bench->instances) {
    print $inst->name, ': ' unless $no_header;
    print $inst->result->raw_number, "\n";
  }
}

exit(0);
__END__