| #!/usr/bin/env perl |
| use strict; |
| use warnings; |
| use Dumbbench; |
| use File::Basename 'dirname'; |
| use File::Spec::Functions qw/catfile rel2abs/; |
| use File::Temp 'tempfile'; |
| use Encode qw!decode!; |
| use FindBin; |
| use Getopt::Long; |
| |
| BEGIN { |
| unshift @INC, "$FindBin::Bin/../lib"; |
| }; |
| |
| use Test::KorAP::XML::TEI qw!korap_tempfile!; |
| use KorAP::XML::TEI 'remove_xml_comments'; |
| use KorAP::XML::TEI::Tokenizer::Aggressive; |
| use KorAP::XML::TEI::Tokenizer::Conservative; |
| use KorAP::XML::TEI::Data; |
| |
| my $columns = 0; |
| my $no_header = 0; |
| GetOptions( |
| 'columns|c' => \$columns, |
| 'no-header|n' => \$no_header, |
| 'help|h' => sub { |
| print "--columns|-c Print instances in columns\n"; |
| print "--no-header|-n Dismiss benchmark names\n"; |
| print "--help|-h Print this page\n\n"; |
| exit(0); |
| } |
| ); |
| |
| our $SCRIPT_NAME = 'tei2korapxml'; |
| |
| my $f = dirname(__FILE__); |
| my $script = rel2abs(catfile($f, '..', 'script', $SCRIPT_NAME)); |
| |
| # Load example files |
| my $file = rel2abs(catfile($f, '..', 't', 'data', 'goe_sample.i5.xml')); |
| my $goe_tagged = rel2abs(catfile($f, '..', 't', 'data', 'goe_sample_tagged.i5.xml')); |
| |
| # Create a new benchmark object |
| my $bench = Dumbbench->new( |
| verbosity => 0 |
| ); |
| |
| my $result; |
| |
| # Data for delHTMLcom-long |
| my ($fh, $filename) = korap_tempfile('benchmark'); |
| |
| print $fh <<'HTML'; |
| mehrzeiliger |
| Kommentar |
| --><!-- Versuch |
| -->ist <!-- a --><!-- b --> ein Test |
| HTML |
| |
| # Data for Tokenization |
| # Test data |
| my $t_dataf = catfile(dirname(__FILE__), '..', 't', 'data', 'wikipedia.txt'); |
| my $t_data = ''; |
| if ((open(FH, '<' . $t_dataf))) { |
| binmode(FH); |
| while (!eof(FH)) { |
| $t_data .= <FH> |
| }; |
| close(FH); |
| } |
| else { |
| die "Unable to load $t_dataf"; |
| }; |
| |
| my $t_data_utf_8 = decode('utf-8',$t_data); |
| my @t_data_split = split(' ', $t_data); |
| |
| my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new; |
| my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new; |
| |
| my $data = KorAP::XML::TEI::Data->new; |
| |
| # Add benchmark instances |
| $bench->add_instances( |
| Dumbbench::Instance::PerlSub->new( |
| name => 'SimpleConversion', |
| code => sub { |
| `cat '$file' | perl '$script' -ti > /dev/null 2>&1` |
| } |
| ), |
| Dumbbench::Instance::PerlSub->new( |
| name => 'Conversion-with-inline-annotations', |
| code => sub { |
| `cat '$goe_tagged' | KORAPXMLTEI_INLINE=1 perl '$script' > /dev/null 2>&1` |
| } |
| ), |
| Dumbbench::Instance::PerlSub->new( |
| name => 'delHTMLcom', |
| code => sub { |
| for (1..100_000) { |
| $result = remove_xml_comments( |
| \*STDIN, |
| "This <!-- comment --> is a test " . $_ |
| ); |
| }; |
| } |
| ), |
| Dumbbench::Instance::PerlSub->new( |
| name => 'delHTMLcom-long', |
| code => sub { |
| for (1..10_000) { |
| $result = remove_xml_comments( |
| $fh, |
| "This <!--" . $_ |
| ); |
| seek($fh, 0, 0); |
| }; |
| } |
| ), |
| Dumbbench::Instance::PerlSub->new( |
| name => 'Tokenizer-conservative', |
| code => sub { |
| $result = $cons_tok->reset->tokenize($t_data); |
| $result = 0; |
| } |
| ), |
| Dumbbench::Instance::PerlSub->new( |
| name => 'Tokenizer-conservative-utf-8', |
| code => sub { |
| $result = $cons_tok->reset->tokenize($t_data_utf_8); |
| $result = 0; |
| } |
| ), |
| Dumbbench::Instance::PerlSub->new( |
| name => 'Tokenizer-aggressive', |
| code => sub { |
| $result = $aggr_tok->reset->tokenize($t_data); |
| $result = 0; |
| } |
| ), |
| Dumbbench::Instance::PerlSub->new( |
| name => 'Tokenizer-aggressive-utf-8', |
| code => sub { |
| $result = $aggr_tok->reset->tokenize($t_data_utf_8); |
| $result = 0; |
| } |
| ), |
| Dumbbench::Instance::PerlSub->new( |
| name => 'Data-Collect with serialization', |
| code => sub { |
| $data->reset->append($_) foreach @t_data_split; |
| $result = $data->to_string; |
| } |
| ) |
| ); |
| |
| # Run benchmarks |
| $bench->run; |
| |
| # Clean up |
| close($fh); |
| |
| # Output in a single row |
| if ($columns) { |
| unless ($no_header) { |
| print join("\t", map { $_->name } $bench->instances), "\n"; |
| }; |
| print join("\t", map { $_->result->raw_number } $bench->instances), "\n"; |
| exit(0); |
| }; |
| |
| # Output simple timings for comparation |
| foreach my $inst ($bench->instances) { |
| unless ($no_header) { |
| print $inst->name, ': '; |
| }; |
| print $inst->result->raw_number, "\n"; |
| }; |
| |
| exit(0); |
| |
| __END__ |