Akron | aa229a2 | 2020-02-18 13:44:25 +0100 | [diff] [blame] | 1 | #!/usr/bin/env perl |
| 2 | use strict; |
| 3 | use warnings; |
| 4 | use Dumbbench; |
| 5 | use File::Basename 'dirname'; |
| 6 | use File::Spec::Functions qw/catfile rel2abs/; |
Akron | 2d547bc | 2020-07-04 10:34:35 +0200 | [diff] [blame] | 7 | use File::Temp 'tempfile'; |
Peter Harders | 994aff7 | 2020-07-25 09:53:35 +0200 | [diff] [blame] | 8 | use Encode qw!decode!; |
Akron | aa229a2 | 2020-02-18 13:44:25 +0100 | [diff] [blame] | 9 | use FindBin; |
| 10 | use Getopt::Long; |
| 11 | |
| 12 | BEGIN { |
| 13 | unshift @INC, "$FindBin::Bin/../lib"; |
| 14 | }; |
| 15 | |
Peter Harders | 42e18a6 | 2020-07-21 02:43:26 +0200 | [diff] [blame] | 16 | use Test::KorAP::XML::TEI qw!korap_tempfile!; |
Akron | 95bc98a | 2020-07-11 12:00:12 +0200 | [diff] [blame] | 17 | use KorAP::XML::TEI 'remove_xml_comments'; |
Akron | d962747 | 2020-07-09 16:53:09 +0200 | [diff] [blame] | 18 | use KorAP::XML::TEI::Tokenizer::Aggressive; |
| 19 | use KorAP::XML::TEI::Tokenizer::Conservative; |
Akron | a10ad59 | 2020-08-03 11:20:23 +0200 | [diff] [blame^] | 20 | use KorAP::XML::TEI::Data; |
Akron | 4f67cd4 | 2020-07-02 12:27:58 +0200 | [diff] [blame] | 21 | |
# Command line switches (both default to off):
#   $columns   - print all results as tab-separated columns
#   $no_header - suppress the benchmark names in the output
my ($columns, $no_header) = (0, 0);

# Print the usage summary and terminate successfully
my $print_help = sub {
  print
    "--columns|-c Print instances in columns\n",
    "--no-header|-n Dismiss benchmark names\n",
    "--help|-h Print this page\n\n";
  exit(0);
};

GetOptions(
  'columns|c'   => \$columns,
  'no-header|n' => \$no_header,
  'help|h'      => $print_help
);
| 34 | |
# Name of the converter script that is benchmarked
our $SCRIPT_NAME = 'tei2korapxml';

# Directory containing this benchmark file; all paths below
# are resolved relative to its parent directory
my $f = dirname(__FILE__);

# Absolute path to the converter script
my $script = rel2abs(catfile($f, '..', 'script', $SCRIPT_NAME));

# Load example files (absolute paths below t/data)
my ($file, $goe_tagged) = map {
  rel2abs(catfile($f, '..', 't', 'data', $_))
} qw/goe_sample.i5.xml goe_sample_tagged.i5.xml/;
Akron | aa229a2 | 2020-02-18 13:44:25 +0100 | [diff] [blame] | 43 | |
# Create a new benchmark object
my $bench = Dumbbench->new(
  verbosity => 0
);

# Receives the return values of the benchmarked calls
my $result;

# Data for delHTMLcom-long:
# The temporary file holds the continuation of an XML comment
# that the benchmark opens in the string it passes along
# with this file handle.
my ($fh, $filename) = korap_tempfile('benchmark');

print {$fh} <<'HTML';
mehrzeiliger
Kommentar
--><!-- Versuch
-->ist <!-- a --><!-- b --> ein Test
HTML

# Rewind after writing: otherwise the handle is positioned at the
# end of the file and the very first read of the benchmark finds
# no data. Switching from writing to reading on the same handle
# also requires an intervening seek (which flushes the buffer).
# NOTE(review): assumes korap_tempfile returns a read/write
# handle - confirm.
seek($fh, 0, 0);
| 60 | |
# Data for Tokenization
# Test data
my $t_dataf = catfile(dirname(__FILE__), '..', 't', 'data', 'wikipedia.txt');

# Slurp the test file as raw bytes.
# Uses a lexical file handle and three-argument open, so no global
# bareword handle is created and the file name can never be
# interpreted as part of the open mode.
my $t_data = do {
  open(my $t_fh, '<', $t_dataf)
    or die "Unable to load $t_dataf: $!";
  binmode($t_fh);

  # Undefined record separator: read the whole file at once
  local $/;
  my $content = <$t_fh>;
  close($t_fh);

  # Keep the empty string as the value for an empty file
  defined $content ? $content : '';
};

# The same text as a decoded character string
my $t_data_utf_8 = decode('utf-8',$t_data);

# Whitespace separated chunks of the raw text
my @t_data_split = split(' ', $t_data);

# Objects under test, shared by all benchmark runs
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;

my $data = KorAP::XML::TEI::Data->new;
Akron | 4f67cd4 | 2020-07-02 12:27:58 +0200 | [diff] [blame] | 83 | |
# Add benchmark instances
$bench->add_instances(

  # Run the converter script on the plain sample file
  # through the shell; all output is discarded
  Dumbbench::Instance::PerlSub->new(
    name => 'SimpleConversion',
    code => sub {
      `cat '$file' | perl '$script' -ti > /dev/null 2>&1`
    }
  ),

  # Run the converter script on the tagged sample file
  # with KORAPXMLTEI_INLINE=1 set in the environment
  Dumbbench::Instance::PerlSub->new(
    name => 'Conversion-with-inline-annotations',
    code => sub {
      `cat '$goe_tagged' | KORAPXMLTEI_INLINE=1 perl '$script' > /dev/null 2>&1`
    }
  ),

  # Remove a comment that is opened and closed within the string.
  # NOTE(review): presumably the passed handle is never read
  # in this case - confirm.
  Dumbbench::Instance::PerlSub->new(
    name => 'delHTMLcom',
    code => sub {
      for (1..100_000) {
        $result = remove_xml_comments(
          \*STDIN,
          "This <!-- comment --> is a test " . $_
        );
      };
    }
  ),

  # Remove a comment that is opened in the string and
  # continued in the temporary file
  Dumbbench::Instance::PerlSub->new(
    name => 'delHTMLcom-long',
    code => sub {
      for (1..10_000) {
        $result = remove_xml_comments(
          $fh,
          "This <!--" . $_
        );

        # Rewind the file for the next iteration
        seek($fh, 0, 0);
      };
    }
  ),

  # Tokenize the raw byte string
  Dumbbench::Instance::PerlSub->new(
    name => 'Tokenizer-conservative',
    code => sub {
      $result = $cons_tok->reset->tokenize($t_data);
      $result = 0;
    }
  ),

  # Tokenize the decoded character string
  Dumbbench::Instance::PerlSub->new(
    name => 'Tokenizer-conservative-utf-8',
    code => sub {
      $result = $cons_tok->reset->tokenize($t_data_utf_8);
      $result = 0;
    }
  ),

  # Tokenize the raw byte string
  Dumbbench::Instance::PerlSub->new(
    name => 'Tokenizer-aggressive',
    code => sub {
      $result = $aggr_tok->reset->tokenize($t_data);
      $result = 0;
    }
  ),

  # Tokenize the decoded character string
  Dumbbench::Instance::PerlSub->new(
    name => 'Tokenizer-aggressive-utf-8',
    code => sub {
      $result = $aggr_tok->reset->tokenize($t_data_utf_8);
      $result = 0;
    }
  ),

  # Collect all chunks in the data object and serialize it
  Dumbbench::Instance::PerlSub->new(
    name => 'Data-Collect with serialization',
    code => sub {

      # Reset once per run. Resetting in front of every append
      # would throw away everything collected so far, so that
      # only the last chunk would ever be serialized.
      $data->reset;
      $data->append($_) foreach @t_data_split;
      $result = $data->to_string;
    }
  )
);
| 157 | |
# Run benchmarks
$bench->run;

# Clean up
close($fh);

# All benchmark instances in the order they were added
my @instances = $bench->instances;

if ($columns) {

  # Output in a single row, tab separated,
  # preceded by a row of names unless dismissed
  print join("\t", map { $_->name } @instances), "\n" unless $no_header;
  print join("\t", map { $_->result->raw_number } @instances), "\n";
}
else {

  # Output simple timings for comparison, one instance per line
  for my $inst (@instances) {
    my $label = $no_header ? '' : $inst->name . ': ';
    print $label, $inst->result->raw_number, "\n";
  }
}

exit(0);
| 182 | |
| 183 | __END__ |