| Akron | aa229a2 | 2020-02-18 13:44:25 +0100 | [diff] [blame] | 1 | #!/usr/bin/env perl | 
 | 2 | use strict; | 
 | 3 | use warnings; | 
 | 4 | use Dumbbench; | 
 | 5 | use File::Basename 'dirname'; | 
 | 6 | use File::Spec::Functions qw/catfile rel2abs/; | 
| Akron | 2d547bc | 2020-07-04 10:34:35 +0200 | [diff] [blame] | 7 | use File::Temp 'tempfile'; | 
| Peter Harders | 994aff7 | 2020-07-25 09:53:35 +0200 | [diff] [blame] | 8 | use Encode qw!decode!; | 
| Akron | aa229a2 | 2020-02-18 13:44:25 +0100 | [diff] [blame] | 9 | use FindBin; | 
 | 10 | use Getopt::Long; | 
 | 11 |  | 
 | 12 | BEGIN { | 
 | 13 |   unshift @INC, "$FindBin::Bin/../lib"; | 
 | 14 | }; | 
 | 15 |  | 
| Peter Harders | 42e18a6 | 2020-07-21 02:43:26 +0200 | [diff] [blame] | 16 | use Test::KorAP::XML::TEI qw!korap_tempfile!; | 
| Akron | 95bc98a | 2020-07-11 12:00:12 +0200 | [diff] [blame] | 17 | use KorAP::XML::TEI 'remove_xml_comments'; | 
| Akron | d962747 | 2020-07-09 16:53:09 +0200 | [diff] [blame] | 18 | use KorAP::XML::TEI::Tokenizer::Aggressive; | 
 | 19 | use KorAP::XML::TEI::Tokenizer::Conservative; | 
| Akron | 4f67cd4 | 2020-07-02 12:27:58 +0200 | [diff] [blame] | 20 |  | 
# Command line flags:
#   $columns   - print all results tab separated in a single row
#   $no_header - suppress the benchmark names in the output
my $columns = 0;
my $no_header = 0;

# Usage text, shared by --help and by the error path below
my $usage =
    "--columns|-c     Print instances in columns\n"
  . "--no-header|-n   Dismiss benchmark names\n"
  . "--help|-h        Print this page\n\n";

# GetOptions returns false on unknown or malformed options.
# Stop in that case instead of running the whole benchmark
# with a command line that was probably mistyped.
GetOptions(
  'columns|c' => \$columns,
  'no-header|n' => \$no_header,
  'help|h' => sub {
    print $usage;
    exit(0);
  }
) or do {
  print STDERR $usage;
  exit(1);
};
 | 33 |  | 
 | 34 | our $SCRIPT_NAME = 'tei2korapxml'; | 
 | 35 |  | 
 | 36 | my $f = dirname(__FILE__); | 
 | 37 | my $script = rel2abs(catfile($f, '..', 'script', $SCRIPT_NAME)); | 
 | 38 |  | 
 | 39 | # Load example file | 
 | 40 | my $file = rel2abs(catfile($f, '..', 't', 'data', 'goe_sample.i5.xml')); | 
 | 41 |  | 
 | 42 | # Create a new benchmark object | 
 | 43 | my $bench = Dumbbench->new( | 
 | 44 |   verbosity => 0 | 
 | 45 | ); | 
 | 46 |  | 
# Sink for benchmark return values
my $result;

# Data for delHTMLcom-long:
# The temporary file holds the continuation lines that are read
# from the handle by the delHTMLcom-long benchmark.
my ($fh, $filename) = korap_tempfile('benchmark');

print {$fh} <<'HTML';
mehrzeiliger
Kommentar
  --><!-- Versuch
-->ist <!-- a --><!-- b --> ein Test
HTML

# Rewind before the handle is read for the first time. The benchmark
# only seeks after each call, so without this the very first read would
# start at the write position (and switching from writing to reading
# without an intervening seek is not portable).
seek($fh, 0, 0) or die "Unable to rewind $filename: $!";
 | 58 |  | 
# Data for Tokenization
# Test data
my $t_dataf = catfile(dirname(__FILE__), '..', 't', 'data', 'wikipedia.txt');

# Slurp the file as raw bytes, using a lexical handle and
# three-argument open so the path can never be taken as a mode.
open(my $t_fh, '<:raw', $t_dataf)
  or die "Unable to load $t_dataf: $!";
my $t_data = do { local $/; <$t_fh> };
close($t_fh);

# Keep the previous behaviour of yielding an empty string
# if nothing could be read
$t_data = '' unless defined $t_data;

# Decoded (character string) variant of the same text, so the tokenizers
# can be measured on both raw bytes and decoded characters
my $t_data_utf_8 = decode('utf-8',$t_data);

# Tokenizer objects are created once and reused by the benchmarks
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
| Akron | 510a88c | 2020-07-07 10:16:50 +0200 | [diff] [blame] | 78 |  | 
| Akron | 4f67cd4 | 2020-07-02 12:27:58 +0200 | [diff] [blame] | 79 |  | 
# Benchmark table: each entry is a [name, code] pair.
# The order of the entries is the order of the reported results.
my @benchmarks = (

  # Run the conversion script on the example file through the shell,
  # discarding everything it prints
  ['SimpleConversion' => sub {
    `cat '$file' | perl '$script' -ti > /dev/null 2>&1`
  }],

  # Comment removal on a single line that contains a complete comment
  ['delHTMLcom' => sub {
    for (1..100_000) {
      $result = remove_xml_comments(\*STDIN, "This <!-- comment --> is a test $_");
    };
  }],

  # Comment removal on a line with an unclosed comment; the temporary
  # file handle is passed along and rewound after every call
  ['delHTMLcom-long' => sub {
    for (1..10_000) {
      $result = remove_xml_comments($fh, "This <!--$_");
      seek($fh, 0, 0);
    };
  }],

  # Conservative tokenizer on the raw bytes
  ['Tokenizer-conservative' => sub {
    $result = $cons_tok->reset->tokenize($t_data);
    $result = 0;
  }],

  # Conservative tokenizer on the decoded string
  ['Tokenizer-conservative-utf-8' => sub {
    $result = $cons_tok->reset->tokenize($t_data_utf_8);
    $result = 0;
  }],

  # Aggressive tokenizer on the raw bytes
  ['Tokenizer-aggressive' => sub {
    $result = $aggr_tok->reset->tokenize($t_data);
    $result = 0;
  }],

  # Aggressive tokenizer on the decoded string
  ['Tokenizer-aggressive-utf-8' => sub {
    $result = $aggr_tok->reset->tokenize($t_data_utf_8);
    $result = 0;
  }],
);

# Add benchmark instances
$bench->add_instances(
  map {
    Dumbbench::Instance::PerlSub->new(
      name => $_->[0],
      code => $_->[1]
    )
  } @benchmarks
);
 | 140 |  | 
 | 141 | # Run benchmarks | 
 | 142 | $bench->run; | 
 | 143 |  | 
| Akron | 2d547bc | 2020-07-04 10:34:35 +0200 | [diff] [blame] | 144 | # Clean up | 
 | 145 | close($fh); | 
 | 146 |  | 
| Akron | aa229a2 | 2020-02-18 13:44:25 +0100 | [diff] [blame] | 147 | # Output in a single row | 
 | 148 | if ($columns) { | 
 | 149 |   unless ($no_header) { | 
 | 150 |     print join("\t", map { $_->name } $bench->instances), "\n"; | 
 | 151 |   }; | 
 | 152 |   print join("\t", map { $_->result->raw_number } $bench->instances), "\n"; | 
 | 153 |   exit(0); | 
 | 154 | }; | 
 | 155 |  | 
 | 156 | # Output simple timings for comparation | 
 | 157 | foreach my $inst ($bench->instances) { | 
 | 158 |   unless ($no_header) { | 
 | 159 |     print $inst->name, ': '; | 
 | 160 |   }; | 
 | 161 |   print $inst->result->raw_number, "\n"; | 
 | 162 | }; | 
 | 163 |  | 
 | 164 | exit(0); | 
 | 165 |  | 
 | 166 | __END__ |