#!/usr/bin/env perl
# Benchmark script for the tei2korapxml converter.
#
# Registers a set of Dumbbench instances (a full conversion run through the
# command line script, two calls to KorAP::XML::TEI::delHTMLcom and the two
# tokenizers), runs them and prints the raw timing of every instance.
#
# Options:
#   --columns|-c     Print all timings tab-separated in a single row
#   --no-header|-n   Do not print the benchmark names
#   --help|-h        Print the option summary and exit
use strict;
use warnings;
use Dumbbench;
use File::Basename 'dirname';
use File::Spec::Functions qw/catfile rel2abs/;
use File::Temp 'tempfile';
use FindBin;
use Getopt::Long;

BEGIN {
  unshift @INC, "$FindBin::Bin/../lib";
};

use KorAP::XML::TEI;
use KorAP::XML::TEI::Tokenizer::Aggressive;
use KorAP::XML::TEI::Tokenizer::Conservative;

# Option summary, shared by --help and by the error path below
my $usage = "--columns|-c     Print instances in columns\n"
  . "--no-header|-n   Dismiss benchmark names\n"
  . "--help|-h        Print this page\n\n";

my $columns = 0;
my $no_header = 0;

# GetOptions returns false on invalid options; do not start a (slow)
# benchmark run with a command line that was not understood.
GetOptions(
  'columns|c'   => \$columns,
  'no-header|n' => \$no_header,
  'help|h'      => sub {
    print $usage;
    exit(0);
  }
) or do {
  print STDERR $usage;
  exit(1);
};

our $SCRIPT_NAME = 'tei2korapxml';

my $f = dirname(__FILE__);
my $script = rel2abs(catfile($f, '..', 'script', $SCRIPT_NAME));

# Load example file
my $file = rel2abs(catfile($f, '..', 't', 'data', 'goe_sample.i5.xml'));

# The conversion benchmark discards all output of the child process, so a
# missing file would otherwise silently benchmark a failing command.
for my $required ($script, $file) {
  die "Unable to find $required" unless -f $required;
};

# Wrap a string in single quotes for the shell, escaping embedded quotes.
sub _shell_quote {
  my ($str) = @_;
  $str =~ s/'/'\\''/g;
  return "'$str'";
};

# Command line for the SimpleConversion instance (built once, not per run)
my $conversion_cmd = 'cat ' . _shell_quote($file)
  . ' | perl ' . _shell_quote($script)
  . ' > /dev/null 2>&1';

# Create a new benchmark object
my $bench = Dumbbench->new(
  verbosity => 0
);

my $result;

# Data for delHTMLcom-long
# UNLINK makes File::Temp remove the file when the program exits
my ($fh, $filename) = tempfile(UNLINK => 1);

print {$fh} <<'HTML' or die "Unable to write to $filename: $!";
mehrzeiliger
Kommentar
--><!-- Versuch
-->ist <!-- a --><!-- b --> ein Test
HTML

# Data for Tokenization
# Test data
my $t_dataf = catfile($f, '..', 't', 'data', 'wikipedia.txt');
open(my $t_fh, '<', $t_dataf) or die "Unable to load $t_dataf: $!";
my $t_data = do { local $/; <$t_fh> };
close($t_fh);
$t_data = '' unless defined $t_data;

my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;


# Add benchmark instances
$bench->add_instances(

  # Full conversion of the sample file through the command line script.
  # Backticks are kept (instead of system) so timings stay comparable
  # with earlier runs of this benchmark.
  Dumbbench::Instance::PerlSub->new(
    name => 'SimpleConversion',
    code => sub {
      qx{$conversion_cmd}
    }
  ),

  # Comment that opens and closes within the passed string
  Dumbbench::Instance::PerlSub->new(
    name => 'delHTMLcom',
    code => sub {
      for (1..100_000) {
        $result = KorAP::XML::TEI::delHTMLcom(
          \*STDIN,
          "This <!-- comment --> is a test " . $_
        );
      };
    }
  ),

  # Comment that is opened in the passed string and not closed there;
  # the temp file handle is passed along with it.
  Dumbbench::Instance::PerlSub->new(
    name => 'delHTMLcom-long',
    code => sub {
      for (1..10_000) {

        # Rewind before every call, so each iteration (including the very
        # first one after the file was written) starts at the beginning
        # of the temp file.
        seek($fh, 0, 0) or die "Unable to rewind $filename: $!";
        $result = KorAP::XML::TEI::delHTMLcom(
          $fh,
          "This <!--" . $_
        );
      };
    }
  ),
  Dumbbench::Instance::PerlSub->new(
    name => 'Tokenizer-conservative',
    code => sub {
      $result = $cons_tok->reset->tokenize($t_data, 0);
      $result = 0;
    }
  ),
  Dumbbench::Instance::PerlSub->new(
    name => 'Tokenizer-aggressive',
    code => sub {
      $result = $aggr_tok->reset->tokenize($t_data, 0);
      $result = 0;
    }
  ),
);

# Run benchmarks
$bench->run;

# Clean up
close($fh) or warn "Unable to close $filename: $!";

# Output in a single row
if ($columns) {
  unless ($no_header) {
    print join("\t", map { $_->name } $bench->instances), "\n";
  };
  print join("\t", map { $_->result->raw_number } $bench->instances), "\n";
  exit(0);
};

# Output simple timings for comparison
foreach my $inst ($bench->instances) {
  unless ($no_header) {
    print $inst->name, ': ';
  };
  print $inst->result->raw_number, "\n";
};

exit(0);

__END__