#!/usr/bin/env perl
# source ~/perl5/perlbrew/etc/bashrc
# perlbrew switch perl-blead@korap
#
# Profiling driver for the KorAP conversion pipeline.
#
# Parses a single corpus text located next to this script, builds the
# base tokenization from the OpenNLP/Tokens layer, adds a fixed set of
# annotation layers and writes the resulting JSON to a file in the
# current working directory.
#
# The Memory::Stats checkpoints and the Benchmark report are kept as
# comments so they can be switched on when profiling.
use strict;
use warnings;
use Mojo::ByteStream 'b';
use Devel::Cycle;
use Memory::Stats;

use Benchmark qw/:hireswallclock/;

# Start of the wall-clock measurement. All "use" statements run at
# compile time, so module loading is not part of the measured span.
my $t = Benchmark->new;

use utf8;
use lib 'lib', '../lib';

use File::Basename 'dirname';
use File::Spec::Functions 'catdir';

# Tokenization
use KorAP::Tokenizer;
use KorAP::Document;

# my $stats = Memory::Stats->new;

#$stats->start;

# Corpus text to convert, given as (corpus, document, text).
# This list is the single source for both the input directory and the
# output file name, so the two can no longer get out of sync.
#my @text_sigle = qw/GOE AGA 03828/;
my @text_sigle = qw/BZK D59 00089/;

my $path = catdir(dirname(__FILE__), @text_sigle);
# TODO: Test with absolute path!

# Output file name is "<document>-<text>.json", e.g. 'D59-00089.json'.
my $json_file = join('-', @text_sigle[1, 2]) . '.json';

# do something
#$stats->checkpoint(sprintf("%20s", "Init"));

# The document path is passed with a trailing slash.
my $doc = KorAP::Document->new( path => $path . '/' );
$doc->parse;
# $stats->checkpoint(sprintf("%20s", "After Parsing"));

# Foundry and layer providing the base tokenization
my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);

# Get tokenization
my $tokens = KorAP::Tokenizer->new(
  path    => $doc->path,
  doc     => $doc,
  foundry => $token_base_foundry,
  layer   => $token_base_layer,
  name    => 'tokens'
);
$tokens->parse;
#$stats->checkpoint(sprintf("%20s", "After Tokenization"));

# Add annotation layers one by one (foundry, layer); the order is kept
# as is so that the commented checkpoints still match their labels.
$tokens->add('Base', 'Sentences');
#$stats->checkpoint(sprintf("%20s", "After Base/Sentences"));

$tokens->add('Base', 'Paragraphs');
#$stats->checkpoint(sprintf("%20s", "After Base/Paragraphs"));

$tokens->add('OpenNLP', 'Sentences');
#$stats->checkpoint(sprintf("%20s", "After OpenNLP/Sentences"));

$tokens->add('OpenNLP', 'Morpho');
#$stats->checkpoint(sprintf("%20s", "After OpenNLP/Morpho"));

$tokens->add('TreeTagger', 'Sentences');
#$stats->checkpoint(sprintf("%20s", "After TT/Sentences"));

$tokens->add('TreeTagger', 'Morpho');
#$stats->checkpoint(sprintf("%20s", "After TT/Morpho"));

$tokens->add('CoreNLP', 'Sentences');
#$stats->checkpoint(sprintf("%20s", "After CoreNLP/Sentences"));

$tokens->add('CoreNLP', 'Constituency');
#$stats->checkpoint(sprintf("%20s", "After CoreNLP/Constituency"));

#$stats->stop;
#$stats->report;

$tokens->add('CoreNLP', 'NamedEntities');
$tokens->add('CoreNLP', 'Morpho');
$tokens->add('Glemm', 'Morpho');
# t ok($tokens->add('Connexor', 'Sentences'), 'Add cnx sentences');
# t ok($tokens->add('Connexor', 'Morpho'), 'Add cnx morpho');
# t ok($tokens->add('Connexor', 'Phrase'), 'Add cnx phrase');
# t ok($tokens->add('Connexor', 'Syntax'), 'Add cnx syntax');
$tokens->add('Mate', 'Morpho');
# $tokens->add('Mate', 'Dependency');
# t ok($tokens->add('XIP', 'Sentences'), 'Add xip sentences');
# t ok($tokens->add('XIP', 'Morpho'), 'Add xip morpho');
# t ok($tokens->add('XIP', 'Constituency'), 'Add xip constituency');
# $tokens->add('XIP', 'Dependency');
# ok($tokens->to_json, 'To json');

# Serialize the annotated token stream and write it to the JSON file
b($tokens->to_json)->spurt($json_file);

# Timing report (disabled). timestr() only returns a string, so the
# result has to be printed to become visible:
# print STDERR timestr(timediff(Benchmark->new, $t)), "\n";