#!/usr/bin/env perl
#
# Test for the MDParser dependency annotation layer.
#
# Extracts the document WPD15/A00/00081 from the 'wpd15-single' test archive
# (with the matching '.mdparser.zip' archive attached), tokenizes it on the
# Base#Tokens layer, adds the MDParser/Dependency annotation and checks
# selected entries of the resulting token stream.
use strict;
use warnings;
use utf8;
use Log::Log4perl;
use Data::Dumper;

# Only report errors, coloured by level, on the screen appender.
Log::Log4perl->init({
  'log4perl.rootLogger' => 'ERROR, STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
});

use File::Basename 'dirname';
use File::Spec::Functions qw/catdir catfile/;
use Test::More;
use Scalar::Util qw/weaken/;
use lib 't/annotation';
use File::Temp qw/tempdir/;

use KorAP::XML::Archive;

# Location of the zipped test corpus, relative to this test file.
my $name = 'wpd15-single';
my @path = (dirname(__FILE__), '..', 'corpus', 'archives');

my $file = catfile(@path, $name . '.zip');
my $archive = KorAP::XML::Archive->new($file);

# Nothing can be extracted without unzip, so skip the whole file.
unless ($archive->test_unzip) {
  plan skip_all => 'unzip not found';
};

use_ok('KorAP::XML::Annotation::MDParser::Dependency');
use_ok('KorAP::XML::Krill');
use_ok('KorAP::XML::Tokenizer');

# Attach the archive carrying the MDParser annotations.
# NOTE(review): the leading '#' is passed through to attach() as in the
# original test - its meaning is defined by KorAP::XML::Archive; confirm there.
ok($archive->attach('#' . catfile(@path, $name . '.mdparser.zip')), 'Attach mdparser archive');

# Extract into a temporary directory that is removed when the test exits,
# so repeated runs do not pile up extracted corpora on disk.
my $dir = tempdir(CLEANUP => 1);

my $f_path = 'WPD15/A00/00081';
$archive->extract_text($f_path, $dir);

ok(my $doc = KorAP::XML::Krill->new( path => $dir . '/' . $f_path));

ok($doc->parse, 'Krill parser works');

# A bare 'return' is illegal at file level ("Can't return outside a
# subroutine"), so abort with an explicit message instead.
my $tokens = KorAP::XML::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => 'Base',
  layer => 'Tokens',
  name => 'tokens'
) or die 'Unable to create tokenizer';

$tokens->parse or die 'Unable to parse tokens';

ok($tokens->add('MDParser', 'Dependency'), 'Add Dependency');

my $data = $tokens->to_data->{data};

# Document level metadata
is($data->{tokenSource}, 'base#tokens', 'TokenSource');
like($data->{foundries}, qr!mdparser/dependency!, 'foundries');
like($data->{layerInfos}, qr!mdp/d=rels!, 'layerInfos');

my $stream = $data->{stream};

is($stream->[0]->[0], '-:tokens$<i>3555', 'Token count');

is($stream->[-1]->[-1], 's:978-3-89487-607-4', 'Last token');

# Term-to-term
is($stream->[0]->[1], '<:mdp/d:NMOD$<b>32<i>5', 'Term-to-Term');
is($stream->[5]->[0], '>:mdp/d:NMOD$<b>32<i>0', 'Term-to-Term');

# Element-to-term
is($stream->[0]->[8], '<:mdp/d:ROOT$<b>34<i>0<i>317<i>40<i>0', 'Element-to-Term');
is($stream->[0]->[10], '>:mdp/d:ROOT$<b>33<i>0<i>317<i>0<i>40', 'Term-to-Element');

# The same relation pair at the end of the stream
is($stream->[-1]->[0], '>:mdp/d:ROOT$<b>33<i>26130<i>26153<i>3553<i>3554', 'Term-to-Element');
is($stream->[3553]->[1], '<:mdp/d:ROOT$<b>34<i>26130<i>26153<i>3554<i>3553', 'Element-to-Term');

done_testing;
__END__