#!/usr/bin/env perl
# Test script for the MDParser dependency annotation layer.
#
# It extracts a single text from the 'wpd15-single' archive (with the
# '.mdparser.zip' archive attached), parses it with KorAP::XML::Krill,
# tokenizes it on the Base/Tokens layer, adds the MDParser 'Dependency'
# annotation and checks selected entries of the resulting token stream.
use strict;
use warnings;
use utf8;
use Data::Dumper;
use File::Basename 'dirname';
use File::Spec::Functions qw/catdir catfile/;
use Test::More;
use Scalar::Util qw/weaken/;
use lib 't/annotation';
use File::Temp qw/tempdir/;

use KorAP::XML::Archive;

my $name = 'wpd15-single';
my @path = (dirname(__FILE__), '..', 'corpus', 'archives');

my $file    = catfile(@path, $name . '.zip');
my $archive = KorAP::XML::Archive->new($file);

# Skip the whole file when the archive reports that unzip is unavailable.
unless ($archive->test_unzip) {
  plan skip_all => 'unzip not found';
};

use_ok('KorAP::XML::Annotation::MDParser::Dependency');
use_ok('KorAP::XML::Krill');
use_ok('KorAP::XML::Tokenizer');

# NOTE(review): the leading '#' is passed through unchanged from the
# original call; its meaning is defined by KorAP::XML::Archive->attach.
ok(
  $archive->attach('#' . catfile(@path, $name . '.mdparser.zip')),
  'Attach mdparser archive'
);

# CLEANUP => 1 removes the extraction directory when the script exits,
# so repeated test runs do not pile up temporary directories.
my $dir = tempdir(CLEANUP => 1);

my $f_path = 'WPD15/A00/00081';
$archive->extract_sigle(0, [$f_path], $dir);

ok(my $doc = KorAP::XML::Krill->new(path => $dir . '/' . $f_path));

ok($doc->parse, 'Krill parser works');

# 'return' is not allowed outside of a subroutine, so abort with an
# explicit message when the tokenizer cannot be built or parsed.
my $tokens = KorAP::XML::Tokenizer->new(
  path    => $doc->path,
  doc     => $doc,
  foundry => 'Base',
  layer   => 'Tokens',
  name    => 'tokens'
) or die 'Unable to create tokenizer';

$tokens->parse or die 'Unable to parse tokens';

ok($tokens->add('MDParser', 'Dependency'), 'Add Dependency');

my $data = $tokens->to_data->{data};

is($data->{tokenSource}, 'base#tokens', 'TokenSource');
like($data->{foundries}, qr!mdparser/dependency!, 'foundries');
like($data->{layerInfos}, qr!mdp/d=rels!, 'layerInfos');

my $stream = $data->{stream};

is($stream->[0]->[0], '-:tokens$<i>3555', 'Token count');

is($stream->[-1]->[-1], 's:978-3-89487-607-4', 'Last token');

# Term-to-term
is($stream->[0]->[1], '<:mdp/d:NMOD$<b>32<i>5', 'Term-to-Term');
is($stream->[5]->[0], '>:mdp/d:NMOD$<b>32<i>0', 'Term-to-Term');

# Element-to-term
is($stream->[0]->[8], '<:mdp/d:ROOT$<b>34<i>0<i>317<i>40<i>0', 'Element-to-Term');
is($stream->[0]->[10], '>:mdp/d:ROOT$<b>33<i>0<i>317<i>0<i>40', 'Term-to-Element');

# Same relation pair at the end of the stream
is($stream->[-1]->[0], '>:mdp/d:ROOT$<b>33<i>26130<i>26153<i>3553<i>3555', 'Term-to-Element');
is($stream->[3553]->[1], '<:mdp/d:ROOT$<b>34<i>26130<i>26153<i>3555<i>3553', 'Element-to-Term');

done_testing;
__END__