blob: f7f29a262effd3f126ffb79979d1efafa2730358 [file] [log] [blame]
Akron05ba5472016-07-05 21:12:26 +02001#!/usr/bin/env perl
2use strict;
3use warnings;
4use utf8;
Akron05ba5472016-07-05 21:12:26 +02005use Data::Dumper;
Akron05ba5472016-07-05 21:12:26 +02006use File::Basename 'dirname';
7use File::Spec::Functions qw/catdir catfile/;
8use Test::More;
9use Scalar::Util qw/weaken/;
10use Data::Dumper;
11use lib 't/annotation';
12use File::Temp qw/tempdir/;
13
Nils Diewaldb3e9ccd2016-10-24 15:16:52 +020014use KorAP::XML::Archive;
Akron05ba5472016-07-05 21:12:26 +020015
# Locate the test archive relative to this test file:
# <dir of this file>/../corpus/archives/<name>.zip
my $name = 'wpd15-single';
my @path = (dirname(__FILE__), '..', 'corpus', 'archives');
my $file = catfile(@path, "$name.zip");

my $archive = KorAP::XML::Archive->new($file);

# Skip the whole test file when the archive reports that unzip
# is not available.
plan(skip_all => 'unzip not found') unless $archive->test_unzip;
25
# Load the modules exercised below; a load failure is reported as a
# failing test by use_ok.
use_ok('KorAP::XML::Annotation::MDParser::Dependency');
use_ok('KorAP::XML::Krill');
use_ok('KorAP::XML::Tokenizer');

# Attach the companion archive (<name>.mdparser.zip) to the base archive.
ok($archive->attach('#' . catfile(@path, $name . '.mdparser.zip')), 'Attach mdparser archive');

# Extract into a scratch directory. CLEANUP => 1 makes File::Temp remove
# the directory when the test exits, so repeated runs do not leave
# extracted corpus data behind in the system temp location.
my $dir = tempdir(CLEANUP => 1);

# Text sigle of the single document to extract and inspect
my $f_path = 'WPD15/A00/00081';
$archive->extract_sigle([$f_path], $dir);

ok(
  my $doc = KorAP::XML::Krill->new( path => $dir . '/' . $f_path),
  'Create Krill document'
);

ok($doc->parse, 'Krill parser works');
41
# Build the tokenizer for the parsed document on the Base/Tokens layer.
#
# This code runs at file top level, where `return` is not allowed
# ("Can't return outside a subroutine"), so a failure here is reported
# with an explicit die message instead.
my $tokens = KorAP::XML::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => 'Base',
  layer => 'Tokens',
  name => 'tokens'
) or die 'Unable to create tokenizer';

$tokens->parse or die 'Unable to parse tokens';

# Add the annotation layer under test
ok($tokens->add('MDParser', 'Dependency'), 'Add Dependency');
53
# Serialized result of the tokenizer, including the added annotation
my $data = $tokens->to_data->{data};

# Metadata about token source, foundries and layers
is($data->{tokenSource}, 'base#tokens', 'TokenSource');
like($data->{foundries}, qr!mdparser/dependency!, 'foundries');
like($data->{layerInfos}, qr!mdp/d=rels!, 'layerInfos');

my $stream = $data->{stream};

# First entry of the first stream position
is($stream->[0]->[0], '-:tokens$<i>3555', 'Token count');

# Last entry of the last stream position
is($stream->[-1]->[-1], 's:978-3-89487-607-4', 'Last token');

# Term-to-term
is($stream->[0]->[1], '<:mdp/d:NMOD$<b>32<i>5', 'Term-to-Term');
is($stream->[5]->[0], '>:mdp/d:NMOD$<b>32<i>0', 'Term-to-Term');

# Element-to-term
is($stream->[0]->[8], '<:mdp/d:ROOT$<b>34<i>0<i>317<i>40<i>0', 'Element-to-Term');
is($stream->[0]->[10], '>:mdp/d:ROOT$<b>33<i>0<i>317<i>0<i>40', 'Term-to-Element');

# The same pair of relations at the end of the stream
is($stream->[-1]->[0], '>:mdp/d:ROOT$<b>33<i>26130<i>26153<i>3553<i>3555', 'Term-to-Element');
is($stream->[3553]->[1], '<:mdp/d:ROOT$<b>34<i>26130<i>26153<i>3555<i>3553', 'Element-to-Term');

done_testing;
80__END__