use strict;
use warnings;
use Test::More;
use Data::Dumper;
use JSON::XS;

# The tests in this file read documents from the corpus directory next
# to the script; setting SKIP_REAL in the environment skips all of them.
if ($ENV{SKIP_REAL}) {
  plan skip_all => 'Skip real tests';
}

use Benchmark qw/:hireswallclock/;

# Timestamp taken at start-up; it is not read again within this file.
my $t = Benchmark->new;

use utf8;
use lib 'lib', '../lib';

use File::Basename 'dirname';
use File::Spec::Functions 'catdir';

use_ok('KorAP::XML::Krill');

# First document: corpus/WPD/00001, located relative to this test file
my $path = catdir(dirname(__FILE__), 'corpus','WPD','00001');

ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');

# Sigles
is($doc->text_sigle, 'WPD/AAA/00001', 'Correct text sigle');
is($doc->doc_sigle, 'WPD/AAA', 'Correct document sigle');
is($doc->corpus_sigle, 'WPD', 'Correct corpus sigle');

# Text level metadata
my $meta = $doc->meta;
is($meta->{T_title}, 'A', 'Title');
is($meta->{S_pub_place}, 'URL:http://de.wikipedia.org', 'PubPlace');
is($meta->{D_pub_date}, '20050328', 'Creation Date');
SKIP: {
  # The sub title check is unconditionally skipped (one test)
  skip 'Failure because corpus is no longer supported', 1;
  ok(!$meta->{T_sub_title}, 'SubTitle');
};
is($meta->{T_author}, 'Ruru; Jens.Ol; Aglarech; u.a.', 'Author');

# Document level metadata is expected to be absent
ok(!$meta->{T_doc_title}, 'Correct Doc title');
ok(!$meta->{T_doc_sub_title}, 'Correct Doc Sub title');
ok(!$meta->{T_doc_author}, 'Correct Doc author');
ok(!$meta->{A_doc_editor}, 'Correct Doc editor');

# Corpus level metadata is expected to be absent
ok(!$meta->{T_corpus_title}, 'Correct Corpus title');
ok(!$meta->{T_corpus_sub_title}, 'Correct Corpus Sub title');

# This link is broken, but that's due to the data
is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http%3A%2F%2Fde.wikipedia.org', 'No link');

# Tokenization
use_ok('KorAP::XML::Tokenizer');

# Foundry and layer the token stream is based on
my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);

# Get tokenization
my $tokens = KorAP::XML::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => $token_base_foundry,
  layer => $token_base_layer,
  name => 'tokens'
);
ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');

my $output = $tokens->to_data;

# Only the first 100 characters of the primary text are compared
is(substr($output->{data}->{text}, 0, 100), 'A bzw. a ist der erste Buchstabe des lateinischen Alphabets und ein Vokal. Der Buchstabe A hat in de', 'Primary Data');
is($output->{data}->{name}, 'tokens', 'tokenName');
is($output->{data}->{tokenSource}, 'opennlp#tokens', 'tokenSource');

# No annotation layers were added yet, so both lists are expected to be empty
is($output->{version}, '0.03', 'version');
is($output->{data}->{foundries}, '', 'Foundries');
is($output->{data}->{layerInfos}, '', 'layerInfos');
is($output->{data}->{stream}->[0]->[4], 's:A', 'data');

# Add the Mate dependency layer and re-export the stream
$tokens->add('Mate', 'Dependency');

my $stream = $tokens->to_data->{data}->{stream};

# This is not a good relation example
is($stream->[79]->[0],
   '>:mate/d:CJ$<b>32<i>68',
   'term to term');
is($stream->[79]->[1], '<:mate/d:PD$<b>32<i>81', 'term to term');


# These are no longer aligned
# is($stream->[77]->[0],
#    '<:mate/d:--$<b>34<i>498<i>499<i>78<i>78',
#    'element to term');
# is($stream->[78]->[0], '>:mate/d:--$<b>33<i>498<i>499<i>77<i>78', 'term to element');

# Add the Base sentence layer and re-export the stream
$tokens->add('Base', 'Sentences');

$stream = $tokens->to_data->{data}->{stream};

is($stream->[0]->[2], '<>:base/s:s$<b>64<i>0<i>74<i>13<b>2', 'Text starts with sentence');


# Problematic document
$path = catdir(dirname(__FILE__), 'corpus','WPD15','W28','65631');
ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');

is($doc->text_sigle, 'WPD15/W28/65631', 'Correct text sigle');
is($doc->doc_sigle, 'WPD15/W28', 'Correct document sigle');
is($doc->corpus_sigle, 'WPD15', 'Correct corpus sigle');

$meta = $doc->meta;
is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http%3A%2F%2Fde.wikipedia.org%2Fwiki%2FWolfgang_Krebs_%28Schauspieler%29', 'link');

# Get tokenization
$tokens = KorAP::XML::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => 'Base',
  layer => 'tokens_aggr',
  name => 'tokens'
);
ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');

# The accessors report the values passed to the constructor
is($tokens->foundry, 'Base', 'Foundry');
is($tokens->layer, 'tokens_aggr', 'Layer');

ok($tokens->add('CoreNLP', 'Constituency'), 'Add Structure');

$output = $tokens->to_data;

# The added layer shows up in the foundry and layer listings
is($output->{data}->{foundries}, 'corenlp corenlp/constituency', 'Foundries');
is($output->{data}->{layerInfos}, 'corenlp/c=spans', 'layerInfos');
is($doc->meta->{A_editor}, 'wikipedia.org', 'Editor');


# Check offset problem
$path = catdir(dirname(__FILE__), 'corpus','WPD15','U43','34816');
ok($doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
ok($doc->parse, 'Parse document');

is($doc->text_sigle, 'WPD15/U43/34816', 'Correct text sigle');

$meta = $doc->meta;
is($meta->{A_externalLink}, 'data:application/x.korap-link;title=Wikipedia,http%3A%2F%2Fde.wikipedia.org%2Fwiki%2FUniversit%E4tsbibliothek_Augsburg', 'link');

# Tokenization
use_ok('KorAP::XML::Tokenizer');

# Only the foundry changes here; $token_base_layer is not reassigned
$token_base_foundry = 'Base';

# Get tokenization
$tokens = KorAP::XML::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => $token_base_foundry,
  layer => $token_base_layer,
  name => 'tokens'
);
ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');

# Export once and take the stream from that export
$output = $tokens->to_data;
$stream = $output->{data}->{stream};

# The surface form is the last entry of each token's term list
is($stream->[420]->[-1], 's:online', 'online');
is($stream->[421]->[-1], 's:verfügbar', 'verfügbar');

done_testing;
__END__
173
174
175
176