blob: cdd46e82784922ee79d57a2246f83aa010406142 [file] [log] [blame]
Akron88d063a2022-03-21 15:10:01 +01001use strict;
2use warnings;
3use Test::More;
4use Data::Dumper;
5use JSON::XS;
6
# Tests against real corpus data can be disabled through the environment:
# when SKIP_REAL is set to a true value the whole file is skipped.
plan(skip_all => 'Skip real tests') if $ENV{SKIP_REAL};
10
11use utf8;
12use lib 'lib', '../lib';
13
14use File::Basename 'dirname';
15use File::Spec::Functions 'catdir';
16
# The document class has to be loadable before anything else is tested.
use_ok('KorAP::XML::Krill');

# --- NKJP sample "KOT": loading and metadata ---------------------------

# Fixture directory, resolved relative to this test file.
my $path = catdir(dirname(__FILE__), qw(corpus NKJP NKJP KOT));

# The constructor is handed the directory with a trailing slash.
my $doc = KorAP::XML::Krill->new(path => "$path/");
ok($doc, 'Load Korap::Document');
ok($doc->parse, 'Parse document');

# Sigles at text, document and corpus level: [accessor, expected, label].
for my $sigle_check (
    ['text_sigle',   'NKJP/NKJP/KOT', 'Correct text sigle'],
    ['doc_sigle',    'NKJP/NKJP',     'Correct document sigle'],
    ['corpus_sigle', 'NKJP',          'Correct corpus sigle'],
) {
    my ($accessor, $expected, $label) = @{$sigle_check};
    is($doc->$accessor, $expected, $label);
}

my $meta = $doc->meta;

is($meta->{T_title}, 'TEI P5 encoded version of sample(s) of "Kot"', 'Title');

# Every other metadata field checked here is expected to be false
# (unset or empty): [field key, label].
for my $absent_field (
    ['T_sub_title',     'SubTitle'],
    ['T_author',        'Author'],
    ['A_editor',        'Editor'],
    ['S_pub_place',     'PubPlace'],
    ['A_publisher',     'Publisher'],
    ['S_text_type',     'No Text Type'],
    ['S_text_type_art', 'No Text Type Art'],
    ['S_text_type_ref', 'No Text Type Ref'],
    ['S_text_domain',   'No Text Domain'],
    ['S_text_column',   'No Text Column'],
) {
    my ($field, $label) = @{$absent_field};
    ok(!$meta->{$field}, $label);
}
42
43
# Tokenization
use_ok('KorAP::XML::Tokenizer');

# Foundry and layer the base tokenization is read from.
my $token_base_foundry = 'nkjp';
my $token_base_layer   = 'Morpho';

# Build the tokenizer for the parsed document.
my $tokens = KorAP::XML::Tokenizer->new(
    path    => $doc->path,
    doc     => $doc,
    foundry => $token_base_foundry,
    layer   => $token_base_layer,
    name    => 'tokens',
);
ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');

# Round-trip through the JSON serialisation and inspect the token stream.
my $output = decode_json($tokens->to_json);

# Spot checks: [stream position, index within that position, expected term].
for my $probe (
    [0, 0, '-:tokens$<i>43'],
    [0, 3, 'i:nie'],
    [1, 2, 's:zdążyła'],
) {
    my ($position, $index, $term) = @{$probe};
    is($output->{data}->{stream}->[$position]->[$index], $term, 't');
}
65
## Base
# Add the DeReKo structure annotation and the NKJP morpho annotation on top
# of the base tokenization. Each step gets its own label so that a failure
# names the annotation that could not be added.
ok(
    $tokens->add('DeReKo', 'Structure', 'base_sentences_paragraphs'),
    'Add DeReKo structure (base_sentences_paragraphs)'
);
ok($tokens->add('NKJP', 'Morpho'), 'Add NKJP morpho');

$output = $tokens->to_data;

is(
    $output->{data}->{foundries},
    'dereko dereko/structure dereko/structure/base_sentences_paragraphs nkjp nkjp/morpho',
    'Foundries'
);

is(
    $output->{data}->{layerInfos},
    'dereko/s=spans nkjp/l=tokens nkjp/m=tokens nkjp/p=tokens',
    'layerInfos'
);

# All terms stored at stream position 7, joined so that each expected term
# can be matched with a single regular expression.
my $token = join('||', @{$output->{data}->{stream}->[7]});

# The segment span was asserted twice with an identical pattern; once is
# enough. The i:, nkjp/l: and s: patterns carry their "prefix:" part so they
# follow the same term shape as the other stream assertions in this file.
# NOTE(review): confirm the exact surface form and lemma against the fixture.
like($token, qr!<>:dereko/s:seg\$<b>64!, 'Term <>:dereko/s:seg');
like($token, qr!i:sadu!,                 'Term i:sadu');
like($token, qr!nkjp/l:sad!,             'Term nkjp/l:sad');
like($token, qr!nkjp/m:sg:gen:m3!,       'Term nkjp/m:sg:gen:m3');
like($token, qr!nkjp/p:subst!,           'Term nkjp/p:subst');
like($token, qr!s:sadu!,                 'Term s:sadu');
85
Akronafb98562022-06-08 14:45:09 +020086
# --- NKJP sample "KolakowskiOco": loading and metadata ------------------

# Fixture directory, resolved relative to this test file.
$path = catdir(dirname(__FILE__), qw(corpus NKJP NKJP KolakowskiOco));

# The constructor is handed the directory with a trailing slash.
$doc = KorAP::XML::Krill->new(path => "$path/");
ok($doc, 'Load Korap::Document');
ok($doc->parse, 'Parse document');

# Sigles at text, document and corpus level: [accessor, expected, label].
for my $sigle_check (
    ['text_sigle',   'NKJP/NKJP/KolakowskiOco', 'Correct text sigle'],
    ['doc_sigle',    'NKJP/NKJP',               'Correct document sigle'],
    ['corpus_sigle', 'NKJP',                    'Correct corpus sigle'],
) {
    my ($accessor, $expected, $label) = @{$sigle_check};
    is($doc->$accessor, $expected, $label);
}

$meta = $doc->meta;

is(
    $meta->{T_title},
    'TEI P5 encoded version of sample(s) of "O co nas pytają wielcy filozofowie. Seria 3 "',
    'Title'
);

# Every other metadata field checked here is expected to be false
# (unset or empty): [field key, label].
for my $absent_field (
    ['T_sub_title',     'SubTitle'],
    ['T_author',        'Author'],
    ['A_editor',        'Editor'],
    ['S_pub_place',     'PubPlace'],
    ['A_publisher',     'Publisher'],
    ['S_text_type',     'No Text Type'],
    ['S_text_type_art', 'No Text Type Art'],
    ['S_text_type_ref', 'No Text Type Ref'],
    ['S_text_domain',   'No Text Domain'],
    ['S_text_column',   'No Text Column'],
) {
    my ($field, $label) = @{$absent_field};
    ok(!$meta->{$field}, $label);
}
111
# Get tokenization
# Same base foundry and layer as for the first sample, applied to the
# document that was just parsed.
$tokens = KorAP::XML::Tokenizer->new(
    path    => $doc->path,
    doc     => $doc,
    foundry => $token_base_foundry,
    layer   => $token_base_layer,
    name    => 'tokens',
);
ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');

# Round-trip through the JSON serialisation and inspect the token stream.
$output = decode_json($tokens->to_json);

# Spot checks: [stream position, index within that position, expected term].
for my $probe (
    [0, 0, '-:tokens$<i>117'],
    [0, 3, 'i:czy'],
    [1, 2, 's:zdarza'],
) {
    my ($position, $index, $term) = @{$probe};
    is($output->{data}->{stream}->[$position]->[$index], $term, 't');
}
128
## Base
# Add the DeReKo structure annotation and the NKJP morpho annotation on top
# of the base tokenization. Each step gets its own label so that a failure
# names the annotation that could not be added.
ok(
    $tokens->add('DeReKo', 'Structure', 'base_sentences_paragraphs'),
    'Add DeReKo structure (base_sentences_paragraphs)'
);
ok($tokens->add('NKJP', 'Morpho'), 'Add NKJP morpho');

$output = $tokens->to_data;

is(
    $output->{data}->{foundries},
    'dereko dereko/structure dereko/structure/base_sentences_paragraphs nkjp nkjp/morpho',
    'Foundries'
);

is(
    $output->{data}->{layerInfos},
    'dereko/s=spans nkjp/l=tokens nkjp/m=tokens nkjp/p=tokens',
    'layerInfos'
);

# All terms stored at stream position 5, joined so that each expected term
# can be matched with a single regular expression.
$token = join('||', @{$output->{data}->{stream}->[5]});

like($token, qr!<>:dereko/s:seg\$<b>64<i>23<i>28<i>6<b>4<s>1!, 'Term <>:dereko/s:seg with payload');
like($token, qr!_5\$<i>23<i>28!,      'Term _5 with payload');
like($token, qr!i:takie!,             'Term i:takie');
like($token, qr!nkjp/l:taki!,         'Term nkjp/l:taki');
like($token, qr!nkjp/m:sg:nom:n:pos!, 'Term nkjp/m:sg:nom:n:pos');
like($token, qr!nkjp/p:adj!,          'Term nkjp/p:adj');
like($token, qr!s:takie!,             'Term s:takie');

# No fixed plan: the number of executed tests is declared at the end.
done_testing;
__END__
151