blob: 09987112f5e625cc00c244ca8276d7a819e6ce91 [file] [log] [blame]
use strict;
use warnings;
use Test::More;
use Data::Dumper;
use JSON::XS;

# Skip the whole file unless tests against real corpus data are enabled.
plan skip_all => 'Skip real tests' if $ENV{SKIP_REAL};

use Benchmark qw/:hireswallclock/;

# Wall-clock start marker; NOTE(review): never read again below — presumably
# a leftover from manual benchmarking.
my $t = Benchmark->new;

use utf8;
use lib 'lib', '../lib';

use File::Basename 'dirname';
use File::Spec::Functions 'catdir';

# The conversion module under test must load cleanly.
use_ok('KorAP::XML::Krill');

# This will Check LWC annotations

# New

my $path = catdir(dirname(__FILE__), '../corpus/WPD17/000/22053');

# Load and parse the first test document.
ok(my $doc = KorAP::XML::Krill->new(path => "$path/"), 'Load Korap::Document');
ok($doc->parse, 'Parse document');

is($doc->text_sigle,   'WPD17/000/22053', 'Correct text sigle');
is($doc->doc_sigle,    'WPD17/000',       'Correct document sigle');
is($doc->corpus_sigle, 'WPD17',           'Correct corpus sigle');

my $meta = $doc->meta;

# Metadata expectations, kept in the original assertion order so the TAP
# stream is unchanged. An undef expectation means the field must be
# empty/false (checked via ok(!...)).
my @meta_checks = (
  [T_title     => '0er',                         'Title'],
  [S_pub_place => 'URL:http://de.wikipedia.org', 'PubPlace'],
  [D_pub_date  => '20170701',                    'Creation Date'],
  [T_sub_title => undef,                         'SubTitle'],
  [T_author    => 'Rogi.Official, u.a.',         'Author'],
  [A_publisher => 'Wikipedia',                   'Publisher'],
  [A_editor    => 'wikipedia.org',               'Editor'],
  [translator  => undef,                         'Translator'],
  [S_text_type     => 'Enzyklopädie',            'Correct Text Type'],
  [S_text_type_art => 'Enzyklopädie-Artikel',    'Correct Text Type Art'],
  [S_text_type_ref => undef,                     'Correct Text Type Ref'],
  [S_text_column   => undef,                     'Correct Text Column'],
  [S_text_domain   => undef,                     'Correct Text Domain'],
  [D_creation_date => '20150511',                'Creation Date'],
  [pages                    => undef,            'Pages'],
  [A_file_edition_statement => undef,            'File Ed Statement'],
  [A_bibl_edition_statement => undef,            'Bibl Ed Statement'],
  [A_reference => '0er, In: Wikipedia - URL:http://de.wikipedia.org/wiki/0er: Wikipedia, 2017', 'Reference'],
  [S_language  => 'de',                          'Language'],
  [T_corpus_title     => 'Wikipedia',            'Correct Corpus title'],
  [T_corpus_sub_title => undef,                  'Correct Corpus Sub title'],
  [T_corpus_author    => undef,                  'Correct Corpus author'],
  [A_corpus_editor    => 'wikipedia.org',        'Correct Corpus editor'],
  [T_doc_title => 'Wikipedia, Artikel mit Anfangszahl 0, Teil 00', 'Correct Doc title'],
  [T_doc_sub_title => undef,                     'Correct Doc Sub title'],
  [T_doc_author    => undef,                     'Correct Doc author'],
  [A_doc_editor    => undef,                     'Correct Doc editor'],
);

for my $check (@meta_checks) {
  my ($key, $want, $desc) = @$check;
  if (defined $want) {
    is($meta->{$key}, $want, $desc);
  }
  else {
    ok(!$meta->{$key}, $desc);
  }
}

# Tokenization
use_ok('KorAP::XML::Tokenizer');

my ($token_base_foundry, $token_base_layer) = qw/Base Tokens/;

# Build the token stream from the document's base tokenization.
my $tokens = KorAP::XML::Tokenizer->new(
  path    => $doc->path,
  doc     => $doc,
  foundry => $token_base_foundry,
  layer   => $token_base_layer,
  name    => 'tokens'
);
ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');

# Round-trip the plain serialization once before any annotations are added.
my $output = decode_json($tokens->to_json);

## Base
$tokens->add('DeReKo', 'Structure', 'base_sentences_paragraphs');

# LWC
ok($tokens->add('LWC', 'Dependency'), 'Add LWC dependency annotations');

$output = $tokens->to_data;

is(
  $output->{data}{foundries},
  'dereko dereko/structure dereko/structure/base_sentences_paragraphs lwc lwc/dependency',
  'Foundries'
);

is($output->{data}{layerInfos}, 'dereko/s=spans lwc/d=rels', 'layerInfos');

# Inspect individual tokens of the annotation stream.
my $token = join '||', @{$output->{data}{stream}[7]};

like($token, qr!>:lwc/d:SVP\$<b>32<i>4!, 'data');
like($token, qr!i:statt!, 'data');

$token = join '||', @{$output->{data}{stream}[9]};

like($token, qr!>:lwc/d:--\$<b>33<i>64<i>76<i>8<i>11!, 'data');
like($token, qr!s:Januar!, 'data');


# Second document: WPD17/060/18486
$path = catdir(dirname(__FILE__), '../corpus/WPD17/060/18486');

ok($doc = KorAP::XML::Krill->new(path => "$path/"), 'Load Korap::Document');
ok($doc->parse, 'Parse document');

$meta = $doc->meta;

is($meta->{T_doc_title}, 'Wikipedia, Artikel mit Anfangszahl 0, Teil 60', 'No doc title');
ok(!exists $meta->{translator}, 'No translator');

is($meta->{K_text_class}[0], 'staat-gesellschaft', 'text class');
is($meta->{K_text_class}[1], 'verbrechen', 'text class');

# Rebuild the tokenizer for the second document.
$tokens = KorAP::XML::Tokenizer->new(
  path    => $doc->path,
  doc     => $doc,
  foundry => $token_base_foundry,
  layer   => $token_base_layer,
  name    => 'tokens'
);
ok($tokens, 'Token Object is fine');
ok($tokens->parse, 'Token parsing is fine');

## Base
$tokens->add('DeReKo', 'Structure', 'base_sentences_paragraphs');

# LWC
ok($tokens->add('LWC', 'Dependency'), 'Add LWC dependency annotations');

$output = decode_json($tokens->to_json);

$token = join '||', @{$output->{data}{stream}[2]};

like($token, qr!>:lwc/d:SVP\$<b>32<i>1!, 'data');
like($token, qr!s:für!, 'data');

done_testing;
__END__