blob: 54f94e46c7c20100e17d287a2f0fa9c1b9a9ee49 [file] [log] [blame]
Akron414ec952020-08-03 15:48:43 +02001use strict;
2use warnings;
3use Test::More;
4use File::Basename 'dirname';
5use File::Spec::Functions 'catdir';
6use Data::Dumper;
7use KorAP::XML::Tokenizer;
8use KorAP::XML::Krill;
9use utf8;
10
# Skip every test in this file when the SKIP_REAL environment
# variable is set to a true value.
plan skip_all => 'Skip real tests' if $ENV{SKIP_REAL};
14
# Fixture directory, resolved relative to this test file
my $path = catdir(dirname(__FILE__), qw(TEST BSP 1));

# Build the document from the fixture path (with a trailing slash)
my $doc = KorAP::XML::Krill->new(path => "$path/");
ok($doc, 'Create Document');
ok($doc->parse(), 'Parse document');

# Tokenizer configured with the Sgbr foundry and its Lemma layer
my $tokens = KorAP::XML::Tokenizer->new(
  path    => $doc->path(),
  doc     => $doc,
  foundry => 'Sgbr',
  layer   => 'Lemma',
  name    => 'tokens',
);
ok($tokens, 'Create tokens based on lemmata');
ok($tokens->parse(), 'Parse tokenization based on lemmata');

# Adding the Sgbr/Lemma layer is expected to succeed for this fixture
ok($tokens->add('Sgbr', 'Lemma'), 'Add Structure');
34
# Check the serialized token stream of the first fixture.
# $stream is indexed as $stream->[token][term]; every assertion carries
# a unique description so that a failing TAP line identifies the exact
# token and term that broke.
my $data = $tokens->to_data->{data};

my $stream = $data->{stream};

# Token 0 additionally carries the token count and the text boundary
is($stream->[0][0], '-:tokens$<i>51', 'Token number');
is($stream->[0][1], '<>:base/s:t$<b>64<i>0<i>365<i>51<b>0', 'Text Boundary');
is($stream->[0][2], '_0$<i>0<i>18', 'Token 0: position');
is($stream->[0][3], 'i:sommerüberraschung', 'Token 0: i: term');
is($stream->[0][4], 's:Sommerüberraschung', 'Token 0: s: term');
is($stream->[0][5], 'sgbr/l:Sommerüberraschung', 'Token 0: sgbr/l: term');

# is() against undef passes only for an undefined value and, unlike
# ok(!defined ...), prints the unexpected term on failure
is($stream->[0][6], undef, 'Token 0: no further terms');

is($stream->[1][0], '_1$<i>19<i>21', 'Token 1: position');
is($stream->[1][1], 'i:es', 'Token 1: i: term');
is($stream->[1][2], 's:Es', 'Token 1: s: term');
is($stream->[1][3], 'sgbr/l:es', 'Token 1: sgbr/l: term');
is($stream->[1][4], 'sgbr/lv:er', 'Token 1: first sgbr/lv: term');
is($stream->[1][5], 'sgbr/lv:sie', 'Token 1: second sgbr/lv: term');

is($stream->[16][0], '_16$<i>107<i>115', 'Token 16: position');
is($stream->[16][1], 'i:guenther', 'Token 16: i: term');
is($stream->[16][2], 's:Guenther', 'Token 16: s: term');
is($stream->[16][3], 'sgbr/l:Günther', 'Token 16: sgbr/l: term');
is($stream->[16][4], 'sgbr/lv:Günter', 'Token 16: sgbr/lv: term');

# Negative index: the final token of the stream
is($stream->[-1][0], '_50$<i>359<i>364', 'Last token: position');
is($stream->[-1][1], 'i:kevin', 'Last token: i: term');
is($stream->[-1][2], 's:Kevin', 'Last token: s: term');
is($stream->[-1][3], 'sgbr/l:Kevin', 'Last token: sgbr/l: term');
63
64
# Real data 1: document 2843 of the CMC-TSK 2014-09 fixtures.
# The document and tokenizer are built as before, but adding the
# Sgbr/Lemma layer is expected to fail for this document.
$path = catdir(dirname(__FILE__), qw(CMC-TSK 2014-09 2843));

$doc = KorAP::XML::Krill->new(path => "$path/");
ok($doc, 'Create Document');
ok($doc->parse(), 'Parse document');

$tokens = KorAP::XML::Tokenizer->new(
  path    => $doc->path(),
  doc     => $doc,
  foundry => 'Sgbr',
  layer   => 'Lemma',
  name    => 'tokens',
);
ok($tokens, 'Create tokens based on lemmata');
ok($tokens->parse(), 'Parse tokenization based on lemmata');

# A false return value from add() is the expected outcome here
ok(!$tokens->add('Sgbr', 'Lemma'), 'Add Structure impossible - no token data');
85
86
# Real data 2: document 3401 of the CMC-TSK 2014-09 fixtures.
# Here adding the Sgbr/Lemma layer is expected to succeed.
$path = catdir(dirname(__FILE__), qw(CMC-TSK 2014-09 3401));

$doc = KorAP::XML::Krill->new(path => "$path/");
ok($doc, 'Create Document');
ok($doc->parse(), 'Parse document');

$tokens = KorAP::XML::Tokenizer->new(
  path    => $doc->path(),
  doc     => $doc,
  foundry => 'Sgbr',
  layer   => 'Lemma',
  name    => 'tokens',
);
ok($tokens, 'Create tokens based on lemmata');
ok($tokens->parse(), 'Parse tokenization based on lemmata');

ok($tokens->add('Sgbr', 'Lemma'), 'Add Structure');

# No fixed plan was declared, so close the test run explicitly
done_testing();
109
110__END__