# Source blob: 3221dba83585d7dbba63797880bdbbc2da44b475
use strict;
use warnings;
use utf8;

use Test::More;
use File::Basename 'dirname';
use File::Spec::Functions 'catdir';
use Data::Dumper;

use KorAP::XML::Tokenizer;
use KorAP::XML::Krill;

# Fixture 1: the document stored under TEST/BSP/1 next to this test file.
# Builds a token stream from the Sgbr foundry's Lemma layer, adds the
# Sgbr/Lemma annotations and checks selected entries of the serialized
# stream.
my $path = catdir(dirname(__FILE__), 'TEST', 'BSP', 1);

ok(my $doc = KorAP::XML::Krill->new(
  path => $path . '/'
), 'Create Document');

ok($doc->parse, 'Parse document');

# The tokenizer reads from the same path the document was loaded from
ok(my $tokens = KorAP::XML::Tokenizer->new(
  path    => $doc->path,
  doc     => $doc,
  foundry => 'Sgbr',
  layer   => 'Lemma',
  name    => 'tokens'
), 'Create tokens based on lemmata');

ok($tokens->parse, 'Parse tokenization based on lemmata');

ok($tokens->add('Sgbr', 'Lemma'), 'Add Structure');

my $data = $tokens->to_data->{data};

# Array of token positions; each position is an array of term strings
my $stream = $data->{stream};

# First position: besides its own terms it is expected to carry the
# token count entry and the text boundary entry.
is($stream->[0][0], '-:tokens$<i>51', 'Token number');
is($stream->[0][1], '<>:base/s:t$<b>64<i>0<i>365<i>50<b>0', 'Text Boundary');
is($stream->[0][2], '_0$<i>0<i>18', 'Position');
is($stream->[0][3], 'i:sommerüberraschung', 'First term');
is($stream->[0][4], 's:Sommerüberraschung', 'First term');
is($stream->[0][5], 'sgbr/l:Sommerüberraschung', 'First term');

# No further entry may follow; is() against undef reports the unexpected
# value on failure, which a bare ok(!defined ...) would not.
is($stream->[0][6], undef, 'First term');

# Second position: one sgbr/l entry plus two sgbr/lv entries
# (presumably lemma variants - confirm against the Sgbr annotation layer)
is($stream->[1][0], '_1$<i>19<i>21', 'Position');
is($stream->[1][1], 'i:es', 'Second term');
is($stream->[1][2], 's:Es', 'Second term');
is($stream->[1][3], 'sgbr/l:es', 'Second term');
is($stream->[1][4], 'sgbr/lv:er', 'Second term');
is($stream->[1][5], 'sgbr/lv:sie', 'Second term');

# Position 16: surface form and lemma differ ("Guenther" vs. "Günther")
is($stream->[16][0], '_16$<i>107<i>115', 'Position');
is($stream->[16][1], 'i:guenther', '16th term');
is($stream->[16][2], 's:Guenther', '16th term');
is($stream->[16][3], 'sgbr/l:Günther', '16th term');
is($stream->[16][4], 'sgbr/lv:Günter', '16th term');

# Last position: index _50, consistent with the token count of 51 above
is($stream->[-1][0], '_50$<i>359<i>364', 'Position');
is($stream->[-1][1], 'i:kevin', 'Last term');
is($stream->[-1][2], 's:Kevin', 'Last term');
is($stream->[-1][3], 'sgbr/l:Kevin', 'Last term');


# Real data 1
# Document CMC-TSK/2014-09/2843: document and tokenizer are expected to
# parse, but adding the Sgbr/Lemma layer must fail (the assertion below
# names missing token data as the reason).
# Reuses $path, $doc and $tokens declared earlier in this file.
$path = catdir(dirname(__FILE__), 'CMC-TSK', '2014-09', '2843');

ok($doc = KorAP::XML::Krill->new(
  path => $path . '/'
), 'Create Document');

ok($doc->parse, 'Parse document');

ok($tokens = KorAP::XML::Tokenizer->new(
  path    => $doc->path,
  doc     => $doc,
  foundry => 'Sgbr',
  layer   => 'Lemma',
  name    => 'tokens'
), 'Create tokens based on lemmata');

ok($tokens->parse, 'Parse tokenization based on lemmata');

# add() is expected to return a false value for this document
ok(!$tokens->add('Sgbr', 'Lemma'), 'Add Structure impossible - no token data');


# Real data 2
# Document CMC-TSK/2014-09/3401: unlike the previous document, adding the
# Sgbr/Lemma layer is expected to succeed here.
# Reuses $path, $doc and $tokens declared earlier in this file.
$path = catdir(dirname(__FILE__), 'CMC-TSK', '2014-09', '3401');

ok($doc = KorAP::XML::Krill->new(
  path => $path . '/'
), 'Create Document');

ok($doc->parse, 'Parse document');

ok($tokens = KorAP::XML::Tokenizer->new(
  path    => $doc->path,
  doc     => $doc,
  foundry => 'Sgbr',
  layer   => 'Lemma',
  name    => 'tokens'
), 'Create tokens based on lemmata');

ok($tokens->parse, 'Parse tokenization based on lemmata');

ok($tokens->add('Sgbr', 'Lemma'), 'Add Structure');

# No fixed plan: the number of tests is determined at the end of the run
done_testing;

__END__