# blob: 718a0ba433f138945bd54906d7dec0b15d1975ff [file] [log] [blame]
# Test script: builds a token stream for the fixture document
# CMC-TSK/2014-09/3401 (located next to this file) using the 'Sgbr'
# foundry with the 'Lemma' layer, adds the Base/Sentences annotation and
# checks the first five entries stored at stream position 0.
use strict;
use warnings;
use Test::More;
use File::Basename 'dirname';
use File::Spec::Functions 'catdir';
use Data::Dumper;
use KorAP::XML::Tokenizer;
use KorAP::XML::Krill;
use utf8;

# Fixture directory, resolved relative to this test file.
my $path = catdir(dirname(__FILE__), 'CMC-TSK', '2014-09', 3401);

# The document path is handed over with a trailing slash.
ok(my $doc = KorAP::XML::Krill->new(
  path => $path . '/'
), 'Create Document');

ok($doc->parse('Sgbr'), 'Parse document');

# Tokenizer configured with the Sgbr foundry and its Lemma layer.
ok(my $tokens = KorAP::XML::Tokenizer->new(
  path    => $doc->path,
  doc     => $doc,
  foundry => 'Sgbr',
  layer   => 'Lemma',
  name    => 'tokens'
), 'Create tokens based on lemmata');

ok($tokens->parse, 'Parse tokenization based on lemmata');

ok($tokens->add('Base', 'Sentences'), 'Add Sentences');

# Serialized stream: an array of positions, each an array of term strings.
my $stream = $tokens->to_data->{data}->{stream};

# Every assertion carries a description so that a failure is identifiable
# in the TAP output without looking up the line number.
# NOTE(review): the meaning of the individual <b>/<i> payload fields is not
# visible from this file - confirm against the Krill stream format.
is($stream->[0]->[0], '-:base/sentences$<i>1',
   'Position 0, entry 0: base/sentences');
is($stream->[0]->[1], '-:tokens$<i>15',
   'Position 0, entry 1: tokens');
is($stream->[0]->[2], '<>:base/s:t$<b>64<i>0<i>115<i>14<b>0',
   'Position 0, entry 2: base/s:t span');
is($stream->[0]->[3], '<>:base/s:s$<b>64<i>16<i>114<i>14<b>2',
   'Position 0, entry 3: base/s:s span');
is($stream->[0]->[4], '_0$<i>17<i>18',
   'Position 0, entry 4: _0');

done_testing;