blob: 3a89b180db6789926f07813bd2b7441ce6c6dcb7 [file] [log] [blame]
Akroneac374d2020-07-07 09:00:44 +02001use strict;
2use warnings;
3use Test::More;
4use File::Basename 'dirname';
5use File::Spec::Functions qw/catfile/;
6use File::Temp 'tempfile';
7
8use FindBin;
# Put the lib/ directory that sits one level above the running script's
# directory at the front of the module search path, at compile time, so the
# module under test is found before any other copy.
BEGIN {
  my $lib_dir = join('/', $FindBin::Bin, '..', 'lib');
  unshift(@INC, $lib_dir);
}
12
# The tokenizer module has to load before any of its functions are called.
require_ok('KorAP::XML::TEI::Tokenization');

# Aggressive tokenization of a short sentence. The result is an array
# reference holding a flat list of start/end offsets
# (Der = 0-3, alte = 4-8, Mann = 9-13).
my $aggr = KorAP::XML::TEI::Tokenization::aggressive("Der alte Mann");
is_deeply($aggr, [0,3,4,8,9,13], 'Aggressive tokenization of a simple sentence');

# Conservative tokenization of the same sentence; for input containing only
# words and single spaces the expected offsets are the same.
my $cons = KorAP::XML::TEI::Tokenization::conservative("Der alte Mann");
is_deeply($cons, [0,3,4,8,9,13], 'Conservative tokenization of a simple sentence');
22
# Load the sample text stored as data/wikipedia.txt next to this test file.
my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia.txt');
my $data = '';

# Three-argument open with a lexical handle: the file name can never be
# misread as an open mode and no package-global handle is left behind.
# No I/O layer is requested, so the content is read exactly as before.
if (ok(open(my $fh, '<', $dataf), 'Open file')) {
  # Undefine the record separator for this block only, so that a single
  # read returns the whole file.
  local $/;
  my $content = <$fh>;
  $data = $content if defined $content;
  close($fh);
}

# Got value first, expected value second, so that a failure report labels
# the two numbers correctly.
is(length($data), 137166, 'Length of test data');
34
# Aggressive tokenization of the full sample text: check the first eight
# offsets and the total number of offsets returned.
$aggr = KorAP::XML::TEI::Tokenization::aggressive($data);
is_deeply(
  [@{$aggr}[0..7]],
  [1,7,8,12,14,18,19,22],
  'First offsets of aggressive tokenization'
);
# Got value first, expected value second, so failure diagnostics read correctly.
is(scalar(@$aggr), 47242, 'Number of offsets of aggressive tokenization');

# Conservative tokenization of the same text: the first eight offsets match
# the aggressive result, but fewer offsets are expected overall.
$cons = KorAP::XML::TEI::Tokenization::conservative($data);
is_deeply(
  [@{$cons}[0..7]],
  [1,7,8,12,14,18,19,22],
  'First offsets of conservative tokenization'
);
is(scalar(@$cons), 43068, 'Number of offsets of conservative tokenization');

done_testing;