blob: a8f8935ee85454de15a681b065dfd956982ea90e [file] [log] [blame]
Akroneac374d2020-07-07 09:00:44 +02001use strict;
2use warnings;
3use Test::More;
4use File::Basename 'dirname';
Akron510a88c2020-07-07 10:16:50 +02005use Data::Dumper;
Akroneac374d2020-07-07 09:00:44 +02006use File::Spec::Functions qw/catfile/;
7use File::Temp 'tempfile';
8
9use FindBin;
10BEGIN {
11 unshift @INC, "$FindBin::Bin/../lib";
12};
13
# The tokenizer module has to load before any of its functions are called.
require_ok('KorAP::XML::TEI::Tokenization');

# Test aggressive
#
# Each case pairs an input string with the flat offset list expected from
# the tokenizer. Going by the expectations, the list reads as consecutive
# (start, end) pairs with an exclusive end: "Der" in "Der alte Mann" is 0-3.
# In the second case "grau-melierte" is expected as three tokens
# (18-22, 22-23, 23-31).
my @aggressive_cases = (
  [
    'Der alte Mann',
    [0,3,4,8,9,13]
  ],
  [
    'Der alte bzw. der grau-melierte Mann',
    [0,3,4,8,9,12,12,13,14,17,18,22,22,23,23,31,32,36]
  ],
);

# Declared at file scope because later parts of this script assign to it again.
my $aggr;

for my $case (@aggressive_cases) {
  my ($text, $expected) = @$case;
  $aggr = KorAP::XML::TEI::Tokenization::aggressive($text);

  # The description quotes the input so a failing case can be identified
  # without counting test numbers.
  is_deeply($aggr, $expected, qq{aggressive offsets for "$text"});
}
Akroneac374d2020-07-07 09:00:44 +020022
# Test conservative
#
# Each case pairs an input string with the flat offset list expected from
# the tokenizer; the list reads as consecutive (start, end) pairs with an
# exclusive end. In the second case "grau-melierte" is expected as a single
# token (18-31). The remaining cases probe punctuation and whitespace at the
# edges of the input.
my @conservative_cases = (
  [
    'Der alte Mann',
    [0,3,4,8,9,13]
  ],
  [
    'Der alte bzw. der grau-melierte Mann',
    [0,3,4,8,9,12,12,13,14,17,18,31,32,36]
  ],
  [
    '. Der',
    [0,1,2,5]
  ],
  [
    ' . Der',
    [1,2,3,6]
  ],
  [
    # NOTE(review): the text under review repeated the single-space input
    # of the previous case here, which cannot yield offsets starting at 3.
    # The expected offsets imply three leading whitespace characters; they
    # are built explicitly so they cannot be collapsed again. Plain spaces
    # are assumed - confirm against the upstream test file.
    (' ' x 3) . '. Der',
    [3,4,5,8]
  ],
  [
    '... Der',
    [0,1,1,2,2,3,4,7]
  ],
  [
    '.Der',
    [1,4]
  ],
  [
    '.Der.... ',
    [1,4,4,5,5,6,6,7,7,8]
  ],
  [
    '..Der.... ',
    [0,1,1,2,2,5,5,6,6,7,7,8,8,9]
  ],
);

# Declared at file scope because later parts of this script assign to it again.
my $cons;

for my $case (@conservative_cases) {
  my ($text, $expected) = @$case;
  $cons = KorAP::XML::TEI::Tokenization::conservative($text);

  # The description quotes the input so leading and trailing whitespace
  # stays visible in the test output.
  is_deeply($cons, $expected, qq{conservative offsets for "$text"});
}
50
# Test data
#
# Run both tokenizers over a larger fixture that lives next to this script
# and check the size of the result plus the first few offsets.
my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia.txt');
my $data = '';

# Three-argument open with a lexical handle: the mode can no longer be
# influenced by the path, and no package-global bareword handle is left
# behind. ok() returns the test result, so the read is skipped when the
# open failed and $data stays empty (the checks below then fail visibly).
if (ok(open(my $fh, '<', $dataf), 'Open file')) {

  # Slurp the whole file in one read; the change to $/ is confined to
  # this block.
  local $/;
  my $content = <$fh>;
  $data = $content if defined $content;
  close($fh);
};

# Test::More expects (got, expected); with the arguments in that order the
# failure diagnostics label the two values correctly.
is(length($data), 137166, 'Length of test data');

$aggr = KorAP::XML::TEI::Tokenization::aggressive($data);
is_deeply(
  [@{$aggr}[0..7]],
  [1,7,8,12,14,18,19,22],
  'First aggressive offsets of test data'
);
is(scalar(@$aggr), 47242, 'Number of aggressive offsets of test data');

$cons = KorAP::XML::TEI::Tokenization::conservative($data);
is_deeply(
  [@{$cons}[0..7]],
  [1,7,8,12,14,18,19,22],
  'First conservative offsets of test data'
);
is(scalar(@$cons), 43068, 'Number of conservative offsets of test data');

done_testing;