# Source blob: d063eeda7e4c98a13ef38166578be0b8a62fcea0
use strict;
use warnings;
use Test::More;
use File::Basename 'dirname';
use File::Spec::Functions qw/catfile/;

use FindBin;

# Prepend the "lib" directory that sits one level above this script's
# directory to @INC. This runs at compile time, so that directory is
# searched first when the tokenizer modules are required below.
BEGIN {
  unshift @INC, "$FindBin::Bin/../lib";
};

# Both tokenizer implementations have to be loadable before they are used.
require_ok($_) for qw(
  KorAP::XML::TEI::Tokenizer::Aggressive
  KorAP::XML::TEI::Tokenizer::Conservative
);

# Test aggressive
# The tokenizer object is compared directly against a flat list of
# offsets; consecutive values form the (start, end) pair of one token.
my $aggr = KorAP::XML::TEI::Tokenizer::Aggressive->new;
$aggr->tokenize("Der alte Mann");
is_deeply(
  $aggr,
  [0,3,4,8,9,13],
  'Aggressive: "Der alte Mann"'
);

# reset() is chained before tokenize() so the same object can be reused.
# Here "." and "-" show up as tokens of their own (12-13, 22-23).
$aggr->reset->tokenize("Der alte bzw. der grau-melierte Mann");
is_deeply(
  $aggr,
  [0,3,4,8,9,12,12,13,14,17,18,22,22,23,23,31,32,36],
  'Aggressive: "Der alte bzw. der grau-melierte Mann"'
);

# Test conservative
my $cons = KorAP::XML::TEI::Tokenizer::Conservative->new;

# The first run uses the freshly constructed tokenizer without reset().
$cons->tokenize("Der alte Mann");
is_deeply($cons, [0,3,4,8,9,13], 'Conservative: "Der alte Mann"');

# Every further case is [ input, expected offsets ] and is run on the
# same object after reset(). Leading whitespace is significant: it
# shifts all expected offsets, so the number of leading spaces in each
# input must match the first expected offset.
my @cons_cases = (
  # "grau-melierte" stays a single token (18-31)
  ["Der alte bzw. der grau-melierte Mann",
   [0,3,4,8,9,12,12,13,14,17,18,31,32,36]],

  # Two leading spaces: every offset is shifted by 2
  ["  Der alte bzw. der grau-melierte Mann",
   [2,5,6,10,11,14,14,15,16,19,20,33,34,38]],

  # No leading space
  [". Der",
   [0,1,2,5]],

  # One leading space
  [" . Der",
   [1,2,3,6]],

  # Three leading spaces
  ["   . Der",
   [3,4,5,8]],

  # Each "." becomes a token of its own
  ["... Der",
   [0,1,1,2,2,3,4,7]],

  # TODO: known bug - the leading '.' is not tokenized. The expectation
  # below pins the current behaviour, not the desired one.
  [".Der",
   [1,4]],

  # The leading '.' is missing from the expected offsets here as well
  [".Der.... ",
   [1,4,4,5,5,6,6,7,7,8]],

  # With two leading dots both are tokenized
  ["..Der.... ",
   [0,1,1,2,2,5,5,6,6,7,7,8,8,9]],
);

foreach my $case (@cons_cases) {
  my ($input, $expected) = @$case;
  $cons->reset->tokenize($input);

  # The quoted input in the test name makes leading spaces visible
  is_deeply($cons, $expected, qq{Conservative: "$input"});
}

# Test data
# Load the fixture that lives in the "data" directory next to this script.
my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia.txt');
my $data = '';

# Three-argument open with a lexical handle. The file is only read when
# the open succeeded; otherwise the reason is reported and $data stays
# empty, so the length check below fails as well.
if (ok(open(my $fh, '<', $dataf), 'Open file')) {
  while (defined(my $line = <$fh>)) {
    $data .= $line;
  }
  close($fh);
}
else {
  diag("Unable to open '$dataf': $!");
}

# No decoding layer is requested in the open above.
# NOTE(review): the expected length is therefore presumably a byte
# count - confirm.
is(length($data), 137166, 'Test data has the expected length');

# Run both tokenizers over the complete test data. Only the first eight
# offsets and the total number of offsets are checked.
$aggr->reset->tokenize($data);
is_deeply(
  [@{$aggr}[0..7]],
  [1,7,8,12,14,18,19,22],
  'Aggressive: first eight offsets of the test data'
);
is(scalar(@$aggr), 47242, 'Aggressive: number of offsets in the test data');

$cons->reset->tokenize($data);
is_deeply(
  [@{$cons}[0..7]],
  [1,7,8,12,14,18,19,22],
  'Conservative: first eight offsets of the test data'
);
is(scalar(@$cons), 43068, 'Conservative: number of offsets in the test data');

done_testing;