blob: 7469efdc6aa6a6ba41f947b654800962686b7335 [file] [log] [blame]
use strict;
use warnings;
use Test::More;
use File::Basename 'dirname';
use File::Spec::Functions qw/catfile/;

use FindBin;
BEGIN {
  # Put the lib/ directory next to this test directory at the front of
  # @INC so the modules from this checkout are the ones that get loaded.
  unshift @INC, "$FindBin::Bin/../lib";
};

# Load the two tokenizer classes under test.
require_ok('KorAP::XML::TEI::Tokenizer::Aggressive');
require_ok('KorAP::XML::TEI::Tokenizer::Conservative');

# Test aggressive
# The tokenizer object is compared as a flat array of character offsets,
# laid out as (start, end) pairs: "Der" => 0,3  "alte" => 4,8  "Mann" => 9,13.
my $aggr = KorAP::XML::TEI::Tokenizer::Aggressive->new;
$aggr->tokenize("Der alte Mann");
is_deeply($aggr, [0,3,4,8,9,13], 'Aggressive: plain words');

# The abbreviation dot (12-13) and the hyphen in "grau-melierte" (22-23)
# are both expected as separate tokens here.
$aggr->reset->tokenize("Der alte bzw. der grau-melierte Mann");
is_deeply(
  $aggr,
  [0,3,4,8,9,12,12,13,14,17,18,22,22,23,23,31,32,36],
  'Aggressive: abbreviation dot and hyphen are split off'
);

# Test conservative
# Expected values are flat arrays of (start, end) character offsets,
# one pair per token.
my $cons = KorAP::XML::TEI::Tokenizer::Conservative->new;
$cons->tokenize("Der alte Mann");
is_deeply($cons, [0,3,4,8,9,13], 'Conservative: plain words');

# "grau-melierte" is expected as a single token (18-31); the abbreviation
# dot after "bzw" is still expected as its own token (12-13).
$cons->reset->tokenize("Der alte bzw. der grau-melierte Mann");
is_deeply(
  $cons,
  [0,3,4,8,9,12,12,13,14,17,18,31,32,36],
  'Conservative: hyphenated word is kept together'
);

# Leading punctuation, preceded by zero, one and three spaces.
$cons->reset->tokenize(". Der");
is_deeply($cons, [0,1,2,5], 'Conservative: leading dot, no leading space');

$cons->reset->tokenize(" . Der");
is_deeply($cons, [1,2,3,6], 'Conservative: leading dot after one space');

# The expected offsets (dot at 3-4, word at 5-8) require three leading
# spaces in the input.
$cons->reset->tokenize("   . Der");
is_deeply($cons, [3,4,5,8], 'Conservative: leading dot after three spaces');

$cons->reset->tokenize("... Der");
is_deeply($cons, [0,1,1,2,2,3,4,7], 'Conservative: three leading dots');

# TODO: Known bug - a single leading '.' that is directly followed by a
# word is not tokenized (the token 0-1 is missing from the result).
# The next two expectations pin the current behaviour and need to be
# updated once the tokenizer is fixed.
$cons->reset->tokenize(".Der");
is_deeply($cons, [1,4], 'Conservative: ".Der" (leading dot currently dropped)');

$cons->reset->tokenize(".Der.... ");
is_deeply(
  $cons,
  [1,4,4,5,5,6,6,7,7,8],
  'Conservative: ".Der...." (leading dot currently dropped)'
);

# With two leading dots both of them are tokenized.
$cons->reset->tokenize("..Der.... ");
is_deeply(
  $cons,
  [0,1,1,2,2,5,5,6,6,7,7,8,8,9],
  'Conservative: "..Der...." (both leading dots tokenized)'
);

# Test data
# Run both tokenizers over a larger fixture file that lives in the data/
# directory next to this test script.
my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia.txt');
my $data = '';

# Lexical handle and three-argument open; remember $! right away so the
# diagnostic cannot be clobbered by the test output.
my $opened = open(my $fh, '<', $dataf);
my $open_error = $!;
ok($opened, 'Open file') or diag("Unable to open $dataf: $open_error");

if ($opened) {
  # Slurp the whole file in one read (no I/O layer, same as before).
  local $/ = undef;
  my $content = <$fh>;
  $data = $content if defined $content;
  close($fh);
};

# Argument order is is(got, expected, name), so that failure
# diagnostics label the values correctly.
is(length($data), 137166, 'Length of test data');

$aggr->reset->tokenize($data);
is_deeply(
  [@{$aggr}[0..7]],
  [1,7,8,12,14,18,19,22],
  'Aggressive: first offsets of test data'
);
is(scalar(@$aggr), 47242, 'Aggressive: number of offsets in test data');

$cons->reset->tokenize($data);
is_deeply(
  [@{$cons}[0..7]],
  [1,7,8,12,14,18,19,22],
  'Conservative: first offsets of test data'
);
is(scalar(@$cons), 43068, 'Conservative: number of offsets in test data');

done_testing;