blob: 932407bdbff46ecdbe4ba82ac8f008f2dbb5a400 [file] [log] [blame]
Akroneac374d2020-07-07 09:00:44 +02001use strict;
2use warnings;
3use Test::More;
4use File::Basename 'dirname';
5use File::Spec::Functions qw/catfile/;
6use File::Temp 'tempfile';
7
8use FindBin;
# Put the lib/ directory next to this test directory at the front of @INC
# at compile time, before the modules under test are loaded.
BEGIN {
  my $lib_dir = "$FindBin::Bin/../lib";
  unshift(@INC, $lib_dir);
};
12
# Both tokenizer classes must be loadable before anything else is tested
require_ok('KorAP::XML::TEI::Tokenizer::Aggressive');
require_ok('KorAP::XML::TEI::Tokenizer::Conservative');

# Test aggressive
# The tokenizer object itself is compared against the expected list of
# offsets; $aggr stays in file scope because the data tests reuse it.
my $aggr = KorAP::XML::TEI::Tokenizer::Aggressive->new;
$aggr->tokenize("Der alte Mann");
is_deeply($aggr, [0,3,4,8,9,13], 'Aggressive: simple words');

# reset is called before the same tokenizer is fed a new input
$aggr->reset->tokenize("Der alte bzw. der grau-melierte Mann");
is_deeply(
  $aggr,
  [0,3,4,8,9,12,12,13,14,17,18,22,22,23,23,31,32,36],
  'Aggressive: abbreviation and hyphenated word'
);
Akroneac374d2020-07-07 09:00:44 +020023
# Test conservative
# $cons stays in file scope because the data tests reuse it.
my $cons = KorAP::XML::TEI::Tokenizer::Conservative->new;

# The first case runs on the freshly constructed tokenizer, without reset
$cons->tokenize("Der alte Mann");
is_deeply($cons, [0,3,4,8,9,13], 'Conservative: simple words');

# The remaining cases reuse the tokenizer; reset is called before each
# input. Each entry is [input text, expected offsets].
for my $case (
  ["Der alte bzw. der grau-melierte Mann", [0,3,4,8,9,12,12,13,14,17,18,31,32,36]],
  [". Der",       [0,1,2,5]],
  [" . Der",      [1,2,3,6]],
  # Three leading blanks: the expected offsets place the dot at 3 and
  # "Der" at 5, so this input must differ from the one-blank case above.
  ["   . Der",    [3,4,5,8]],
  ["... Der",     [0,1,1,2,2,3,4,7]],
  [".Der",        [1,4]],
  [".Der.... ",   [1,4,4,5,5,6,6,7,7,8]],
  ["..Der.... ",  [0,1,1,2,2,5,5,6,6,7,7,8,8,9]],
) {
  my ($text, $offsets) = @$case;
  $cons->reset->tokenize($text);
  is_deeply($cons, $offsets, "Conservative: '$text'");
}
52
# Test data
my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia.txt');
my $data = '';

# Three-argument open with a lexical handle, so the file name can never be
# interpreted as part of the mode. No I/O layer is requested, so the file is
# read with the same default layers as before.
if (ok(open(my $fh, '<', $dataf), 'Open file')) {
  local $/;    # slurp the whole file in a single read
  my $content = <$fh>;
  $data = $content if defined $content;
  close($fh);
}
else {
  diag("Unable to open $dataf: $!");
}

# Test::More expects (got, expected) so that failure output is labelled
# correctly
is(length($data), 137166, 'Length of test data');

$aggr->reset->tokenize($data);
is_deeply(
  [@{$aggr}[0..7]],
  [1,7,8,12,14,18,19,22],
  'Aggressive: first offsets of test data'
);
is(scalar(@$aggr), 47242, 'Aggressive: number of offsets in test data');

$cons->reset->tokenize($data);
is_deeply(
  [@{$cons}[0..7]],
  [1,7,8,12,14,18,19,22],
  'Conservative: first offsets of test data'
);
is(scalar(@$cons), 43068, 'Conservative: number of offsets in test data');

done_testing;