# blob: 53321962dc8e15d5c62a4f459b7a66c7f271e688
use strict;
use warnings;
use Test::More;
use File::Basename 'dirname';
use File::Spec::Functions qw/catfile/;
use File::Temp 'tempfile';

use FindBin;

# Put the lib directory next to this test directory at the front of @INC,
# so the modules required below are looked up there first.
BEGIN {
  unshift @INC, "$FindBin::Bin/../lib";
};

# Both tokenizer classes must be loadable before they are exercised.
require_ok('KorAP::XML::TEI::Tokenizer::Aggressive');
require_ok('KorAP::XML::TEI::Tokenizer::Conservative');

# Test aggressive
#
# The tokenizer object is compared directly against a flat list of numbers.
# Judging by the expectations, these are (start, end) offset pairs, one pair
# per token, with the end offset exclusive.
my $aggr = KorAP::XML::TEI::Tokenizer::Aggressive->new;
$aggr->tokenize("Der alte Mann");
is_deeply($aggr, [0,3,4,8,9,13], 'Aggressive: plain words');

# Expected here: "bzw." yields "bzw" and ".", and "grau-melierte" yields
# "grau", "-" and "melierte".
$aggr->reset->tokenize("Der alte bzw. der grau-melierte Mann");
is_deeply(
  $aggr,
  [0,3,4,8,9,12,12,13,14,17,18,22,22,23,23,31,32,36],
  'Aggressive: period and hyphen are separate tokens'
);

# Test conservative
#
# Same flat (start, end) offset layout as in the aggressive tests.
my $cons = KorAP::XML::TEI::Tokenizer::Conservative->new;
$cons->tokenize("Der alte Mann");
is_deeply($cons, [0,3,4,8,9,13], 'Conservative: plain words');

# Expected here: "grau-melierte" stays one token (18-31), while the period
# after "bzw" is still split off.
$cons->reset->tokenize("Der alte bzw. der grau-melierte Mann");
is_deeply(
  $cons,
  [0,3,4,8,9,12,12,13,14,17,18,31,32,36],
  'Conservative: hyphenated word is kept together'
);

# Leading punctuation, with and without leading whitespace
$cons->reset->tokenize(". Der");
is_deeply($cons, [0,1,2,5], 'Conservative: leading period');

$cons->reset->tokenize(" . Der");
is_deeply($cons, [1,2,3,6], 'Conservative: one leading space');

# Three leading spaces: the expected offsets place the period at 3-4.
$cons->reset->tokenize("   . Der");
is_deeply($cons, [3,4,5,8], 'Conservative: three leading spaces');

$cons->reset->tokenize("... Der");
is_deeply($cons, [0,1,1,2,2,3,4,7], 'Conservative: each leading period is a token');

# TODO:
#   bug: '.' is not tokenized
#   The next two expectations pin the current behaviour, in which a single
#   leading '.' directly followed by a word yields no token of its own.
#   Update them once the tokenizer is fixed.
$cons->reset->tokenize(".Der");
is_deeply($cons, [1,4], 'Conservative: period attached to word start (known bug)');

$cons->reset->tokenize(".Der.... ");
is_deeply(
  $cons,
  [1,4,4,5,5,6,6,7,7,8],
  'Conservative: attached leading period and trailing periods (known bug)'
);

$cons->reset->tokenize("..Der.... ");
is_deeply(
  $cons,
  [0,1,1,2,2,5,5,6,6,7,7,8,8,9],
  'Conservative: two leading periods and trailing periods'
);

# Test data
my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia.txt');
my $data = '';

# Three-argument open on a lexical handle; no I/O layer is requested, matching
# the plain '<' mode used before, so the length check below is made on the
# data exactly as read. $data stays empty if the file cannot be opened.
if (ok(open(my $fh, '<', $dataf), 'Open file')) {
  local $/;    # slurp mode: read the whole file in one go
  $data = <$fh>;
  close($fh);
};

is(length($data), 137166, 'Length of test data');

# Only the first four offset pairs and the total number of offsets are
# checked for the large input.
$aggr->reset->tokenize($data);
is_deeply([@{$aggr}[0..7]], [1,7,8,12,14,18,19,22], 'Aggressive: first offsets of test data');
is(scalar(@$aggr), 47242, 'Aggressive: number of offsets for test data');

$cons->reset->tokenize($data);
is_deeply([@{$cons}[0..7]], [1,7,8,12,14,18,19,22], 'Conservative: first offsets of test data');
is(scalar(@$cons), 43068, 'Conservative: number of offsets for test data');

done_testing;