# blob: 1d75e5fef2723c5cad8a61aacfe294563daf7807 [file] [log] [blame]
# Tests for the aggressive and the conservative KorAP::XML::TEI tokenizer.
use strict;
use warnings;
use Test::More;
use File::Basename 'dirname';
use File::Spec::Functions qw/catfile/;
use open qw(:std :utf8); # assume utf-8 encoding

# Prepend the lib/ directory next to this test directory to @INC, so the
# tokenizer modules are looked up there first.
use FindBin;
use lib "$FindBin::Bin/../lib";

# Both tokenizer classes have to load before they are instantiated below.
require_ok('KorAP::XML::TEI::Tokenizer::Aggressive');
require_ok('KorAP::XML::TEI::Tokenizer::Conservative');

# Test aggressive
# After tokenize() the tokenizer object itself is compared as a flat array
# of offsets; the expected lists read as start/end pairs, one pair per token.
my $aggr = KorAP::XML::TEI::Tokenizer::Aggressive->new;
$aggr->tokenize("Der alte Mann");
is_deeply($aggr, [0,3,4,8,9,13], 'Aggressive: plain words');

# Here "." (12-13) and "-" (22-23) show up as tokens of their own.
$aggr->reset->tokenize("Der alte bzw. der grau-melierte Mann");
is_deeply(
  $aggr,
  [0,3,4,8,9,12,12,13,14,17,18,22,22,23,23,31,32,36],
  'Aggressive: dot and hyphen are separate tokens'
);

# Test conservative
# As above, the tokenizer object is compared as a flat array of offsets.
my $cons = KorAP::XML::TEI::Tokenizer::Conservative->new;
$cons->tokenize("Der alte Mann");
is_deeply($cons, [0,3,4,8,9,13], 'Conservative: plain words');

# "grau-melierte" (18-31) is expected as one token; only "." is split off.
$cons->reset->tokenize("Der alte bzw. der grau-melierte Mann");
is_deeply(
  $cons,
  [0,3,4,8,9,12,12,13,14,17,18,31,32,36],
  'Conservative: hyphenated word stays one token'
);

# Leading blanks are built with the repetition operator so their number is
# explicit and cannot be lost to whitespace normalisation: the expected
# offsets start at 2, so exactly two characters must precede "Der".
$cons->reset->tokenize((' ' x 2) . "Der alte bzw. der grau-melierte Mann");
is_deeply(
  $cons,
  [2,5,6,10,11,14,14,15,16,19,20,33,34,38],
  'Conservative: two leading blanks shift all offsets by two'
);

$cons->reset->tokenize(". Der");
is_deeply($cons, [0,1,2,5], 'Conservative: leading dot, then word');

$cons->reset->tokenize(" . Der");
is_deeply($cons, [1,2,3,6], 'Conservative: one leading blank before dot');

# The dot is expected at offset 3, so three blanks have to precede it.
$cons->reset->tokenize((' ' x 3) . ". Der");
is_deeply($cons, [3,4,5,8], 'Conservative: three leading blanks before dot');

$cons->reset->tokenize("... Der");
is_deeply($cons, [0,1,1,2,2,3,4,7], 'Conservative: each leading dot is a token');

$cons->reset->tokenize(".Der");
is_deeply($cons, [0,1,1,4], 'Conservative: dot directly attached to word');

$cons->reset->tokenize(".Der.... ");
is_deeply(
  $cons,
  [0,1,1,4,4,5,5,6,6,7,7,8],
  'Conservative: attached dot, word, four trailing dots'
);

$cons->reset->tokenize("..Der.... ");
is_deeply(
  $cons,
  [0,1,1,2,2,5,5,6,6,7,7,8,8,9],
  'Conservative: two attached dots, word, four trailing dots'
);

$cons->reset->tokenize(". Der.... ");
is_deeply(
  $cons,
  [0,1,2,5,5,6,6,7,7,8,8,9],
  'Conservative: detached dot, word, four trailing dots'
);

$cons->reset->tokenize(". .Der.... ");
is_deeply(
  $cons,
  [0,1,2,3,3,6,6,7,7,8,8,9,9,10],
  'Conservative: detached dot, attached dot, word, four trailing dots'
);

$cons->reset->tokenize("Der\talte\nMann");
is_deeply($cons, [0,3,4,8,9,13], 'Conservative: tab and newline separate words');

## Test data
# Tokenize the wikipedia.txt fixture with both tokenizers and check the
# leading offsets as well as the overall number of offsets.
my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia.txt');
my $data = '';

# Three-argument open keeps the read mode separate from the file name, so
# nothing inside the path can be taken for part of the mode.
ok(open(my $fh, '<', $dataf), 'Open file wikipedia.txt');

# Read the file line by line until readline returns undef.
while (defined(my $line = <$fh>)) {
  $data .= $line;
}

ok(close($fh), 'Close file wikipedia.txt');

# Test::More expects the observed value first and the expected one second.
is(length($data), 134996, 'Length of wikipedia.txt');

$aggr->reset->tokenize($data);
is_deeply(
  [@{$aggr}[0..25]],
  [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,49,49,50,50,57,58,66,67,72,72,73],
  'Aggressive: first 26 offsets of wikipedia.txt'
);
is(scalar(@$aggr), 47112, 'Aggressive: number of offsets for wikipedia.txt');

$cons->reset->tokenize($data);
is_deeply(
  [@{$cons}[0..21]],
  [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,57,58,66,67,72,72,73],
  'Conservative: first 22 offsets of wikipedia.txt'
);
is(scalar(@$cons), 42412, 'Conservative: number of offsets for wikipedia.txt');

## check tokenization of 'Community-Ämter aufgestiegen'
## from @{cons}[19518] (=66070) to @{cons}[19519] (=66085) => 'Community-Ämter'
## from @{cons}[19520] (=66086) to @{cons}[19521] (=66098) => 'aufgestiegen'
# This relies on $cons still holding the offsets of the wikipedia.txt text
# tokenized before. The hard-coded offsets are the expectation, the slice
# taken from the tokenizer is the observed result.
my @vals_exp = (66070,66085,66086,66098);
my @vals_got = @{$cons}[19518..19521];
is_deeply(
  \@vals_got,
  \@vals_exp,
  "Conservative: offsets of 'Community-\xc4mter aufgestiegen' in wikipedia.txt"
);

# The same two words on their own: the word containing a non-ASCII letter
# is expected to span 15 offsets (0-15).
$cons->reset->tokenize("Community-\xc4mter aufgestiegen");
is_deeply($cons, [0,15,16,28], 'Conservative: word with umlaut, standalone');

# Repeat the fixture checks with wikipedia_small.txt, reusing the file-scope
# variables $dataf, $data and $fh.
$dataf = catfile(dirname(__FILE__), 'data', 'wikipedia_small.txt');
$data = '';

# Three-argument open keeps the read mode separate from the file name.
ok(open($fh, '<', $dataf), 'Open file wikipedia_small.txt');

# Read the file line by line until readline returns undef.
while (defined(my $line = <$fh>)) {
  $data .= $line;
}

ok(close($fh), 'Close file wikipedia_small.txt');

$aggr->reset->tokenize($data);
is_deeply(
  [@{$aggr}[0..25]],
  [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,49,49,50,50,57,58,66,67,72,72,73],
  'Aggressive: first 26 offsets of wikipedia_small.txt'
);
# Test::More expects the observed value first and the expected one second.
is(scalar(@$aggr), 366, 'Aggressive: number of offsets for wikipedia_small.txt');

$cons->reset->tokenize($data);
is_deeply(
  [@{$cons}[0..21]],
  [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,57,58,66,67,72,72,73],
  'Conservative: first 22 offsets of wikipedia_small.txt'
);
is(scalar(@$cons), 302, 'Conservative: number of offsets for wikipedia_small.txt');

done_testing;