blob: b132a6308084e5bd98eb483af07ddc55cd3e26c4 [file] [log] [blame]
Akroneac374d2020-07-07 09:00:44 +02001use strict;
2use warnings;
3use Test::More;
4use File::Basename 'dirname';
5use File::Spec::Functions qw/catfile/;
Peter Harders42e18a62020-07-21 02:43:26 +02006use IO::Uncompress::Unzip;
Peter Harders994aff72020-07-25 09:53:35 +02007use open qw(:std :utf8); # assume utf-8 encoding
Akroneac374d2020-07-07 09:00:44 +02008
use FindBin;

# Put the lib directory that sits next to this script's directory at the
# front of @INC. This happens at compile time because of the BEGIN block.
BEGIN {
  unshift(@INC, "$FindBin::Bin/../lib");
}

# Load the test helper (importing korap_tempfile) and verify that every
# module exercised below can be required.
use_ok('Test::KorAP::XML::TEI', 'korap_tempfile');
require_ok($_) for (
  'KorAP::XML::TEI::Tokenizer::Aggressive',
  'KorAP::XML::TEI::Tokenizer::Conservative',
  'KorAP::XML::TEI::Zipper',
);

# Test aggressive
# $aggr stays file-scoped: later sections reset and reuse it.
my $aggr = KorAP::XML::TEI::Tokenizer::Aggressive->new;

# Plain sentence with three words.
my $plain_text = "Der alte Mann";
$aggr->tokenize($plain_text);
is_deeply(
  $aggr,
  [0, 3, 4, 8, 9, 13]
);

# The expected offsets split "bzw." at the period and "grau-melierte"
# at the hyphen.
my $mixed_text = "Der alte bzw. der grau-melierte Mann";
$aggr->reset->tokenize($mixed_text);
is_deeply(
  $aggr,
  [0, 3, 4, 8, 9, 12, 12, 13, 14, 17, 18, 22, 22, 23, 23, 31, 32, 36]
);

# Test conservative
# $cons stays file-scoped: later sections reset and reuse it.
my $cons = KorAP::XML::TEI::Tokenizer::Conservative->new;

# First run on a fresh tokenizer, without a preceding reset.
$cons->tokenize("Der alte Mann");
is_deeply($cons, [0,3,4,8,9,13], 'Conservative: simple sentence');

# Each case is [description, input text, expected offsets].
# The amount of leading whitespace in the inputs is significant: the
# expected offsets of the "two leading spaces" and "three spaces before
# leading period" cases start at 2 and 3 respectively, so those inputs must
# carry exactly that many leading blanks.
my @cons_cases = (
  [
    'abbreviation and hyphenated word',
    "Der alte bzw. der grau-melierte Mann",
    [0,3,4,8,9,12,12,13,14,17,18,31,32,36]
  ],
  [
    'two leading spaces',
    "  Der alte bzw. der grau-melierte Mann",
    [2,5,6,10,11,14,14,15,16,19,20,33,34,38]
  ],
  [
    'leading period',
    ". Der",
    [0,1,2,5]
  ],
  [
    'one space before leading period',
    " . Der",
    [1,2,3,6]
  ],
  [
    'three spaces before leading period',
    "   . Der",
    [3,4,5,8]
  ],
  [
    'three leading periods',
    "... Der",
    [0,1,1,2,2,3,4,7]
  ],
  [
    'period attached to word',
    ".Der",
    [0,1,1,4]
  ],
  [
    'one period before and four after word',
    ".Der.... ",
    [0,1,1,4,4,5,5,6,6,7,7,8]
  ],
  [
    'two periods before and four after word',
    "..Der.... ",
    [0,1,1,2,2,5,5,6,6,7,7,8,8,9]
  ],
  [
    'separated period before and four periods after word',
    ". Der.... ",
    [0,1,2,5,5,6,6,7,7,8,8,9]
  ],
  [
    'separated and attached period before word',
    ". .Der.... ",
    [0,1,2,3,3,6,6,7,7,8,8,9,9,10]
  ],
  [
    'tab and newline between words',
    "Der\talte\nMann",
    [0,3,4,8,9,13]
  ],
);

for my $case (@cons_cases) {
  my ($label, $text, $expected) = @$case;
  $cons->reset->tokenize($text);
  is_deeply($cons, $expected, "Conservative: $label");
}

## Test data
# Tokenize the wikipedia.txt fixture with both tokenizers and pin the text
# length, the leading offsets and the total number of offsets.
# $dataf, $data and $fh stay file-scoped: the next section reuses them.
my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia.txt');
my $data = '';

# Three-argument open keeps the mode separate from the path, so nothing in
# the file name can be interpreted as part of the open mode.
ok(open(my $fh, '<', $dataf), 'Open file wikipedia.txt');

# Append the file to $data one line at a time.
while (!eof($fh)) {
  $data .= <$fh>;
}

ok(close($fh), 'Close file wikipedia.txt');

# Test::More expects the computed value first and the expected value second,
# so that failure diagnostics label them correctly.
is(length($data), 134996, 'Length of wikipedia.txt');

$aggr->reset->tokenize($data);
is_deeply(
  [@{$aggr}[0..25]],
  [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,49,49,50,50,57,58,66,67,72,72,73],
  'Aggressive: leading offsets of wikipedia.txt'
);
is(scalar(@$aggr), 47112, 'Aggressive: number of offsets for wikipedia.txt');

$cons->reset->tokenize($data);
is_deeply(
  [@{$cons}[0..21]],
  [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,57,58,66,67,72,72,73],
  'Conservative: leading offsets of wikipedia.txt'
);
is(scalar(@$cons), 42412, 'Conservative: number of offsets for wikipedia.txt');

## check tokenization of 'Community-Ämter aufgestiegen'
## from @{cons}[19518] (=66070) to @{cons}[19519] (=66085) => 'Community-Ämter'
## from @{cons}[19520] (=66086) to @{cons}[19521] (=66098) => 'aufgestiegen'
# @vals_exp holds the literal expectation, @vals_got what the tokenizer
# actually produced at those positions.
my @vals_exp = (66070, 66085, 66086, 66098);
my @vals_got = @{$cons}[19518, 19519, 19520, 19521];
is_deeply(
  \@vals_got,
  \@vals_exp,
  'Conservative: offsets 19518..19521 for wikipedia.txt'
);

# Same two words in isolation; "\xc4" is a single non-ASCII character.
$cons->reset->tokenize("Community-\xc4mter aufgestiegen");
is_deeply(
  $cons,
  [0,15,16,28],
  'Conservative: hyphenated word containing a non-ASCII character'
);

# Repeat the fixture checks with wikipedia_small.txt, reusing the
# file-scoped $dataf, $data and $fh declared for the previous fixture.
$dataf = catfile(dirname(__FILE__), 'data', 'wikipedia_small.txt');
$data = '';

# Three-argument open keeps the mode separate from the path.
ok(open($fh, '<', $dataf), 'Open file wikipedia_small.txt');

# Append the file to $data one line at a time.
while (!eof($fh)) {
  $data .= <$fh>;
}
ok(close($fh), 'Close file wikipedia_small.txt');

$aggr->reset->tokenize($data);
is_deeply(
  [@{$aggr}[0..25]],
  [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,49,49,50,50,57,58,66,67,72,72,73],
  'Aggressive: leading offsets of wikipedia_small.txt'
);
# Computed value first, expected value second, as Test::More expects.
is(scalar(@$aggr), 366, 'Aggressive: number of offsets for wikipedia_small.txt');

$cons->reset->tokenize($data);
is_deeply(
  [@{$cons}[0..21]],
  [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,57,58,66,67,72,72,73],
  'Conservative: leading offsets of wikipedia_small.txt'
);
is(scalar(@$cons), 302, 'Conservative: number of offsets for wikipedia_small.txt');


subtest 'Test Zipper' => sub {
  # Write the offsets of a tokenized sentence into a zip stream named
  # tokens.xml and verify that this member can be read back.
  # $fh and $aggr here are local to the subtest and shadow the file-scoped
  # variables of the same name.
  my ($fh, $outzip) = korap_tempfile('tokenize_zipper');
  my $zip = KorAP::XML::TEI::Zipper->new($outzip);
  $fh->close;

  my $aggr = KorAP::XML::TEI::Tokenizer::Aggressive->new;
  $aggr->tokenize("Der alte Mann");
  ok($aggr->to_zip(
    $zip->new_stream('tokens.xml'),
    'fun'
  ), 'Written successfully');

  $zip->close;

  ok(-e $outzip, 'Zip exists');

  # The constructor returns undef when the archive or the requested member
  # cannot be opened. Guard the method call so that case is reported as a
  # failed test instead of dying on an undefined value.
  my $unzip = IO::Uncompress::Unzip->new($outzip, Name => 'tokens.xml');
  ok($unzip && !$unzip->eof, 'Unzip successful')
    or diag("Unzip error: $IO::Uncompress::Unzip::UnzipError");
};


done_testing;