blob: 9d986a0385f711a868ae425a3fc4ee39e50641cf [file] [log] [blame]
Akroneac374d2020-07-07 09:00:44 +02001use strict;
2use warnings;
3use Test::More;
4use File::Basename 'dirname';
5use File::Spec::Functions qw/catfile/;
Peter Harders42e18a62020-07-21 02:43:26 +02006use IO::Uncompress::Unzip;
Peter Harders994aff72020-07-25 09:53:35 +02007use open qw(:std :utf8); # assume utf-8 encoding
Akroneac374d2020-07-07 09:00:44 +02008
9use FindBin;
10BEGIN {
11 unshift @INC, "$FindBin::Bin/../lib";
12};
13
Peter Harders42e18a62020-07-21 02:43:26 +020014use_ok('Test::KorAP::XML::TEI','korap_tempfile');
Akrond9627472020-07-09 16:53:09 +020015require_ok('KorAP::XML::TEI::Tokenizer::Aggressive');
16require_ok('KorAP::XML::TEI::Tokenizer::Conservative');
Peter Harders42e18a62020-07-21 02:43:26 +020017require_ok('KorAP::XML::TEI::Zipper');
Akroneac374d2020-07-07 09:00:44 +020018
# Test aggressive tokenizer.
# The tokenizer object is compared against a flat list of character offsets;
# the expected values read as (start, end) pairs, one pair per token.
my $aggr = KorAP::XML::TEI::Tokenizer::Aggressive->new;
ok($aggr->empty, 'Empty');
$aggr->tokenize("Der alte Mann");
ok(!$aggr->empty, 'Not empty');
is_deeply($aggr, [0,3,4,8,9,13], 'Aggressive: simple words');

# After reset() the tokenizer has to report empty again
$aggr->reset;
ok($aggr->empty, 'Empty');

# Expected: "bzw." gives two pairs (9-12, 12-13) and "grau-melierte"
# gives three pairs (18-22, 22-23, 23-31)
$aggr->tokenize("Der alte bzw. der grau-melierte Mann");
is_deeply(
  $aggr,
  [0,3,4,8,9,12,12,13,14,17,18,22,22,23,23,31,32,36],
  'Aggressive: abbreviation and hyphenated compound'
);

# reset() and tokenize() are chained, so each has to return an object
# that supports the following call
like(
  $aggr->reset->tokenize("Der")->to_string('a'),
  qr!id="t_0"!,
  'Chainable'
);
37
# Test conservative tokenizer.
# Expected values are flat lists of character offsets that read as
# (start, end) pairs, one pair per token.
my $cons = KorAP::XML::TEI::Tokenizer::Conservative->new;
$cons->tokenize("Der alte Mann");
is_deeply($cons, [0,3,4,8,9,13], 'Conservative: simple words');

# Expected: "grau-melierte" is covered by a single pair (18-31)
$cons->reset->tokenize("Der alte bzw. der grau-melierte Mann");
is_deeply(
  $cons,
  [0,3,4,8,9,12,12,13,14,17,18,31,32,36],
  'Conservative: abbreviation and hyphenated compound'
);

# Two leading blanks: every expected offset is shifted by 2
# compared to the previous case
$cons->reset->tokenize("  Der alte bzw. der grau-melierte Mann");
is_deeply(
  $cons,
  [2,5,6,10,11,14,14,15,16,19,20,33,34,38],
  'Conservative: two leading blanks'
);

$cons->reset->tokenize(". Der");
is_deeply($cons, [0,1,2,5], 'Conservative: leading period');

$cons->reset->tokenize(" . Der");
is_deeply($cons, [1,2,3,6], 'Conservative: one blank before leading period');

# Three leading blanks: the period is expected at 3-4, the word at 5-8
$cons->reset->tokenize("   . Der");
is_deeply($cons, [3,4,5,8], 'Conservative: three blanks before leading period');

$cons->reset->tokenize("... Der");
is_deeply($cons, [0,1,1,2,2,3,4,7], 'Conservative: three leading periods');

$cons->reset->tokenize(".Der");
is_deeply($cons, [0,1,1,4], 'Conservative: period attached to word');

$cons->reset->tokenize(".Der.... ");
is_deeply(
  $cons,
  [0,1,1,4,4,5,5,6,6,7,7,8],
  'Conservative: periods around word'
);

$cons->reset->tokenize("..Der.... ");
is_deeply(
  $cons,
  [0,1,1,2,2,5,5,6,6,7,7,8,8,9],
  'Conservative: two periods before word'
);

$cons->reset->tokenize(". Der.... ");
is_deeply(
  $cons,
  [0,1,2,5,5,6,6,7,7,8,8,9],
  'Conservative: period, blank, word, periods'
);

$cons->reset->tokenize(". .Der.... ");
is_deeply(
  $cons,
  [0,1,2,3,3,6,6,7,7,8,8,9,9,10],
  'Conservative: separated periods before word'
);

# Tab and newline are expected to separate tokens just like a blank
$cons->reset->tokenize("Der\talte\nMann");
is_deeply($cons, [0,3,4,8,9,13], 'Conservative: tab and newline as separators');
78
## Test data
# Tokenize two text files from the test's data directory with both tokenizers
# and compare the leading offsets and the total number of offsets.
my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia.txt');
my $data = '';

# Three-argument open: the path is never parsed as part of the mode.
# No layer is given in the mode, so the defaults of the 'use open' pragma
# still apply.
ok(open(my $fh, '<', $dataf), 'Open file wikipedia.txt');

# Read everything at once; an empty list of lines yields ''
$data = join('', <$fh>);

ok(close($fh), 'Close file wikipedia.txt');

is(length($data), 134996, 'Length of wikipedia.txt data');

$aggr->reset->tokenize($data);
is_deeply(
  [@{$aggr}[0..25]],
  [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,49,49,50,50,57,58,66,67,72,72,73],
  'Aggressive: leading offsets for wikipedia.txt'
);
is(scalar(@$aggr), 47112, 'Aggressive: number of offsets for wikipedia.txt');

$cons->reset->tokenize($data);
is_deeply(
  [@{$cons}[0..21]],
  [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,57,58,66,67,72,72,73],
  'Conservative: leading offsets for wikipedia.txt'
);
is(scalar(@$cons), 42412, 'Conservative: number of offsets for wikipedia.txt');

## check tokenization of 'Community-Ämter aufgestiegen'
##  from @{cons}[19518] (=66070) to @{cons}[19519] (=66085) => 'Community-Ämter'
##  from @{cons}[19520] (=66086) to @{cons}[19521] (=66098) => 'aufgestiegen'
my @vals_exp = (66070,66085,66086,66098);
my @vals_got = @{$cons}[19518..19521];
is_deeply(
  \@vals_got,
  \@vals_exp,
  'Conservative: offsets 19518-19521 for wikipedia.txt'
);

# Same two words in isolation; \xc4 is the code point of the umlaut
$cons->reset->tokenize("Community-\xc4mter aufgestiegen");
is_deeply($cons, [0,15,16,28], 'Conservative: hyphenated word with umlaut');

$dataf = catfile(dirname(__FILE__), 'data', 'wikipedia_small.txt');
$data = '';
ok(open($fh, '<', $dataf), 'Open file wikipedia_small.txt');
$data = join('', <$fh>);
ok(close($fh), 'Close file wikipedia_small.txt');

$aggr->reset->tokenize($data);
is_deeply(
  [@{$aggr}[0..25]],
  [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,49,49,50,50,57,58,66,67,72,72,73],
  'Aggressive: leading offsets for wikipedia_small.txt'
);
is(scalar(@$aggr), 366, 'Aggressive: number of offsets for wikipedia_small.txt');

$cons->reset->tokenize($data);
is_deeply(
  [@{$cons}[0..21]],
  [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,57,58,66,67,72,72,73],
  'Conservative: leading offsets for wikipedia_small.txt'
);
is(scalar(@$cons), 302, 'Conservative: number of offsets for wikipedia_small.txt');
Peter Harders854a1152020-07-22 22:48:02 +0200126
Akroneac374d2020-07-07 09:00:44 +0200127
subtest 'Test Zipper' => sub {
  # Get a temporary file, hand its path to the zipper and
  # close the temporary handle afterwards
  my ($tmp_fh, $zip_path) = korap_tempfile('tokenize_zipper');
  my $zipper = KorAP::XML::TEI::Zipper->new($zip_path);
  $tmp_fh->close;

  # Tokenize a short text
  my $tokenizer = KorAP::XML::TEI::Tokenizer::Aggressive->new;
  $tokenizer->tokenize("Der alte Mann");

  # Write the tokens to a new stream named 'tokens.xml' of the zipper
  my @zip_args = ($zipper->new_stream('tokens.xml'), 'fun');
  ok($tokenizer->to_zip(@zip_args), 'Written successfully');

  $zipper->close;

  # The archive has to exist on disk ...
  ok(-e $zip_path, 'Zip exists');

  # ... and the member 'tokens.xml' must not be at end-of-file right
  # after opening it
  my $reader = IO::Uncompress::Unzip->new($zip_path, Name => 'tokens.xml');
  ok(!$reader->eof, 'Unzip successful');
};
147
148
Akroneac374d2020-07-07 09:00:44 +0200149done_testing;