# Tests for the aggressive and conservative tokenizers of KorAP::XML::TEI
# and for writing token annotations into a zip archive.
use strict;
use warnings;
use Test::More;
use File::Basename 'dirname';
use File::Spec::Functions qw/catfile/;
use IO::Uncompress::Unzip;
use utf8;
use open qw(:std :utf8); # assume utf-8 encoding

use FindBin;
BEGIN {
  # Put the lib directory next to this test directory at the front of @INC.
  unshift @INC, "$FindBin::Bin/../lib";
};

use_ok('Test::KorAP::XML::TEI','korap_tempfile');
require_ok('KorAP::XML::TEI::Tokenizer::Aggressive');
require_ok('KorAP::XML::TEI::Tokenizer::Conservative');
require_ok('KorAP::XML::TEI::Zipper');

# Test aggressive
my $aggr = KorAP::XML::TEI::Tokenizer::Aggressive->new;
ok($aggr->empty, 'Empty');
$aggr->tokenize("Der alte Mann");
ok(!$aggr->empty, 'Not empty');

# The tokenizer is compared as a flat list of character offsets; the
# expected values read as (start, end) pairs with an exclusive end.
is_deeply($aggr, [0,3,4,8,9,13], 'Aggressive: simple words');

$aggr->reset;
ok($aggr->empty, 'Empty');

# The expected offsets split "grau-melierte" into "grau", "-" and "melierte"
# and separate the dot from "bzw".
$aggr->tokenize("Der alte bzw. der grau-melierte Mann");
is_deeply(
  $aggr,
  [0,3,4,8,9,12,12,13,14,17,18,22,22,23,23,31,32,36],
  'Aggressive: abbreviation and hyphenated word'
);

# reset and tokenize are chained here, so both must return the tokenizer.
like(
  $aggr->reset->tokenize("Der")->to_string('a'),
  qr!id="t_0"!,
  'Chainable'
);

# Test conservative
my $cons = KorAP::XML::TEI::Tokenizer::Conservative->new;
$cons->tokenize("Der alte Mann");
is_deeply($cons, [0,3,4,8,9,13], 'Conservative: simple words');

# Offsets are counted in characters: "ältere" spans 6 positions.
$cons->reset->tokenize("Der ältere Mann");
is_deeply($cons, [0,3,4,10,11,15], 'Conservative: word with umlaut');

# The expected offsets keep "grau-melierte" as one token (18..31).
$cons->reset->tokenize("Der alte bzw. der grau-melierte Mann");
is_deeply(
  $cons,
  [0,3,4,8,9,12,12,13,14,17,18,31,32,36],
  'Conservative: abbreviation and hyphenated word'
);

# Two leading spaces: every offset is shifted by 2 against the case above.
$cons->reset->tokenize("  Der alte bzw. der grau-melierte Mann");
is_deeply(
  $cons,
  [2,5,6,10,11,14,14,15,16,19,20,33,34,38],
  'Conservative: leading whitespace'
);

$cons->reset->tokenize(". Der");
is_deeply($cons, [0,1,2,5], 'Conservative: leading dot');

# One leading space before the dot.
$cons->reset->tokenize(" . Der");
is_deeply($cons, [1,2,3,6], 'Conservative: one space before dot');

# Three leading spaces before the dot.
$cons->reset->tokenize("   . Der");
is_deeply($cons, [3,4,5,8], 'Conservative: three spaces before dot');

# Each dot of a sequence is expected as its own token.
$cons->reset->tokenize("... Der");
is_deeply($cons, [0,1,1,2,2,3,4,7], 'Conservative: leading dots');

$cons->reset->tokenize(".Der");
is_deeply($cons, [0,1,1,4], 'Conservative: dot attached to word');

$cons->reset->tokenize(".Der.... ");
is_deeply(
  $cons,
  [0,1,1,4,4,5,5,6,6,7,7,8],
  'Conservative: dot before and dots after word'
);

$cons->reset->tokenize("..Der.... ");
is_deeply(
  $cons,
  [0,1,1,2,2,5,5,6,6,7,7,8,8,9],
  'Conservative: two dots before and dots after word'
);

$cons->reset->tokenize(". Der.... ");
is_deeply(
  $cons,
  [0,1,2,5,5,6,6,7,7,8,8,9],
  'Conservative: separated dot before and dots after word'
);

$cons->reset->tokenize(". .Der.... ");
is_deeply(
  $cons,
  [0,1,2,3,3,6,6,7,7,8,8,9,9,10],
  'Conservative: separated and attached dot before word'
);

# Tab and newline separate tokens just like a space.
$cons->reset->tokenize("Der\talte\nMann");
is_deeply($cons, [0,3,4,8,9,13], 'Conservative: tab and newline');

## Test data
my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia.txt');

# Three-argument open keeps the mode apart from the file name. No layer is
# given here, so the default from the "use open" pragma of this file applies.
ok(open(my $fh, '<', $dataf), 'Open file wikipedia.txt');

# Read the whole file in one go.
my $data = do { local $/; <$fh> };

ok(close($fh), 'Close file wikipedia.txt');

# Test::More expects the computed value first and the expectation second.
is(length($data), 134996, 'Length of wikipedia.txt');

$aggr->reset->tokenize($data);
is_deeply(
  [@{$aggr}[0..25]],
  [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,49,49,50,50,57,58,66,67,72,72,73],
  'Aggressive: first offsets of wikipedia.txt'
);
is(scalar(@$aggr), 47112, 'Aggressive: number of offsets of wikipedia.txt');

$cons->reset->tokenize($data);
is_deeply(
  [@{$cons}[0..21]],
  [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,57,58,66,67,72,72,73],
  'Conservative: first offsets of wikipedia.txt'
);
is(scalar(@$cons), 42412, 'Conservative: number of offsets of wikipedia.txt');

## Check tokenization of 'Community-Ämter aufgestiegen':
## from @{cons}[19518] (=66070) to @{cons}[19519] (=66085) => 'Community-Ämter'
## from @{cons}[19520] (=66086) to @{cons}[19521] (=66098) => 'aufgestiegen'
my @vals_exp = (66070, 66085, 66086, 66098);
my @vals_got = @{$cons}[19518 .. 19521];
is_deeply(
  \@vals_got,
  \@vals_exp,
  "Conservative: offsets of 'Community-Ämter aufgestiegen' in wikipedia.txt"
);

# Same phrase in isolation; \xc4 is the code point of the capital A umlaut.
$cons->reset->tokenize("Community-\xc4mter aufgestiegen");
is_deeply($cons, [0,15,16,28], 'Conservative: hyphenated word with umlaut');

$dataf = catfile(dirname(__FILE__), 'data', 'wikipedia_small.txt');
ok(open($fh, '<', $dataf), 'Open file wikipedia_small.txt');
$data = do { local $/; <$fh> };
ok(close($fh), 'Close file wikipedia_small.txt');

$aggr->reset->tokenize($data);
is_deeply(
  [@{$aggr}[0..25]],
  [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,49,49,50,50,57,58,66,67,72,72,73],
  'Aggressive: first offsets of wikipedia_small.txt'
);
is(scalar(@$aggr), 366, 'Aggressive: number of offsets of wikipedia_small.txt');

$cons->reset->tokenize($data);
is_deeply(
  [@{$cons}[0..21]],
  [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,57,58,66,67,72,72,73],
  'Conservative: first offsets of wikipedia_small.txt'
);
is(scalar(@$cons), 302, 'Conservative: number of offsets of wikipedia_small.txt');


subtest 'Test Zipper' => sub {
  # Test Zipper
  # korap_tempfile yields a handle and the name of the file the zipper
  # writes to; the handle itself is not used and is closed right away.
  my ($fh, $outzip) = korap_tempfile('tokenize_zipper');
  # NOTE(review): the meaning of the empty first argument is not visible
  # in this file - confirm against KorAP::XML::TEI::Zipper.
  my $zip = KorAP::XML::TEI::Zipper->new('', $outzip);
  $fh->close;

  # Local tokenizer, shadows the file-level $aggr inside this subtest.
  my $aggr = KorAP::XML::TEI::Tokenizer::Aggressive->new;
  $aggr->tokenize("Der alte Mann");
  ok($aggr->to_zip(
    $zip->new_stream('tokens.xml'),
    'fun'
  ), 'Written successfully');

  $zip->close;

  ok(-e $outzip, 'Zip exists');

  # The constructor returns undef when the archive or the member cannot be
  # opened; check that first so a failure is reported instead of dying.
  my $unzip = IO::Uncompress::Unzip->new($outzip, Name => 'tokens.xml');
  ok($unzip && !$unzip->eof, 'Unzip successful');
};


done_testing;