blob: 92b7cc3c711357a361499d3001a546f99db68dee [file] [log] [blame]
Akroneac374d2020-07-07 09:00:44 +02001use strict;
2use warnings;
3use Test::More;
4use File::Basename 'dirname';
5use File::Spec::Functions qw/catfile/;
Peter Harders42e18a62020-07-21 02:43:26 +02006use IO::Uncompress::Unzip;
Peter Harders994aff72020-07-25 09:53:35 +02007use open qw(:std :utf8); # assume utf-8 encoding
Akroneac374d2020-07-07 09:00:44 +02008
9use FindBin;
10BEGIN {
11 unshift @INC, "$FindBin::Bin/../lib";
12};
13
# Load the test helper (requesting korap_tempfile) and verify that every
# module exercised below can be required.
use_ok('Test::KorAP::XML::TEI', 'korap_tempfile');
require_ok($_) for (
  'KorAP::XML::TEI::Tokenizer::Aggressive',
  'KorAP::XML::TEI::Tokenizer::Conservative',
  'KorAP::XML::TEI::Zipper',
);
Akroneac374d2020-07-07 09:00:44 +020018
# Test aggressive
# The tokenizer object is compared as a flat list of character offsets,
# one start/end pair per token.
my $aggr = KorAP::XML::TEI::Tokenizer::Aggressive->new;
$aggr->tokenize("Der alte Mann");
is_deeply($aggr, [0,3,4,8,9,13], 'Aggressive: plain words');

# The full stop and the hyphen each get an offset pair of their own.
$aggr->reset->tokenize("Der alte bzw. der grau-melierte Mann");
is_deeply(
  $aggr,
  [0,3,4,8,9,12,12,13,14,17,18,22,22,23,23,31,32,36],
  'Aggressive: abbreviation and hyphenated word'
);

# reset() and tokenize() can be chained straight into to_string().
like(
  $aggr->reset->tokenize("Der")->to_string('a'),
  qr!id="t_0"!,
  'Chainable'
);
32
# Test conservative
# The tokenizer object is compared as a flat list of character offsets,
# one start/end pair per token.
my $cons = KorAP::XML::TEI::Tokenizer::Conservative->new;
$cons->tokenize("Der alte Mann");
is_deeply($cons, [0,3,4,8,9,13], 'Conservative: plain words');

# The hyphenated word stays one token (18-31); the full stop is split off.
$cons->reset->tokenize("Der alte bzw. der grau-melierte Mann");
is_deeply(
  $cons,
  [0,3,4,8,9,12,12,13,14,17,18,31,32,36],
  'Conservative: abbreviation and hyphenated word'
);

# Two leading blanks: the first token is expected to start at offset 2.
$cons->reset->tokenize("  Der alte bzw. der grau-melierte Mann");
is_deeply(
  $cons,
  [2,5,6,10,11,14,14,15,16,19,20,33,34,38],
  'Conservative: two leading blanks'
);

$cons->reset->tokenize(". Der");
is_deeply($cons, [0,1,2,5], 'Conservative: leading full stop');

# One leading blank: the full stop is expected at offset 1.
$cons->reset->tokenize(" . Der");
is_deeply($cons, [1,2,3,6], 'Conservative: one blank before full stop');

# Three leading blanks: the full stop is expected at offset 3.
$cons->reset->tokenize("   . Der");
is_deeply($cons, [3,4,5,8], 'Conservative: three blanks before full stop');

$cons->reset->tokenize("... Der");
is_deeply($cons, [0,1,1,2,2,3,4,7], 'Conservative: leading ellipsis');

$cons->reset->tokenize(".Der");
is_deeply($cons, [0,1,1,4], 'Conservative: full stop attached to word');

$cons->reset->tokenize(".Der.... ");
is_deeply(
  $cons,
  [0,1,1,4,4,5,5,6,6,7,7,8],
  'Conservative: full stops around word'
);

$cons->reset->tokenize("..Der.... ");
is_deeply(
  $cons,
  [0,1,1,2,2,5,5,6,6,7,7,8,8,9],
  'Conservative: two leading full stops'
);

$cons->reset->tokenize(". Der.... ");
is_deeply(
  $cons,
  [0,1,2,5,5,6,6,7,7,8,8,9],
  'Conservative: detached leading full stop, trailing full stops'
);

$cons->reset->tokenize(". .Der.... ");
is_deeply(
  $cons,
  [0,1,2,3,3,6,6,7,7,8,8,9,9,10],
  'Conservative: detached and attached leading full stops'
);

# Tab and newline separate tokens just like a blank does.
$cons->reset->tokenize("Der\talte\nMann");
is_deeply($cons, [0,3,4,8,9,13], 'Conservative: tab and newline separators');
73
## Test data
# $dataf, $data and $fh stay at file scope because the
# wikipedia_small.txt section further down assigns to them again.
my $dataf = catfile(dirname(__FILE__), 'data', 'wikipedia.txt');
my $data = '';

# Three-argument open keeps the mode apart from the file name, so the
# path can never be interpreted as part of the open mode.
my $opened = open(my $fh, '<', $dataf);
ok($opened, 'Open file wikipedia.txt');

# Read the whole file at once. Skipped when the open failed, so the
# failure is reported by the assertion above and $data stays ''.
if ($opened) {
  local $/;
  my $content = <$fh>;
  $data = $content if defined $content;
}

ok(close($fh), 'Close file wikipedia.txt');

# Test::More expects (got, expected, description).
is(length($data), 134996, 'Length of wikipedia.txt');

$aggr->reset->tokenize($data);
is_deeply(
  [@{$aggr}[0..25]],
  [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,49,49,50,50,57,58,66,67,72,72,73],
  'Aggressive: first offsets of wikipedia.txt'
);
is(scalar(@$aggr), 47112, 'Aggressive: number of offsets in wikipedia.txt');

$cons->reset->tokenize($data);
is_deeply(
  [@{$cons}[0..21]],
  [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,57,58,66,67,72,72,73],
  'Conservative: first offsets of wikipedia.txt'
);
is(scalar(@$cons), 42412, 'Conservative: number of offsets in wikipedia.txt');

## check tokenization of 'Community-Ämter aufgestiegen'
## from $cons->[19518] (=66070) to $cons->[19519] (=66085) => 'Community-Ämter'
## from $cons->[19520] (=66086) to $cons->[19521] (=66098) => 'aufgestiegen'
my @vals_exp = (66070, 66085, 66086, 66098);
my @vals_got = map { $cons->[$_] } 19518 .. 19521;
is_deeply(
  \@vals_got,
  \@vals_exp,
  'Conservative: offsets of hyphenated word with umlaut in wikipedia.txt'
);

# Same two words in isolation; \xc4 is the capital A with diaeresis.
$cons->reset->tokenize("Community-\xc4mter aufgestiegen");
is_deeply(
  $cons,
  [0,15,16,28],
  'Conservative: hyphenated word with umlaut'
);
105
# Second data set: reuses the file-scope $dataf, $data and $fh.
$dataf = catfile(dirname(__FILE__), 'data', 'wikipedia_small.txt');
$data = '';

# Three-argument open keeps the mode apart from the file name.
my $opened_small = open($fh, '<', $dataf);
ok($opened_small, 'Open file wikipedia_small.txt');

# Read the whole file at once. Skipped when the open failed, so the
# failure is reported by the assertion above and $data stays ''.
if ($opened_small) {
  local $/;
  my $content = <$fh>;
  $data = $content if defined $content;
}

ok(close($fh), 'Close file wikipedia_small.txt');

# Test::More expects (got, expected, description).
$aggr->reset->tokenize($data);
is_deeply(
  [@{$aggr}[0..25]],
  [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,49,49,50,50,57,58,66,67,72,72,73],
  'Aggressive: first offsets of wikipedia_small.txt'
);
is(scalar(@$aggr), 366, 'Aggressive: number of offsets in wikipedia_small.txt');

$cons->reset->tokenize($data);
is_deeply(
  [@{$cons}[0..21]],
  [1,7,8,12,14,18,19,22,23,27,28,38,39,40,40,57,58,66,67,72,72,73],
  'Conservative: first offsets of wikipedia_small.txt'
);
is(scalar(@$cons), 302, 'Conservative: number of offsets in wikipedia_small.txt');
Peter Harders854a1152020-07-22 22:48:02 +0200121
Akroneac374d2020-07-07 09:00:44 +0200122
subtest 'Test Zipper' => sub {
  # korap_tempfile yields a handle and a path; only the path is used,
  # the handle is closed right after the zipper has been created.
  my ($tmp_fh, $outzip) = korap_tempfile('tokenize_zipper');
  my $zip = KorAP::XML::TEI::Zipper->new($outzip);
  $tmp_fh->close;

  # Write the tokens of a short sentence into a stream of the archive.
  my $tokens = KorAP::XML::TEI::Tokenizer::Aggressive->new;
  $tokens->tokenize("Der alte Mann");
  ok(
    $tokens->to_zip($zip->new_stream('tokens.xml'), 'fun'),
    'Written successfully'
  );

  $zip->close;

  ok(-e $outzip, 'Zip exists');

  # The constructor returns undef when the archive or the member cannot
  # be opened. Guard the method call so that case is reported as a
  # failing test (with the module's error text) instead of dying.
  my $unzip = IO::Uncompress::Unzip->new($outzip, Name => 'tokens.xml');
  ok($unzip && !$unzip->eof, 'Unzip successful')
    or diag("Unzip error: $IO::Uncompress::Unzip::UnzipError");
};


done_testing;