Allow specifying both tokenizations (external and internal)
Change-Id: I2e3ff83e5122f803c5e4a18a0c1b89b93269d444
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 2b2c6da..9ed2ae3 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -355,7 +355,9 @@
$ext_tok->tokenize($data);
- } elsif ( $_GEN_TOK_INT ){
+ }
+
+ if ( $_GEN_TOK_INT ){
$cons_tok->tokenize($data);
$aggr_tok->tokenize($data);
@@ -390,7 +392,9 @@
$text_id_esc
)
- } elsif ( $_GEN_TOK_INT ){
+ }
+
+ if ( $_GEN_TOK_INT ){
# Output token streams to zip streams
$cons_tok->to_zip(
diff --git a/t/script.t b/t/script.t
index 8d17be6..f7f9468 100644
--- a/t/script.t
+++ b/t/script.t
@@ -7,6 +7,11 @@
use Test::More;
use Test::Output;
use Test::XML::Loy;
+
+use FindBin;
+BEGIN {
+ unshift @INC, "$FindBin::Bin/../lib";
+};
use Test::KorAP::XML::TEI qw!korap_tempfile!;
my $f = dirname(__FILE__);
@@ -129,6 +134,8 @@
$t = Test::XML::Loy->new($struct_xml);
$t->text_is('span[id=s3] *[name=type]', 'Autobiographie', 'text content');
+$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens.xml');
+ok(!$zip, 'External not generated');
# Uncompress GOE/AGA/00000/base/tokens_aggressive.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_aggressive.xml');
@@ -174,19 +181,22 @@
$t->element_count_is('spanList span', 227);
+
# Tokenize with external tokenizer
my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
my ($fh2, $outzip2) = korap_tempfile('script_out2');
stderr_like(
- sub { `cat '$file' | perl '$script' --tc='perl $cmd' > '$outzip2'` },
+ sub { `cat '$file' | perl '$script' -tc='perl $cmd' > '$outzip2'` },
qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
'Processing'
);
# Uncompress GOE/AGA/00000/base/tokens.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip2, Name => 'GOE/AGA/00000/base/tokens.xml');
+ok($zip, 'Found');
+ok(!$zip->eof, 'Readable');
# Read GOE/AGA/00000/base/tokens.xml
$tokens_xml = '';
@@ -208,10 +218,6 @@
$t->element_count_is('spanList span', 227);
-
-# TODO: call $script with approp. parameter for internal tokenization (actual: '$_GEN_TOK_INT = 1' hardcoded)
-
-
my ($fh3, $outzip3) = korap_tempfile('script_out3');
@@ -306,4 +312,32 @@
$t->element_count_is('spanList span', 22);
+subtest 'Check Tokenization Flags' => sub {
+
+ # Get external tokenizer
+ my $f = dirname(__FILE__);
+ my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
+
+ # Load example file
+ my $file = catfile($f, 'data', 'goe_sample.i5.xml');
+
+ my ($fh, $outzip) = korap_tempfile('script_tokflags');
+
+ # Generate zip file (unportable!)
+ stderr_like(
+ sub { `cat '$file' | perl '$script' -ti -tc 'perl $cmd' > '$outzip'` },
+ qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
+ 'Processing'
+ );
+
+ ok(-e $outzip, "File $outzip exists");
+
+ $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_aggressive.xml');
+ ok($zip, 'Aggressive generated');
+ $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_conservative.xml');
+ ok($zip, 'Conservative generated');
+ $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens.xml');
+ ok($zip, 'External generated');
+};
+
done_testing;
diff --git a/t/tei.t b/t/tei.t
index 53f372e..6dca05c 100644
--- a/t/tei.t
+++ b/t/tei.t
@@ -1,13 +1,14 @@
use strict;
use warnings;
use Test::More;
-use Test::KorAP::XML::TEI qw!korap_tempfile!;
use FindBin;
BEGIN {
unshift @INC, "$FindBin::Bin/../lib";
};
+use Test::KorAP::XML::TEI qw!korap_tempfile!;
+
use_ok('KorAP::XML::TEI', 'remove_xml_comments');
my ($fh, $filename) = korap_tempfile('tei');
diff --git a/t/tokenization.t b/t/tokenization.t
index 1d75e5f..b132a63 100644
--- a/t/tokenization.t
+++ b/t/tokenization.t
@@ -3,6 +3,7 @@
use Test::More;
use File::Basename 'dirname';
use File::Spec::Functions qw/catfile/;
+use IO::Uncompress::Unzip;
use open qw(:std :utf8); # assume utf-8 encoding
use FindBin;
@@ -10,8 +11,10 @@
unshift @INC, "$FindBin::Bin/../lib";
};
+use_ok('Test::KorAP::XML::TEI','korap_tempfile');
require_ok('KorAP::XML::TEI::Tokenizer::Aggressive');
require_ok('KorAP::XML::TEI::Tokenizer::Conservative');
+require_ok('KorAP::XML::TEI::Zipper');
# Test aggressive
my $aggr = KorAP::XML::TEI::Tokenizer::Aggressive->new;
@@ -111,4 +114,25 @@
is(302, scalar(@$cons));
+subtest 'Test Zipper' => sub {
+ # Test Zipper
+ my ($fh, $outzip) = korap_tempfile('tokenize_zipper');
+ my $zip = KorAP::XML::TEI::Zipper->new($outzip);
+ $fh->close;
+
+ my $aggr = KorAP::XML::TEI::Tokenizer::Aggressive->new;
+ $aggr->tokenize("Der alte Mann");
+ ok($aggr->to_zip(
+ $zip->new_stream('tokens.xml'),
+ 'fun'
+ ), 'Written successfully');
+
+ $zip->close;
+
+ ok(-e $outzip, 'Zip exists');
+ my $unzip = IO::Uncompress::Unzip->new($outzip, Name => 'tokens.xml');
+ ok(!$unzip->eof, 'Unzip successful');
+};
+
+
done_testing;
diff --git a/t/zipper.t b/t/zipper.t
index 2ee7fab..86aa52a 100644
--- a/t/zipper.t
+++ b/t/zipper.t
@@ -2,7 +2,6 @@
use warnings;
use Test::More;
use File::Spec::Functions qw/catfile/;
-use Test::KorAP::XML::TEI qw!korap_tempfile!;
use IO::Uncompress::Unzip;
use FindBin;
@@ -10,6 +9,8 @@
unshift @INC, "$FindBin::Bin/../lib";
};
+use Test::KorAP::XML::TEI qw!korap_tempfile!;
+
require_ok('KorAP::XML::TEI::Zipper');
my $data;
@@ -36,7 +37,7 @@
is($data, 'hello', 'Data correct');
-# Uncompress GOE/header.xml from zip file
+# Uncompress data/file2.txt from zip file
$unzip = IO::Uncompress::Unzip->new($outzip, Name => 'data/file2.txt');
$data = '';
diff --git a/xt/benchmark.pl b/xt/benchmark.pl
index 3407451..163b85b 100644
--- a/xt/benchmark.pl
+++ b/xt/benchmark.pl
@@ -13,6 +13,7 @@
unshift @INC, "$FindBin::Bin/../lib";
};
+use Test::KorAP::XML::TEI qw!korap_tempfile!;
use KorAP::XML::TEI 'remove_xml_comments';
use KorAP::XML::TEI::Tokenizer::Aggressive;
use KorAP::XML::TEI::Tokenizer::Conservative;
@@ -46,7 +47,7 @@
my $result;
# Data for delHTMLcom-long
-my ($fh, $filename) = tempfile();
+my ($fh, $filename) = korap_tempfile('benchmark');
print $fh <<'HTML';
mehrzeiliger