Merge "Bugfix: intern tokenization"
diff --git a/script/tei2korapxml b/script/tei2korapxml
index dde0146..c94e3cb 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -1073,15 +1073,11 @@
$txt = $e->[1];
- if ( substr( $txt, 0, 1 ) ne ' ' || substr( $txt, 1, 1) ne ' ' ){ # $txt has at least 2 chars, if it's not empty or equal to ' '
+ # TODO: implement outside retr_info() (like $ext_tok) on whole $data, instead of on every text-node (more efficient, and $offset would not be needed anymore)
+ $cons_tok->tokenize($txt, $offset);
+ $aggr_tok->tokenize($txt, $offset);
- # TODO: implement outside retr_info() (like $ext_tok) on whole $data, instead on every text-node (more efficient and $offset not needed anymore)
- $cons_tok->tokenize($txt, $offset);
- $aggr_tok->tokenize($txt, $offset);
-
- $offset = $dl;
-
- }
+ $offset = $dl;
#~~~~~
diff --git a/t/data/text_with_blanks.i5.xml b/t/data/text_with_blanks.i5.xml
new file mode 100644
index 0000000..f98e0ee
--- /dev/null
+++ b/t/data/text_with_blanks.i5.xml
@@ -0,0 +1,8 @@
+<idsCorpus>
+ <idsHeader type="text">
+ <textSigle>CORP/DOC.00001</textSigle>
+ </idsHeader>
+ <text>
+ <p> This is a text, that starts with *2* (not "twenty-two" ;) blanks!</p>
+ </text>
+</idsCorpus>
diff --git a/t/script.t b/t/script.t
index 753326c..85c2cea 100644
--- a/t/script.t
+++ b/t/script.t
@@ -38,8 +38,6 @@
my ($fh, $outzip) = tempfile("KorAP-XML-TEI_script_XXXXXXXXXX", SUFFIX => ".tmp", TMPDIR => 1, UNLINK => $_UNLINK);
# Generate zip file (unportable!)
-# TODO:
-# Call with aggressive and conservative tokenizations!
stderr_like(
sub { `cat '$file' | perl '$script' > '$outzip'` },
# approaches for working with $fh (also better use OO interface then)
@@ -191,7 +189,7 @@
'Processing'
);
-# Uncompress GOE/AGA/00000/base/tokens_conservative.xml from zip file
+# Uncompress GOE/AGA/00000/base/tokens.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens.xml');
# Read GOE/AGA/00000/base/tokens.xml
@@ -213,4 +211,100 @@
$t->element_count_is('spanList span', 227);
+
+
+# TODO: call $script with the appropriate parameter for internal tokenization (currently '$_GEN_TOK_INT = 1' is hardcoded)
+
+
+# ~ test conservative tokenization ~
+
+$file = catfile($f, 'data', 'text_with_blanks.i5.xml');
+
+stderr_like(
+ sub { `cat '$file' | perl '$script' > '$outzip'` },
+ qr!tei2korapxml: .*? text_id=CORP_DOC.00001!,
+ 'Processing'
+);
+
+ok(-e $outzip, "File $outzip exists");
+
+$zip = IO::Uncompress::Unzip->new($outzip, Name => 'CORP/DOC/00001/base/tokens_conservative.xml');
+
+ok($zip, 'Zip-File is created');
+
+my $cons = '';
+$cons .= $zip->getline while !$zip->eof;
+ok($zip->close, 'Closed');
+
+$t = Test::XML::Loy->new($cons);
+$t->attr_is('spanList span:nth-child(1)', 'to', 6);
+
+$t->attr_is('spanList span#t_1', 'from', 7);
+$t->attr_is('spanList span#t_1', 'to', 9);
+
+$t->attr_is('spanList span#t_3', 'from', 12);
+$t->attr_is('spanList span#t_3', 'to', 16);
+
+$t->attr_is('spanList span#t_9', 'from', 36);
+$t->attr_is('spanList span#t_9', 'to', 37);
+
+$t->attr_is('spanList span#t_13', 'from', 44);
+$t->attr_is('spanList span#t_13', 'to', 45); # "
+
+$t->attr_is('spanList span#t_14', 'from', 45); # twenty-two
+$t->attr_is('spanList span#t_14', 'to', 55);
+
+$t->attr_is('spanList span#t_15', 'from', 55); # "
+$t->attr_is('spanList span#t_15', 'to', 56);
+
+$t->attr_is('spanList span#t_19', 'from', 66);
+$t->attr_is('spanList span#t_19', 'to', 67);
+
+$t->element_count_is('spanList span', 20);
+
+
+# ~ test aggressive tokenization ~
+
+$zip = IO::Uncompress::Unzip->new($outzip, Name => 'CORP/DOC/00001/base/tokens_aggressive.xml');
+
+ok($zip, 'Zip-File is created');
+
+my $aggr = '';
+$aggr .= $zip->getline while !$zip->eof;
+ok($zip->close, 'Closed');
+
+$t = Test::XML::Loy->new($aggr);
+
+$t->attr_is('spanList span:nth-child(1)', 'to', 6);
+
+$t->attr_is('spanList span#t_1', 'from', 7);
+$t->attr_is('spanList span#t_1', 'to', 9);
+
+$t->attr_is('spanList span#t_3', 'from', 12);
+$t->attr_is('spanList span#t_3', 'to', 16);
+
+$t->attr_is('spanList span#t_9', 'from', 36);
+$t->attr_is('spanList span#t_9', 'to', 37);
+
+$t->attr_is('spanList span#t_13', 'from', 44);
+$t->attr_is('spanList span#t_13', 'to', 45); # "
+
+$t->attr_is('spanList span#t_14', 'from', 45); # twenty
+$t->attr_is('spanList span#t_14', 'to', 51);
+
+$t->attr_is('spanList span#t_15', 'from', 51); # -
+$t->attr_is('spanList span#t_15', 'to', 52);
+
+$t->attr_is('spanList span#t_16', 'from', 52); # two
+$t->attr_is('spanList span#t_16', 'to', 55);
+
+$t->attr_is('spanList span#t_17', 'from', 55); # "
+$t->attr_is('spanList span#t_17', 'to', 56);
+
+$t->attr_is('spanList span#t_21', 'from', 66);
+$t->attr_is('spanList span#t_21', 'to', 67);
+
+$t->element_count_is('spanList span', 22);
+
+
done_testing;
diff --git a/t/tokenization.t b/t/tokenization.t
index 7469efd..d063eed 100644
--- a/t/tokenization.t
+++ b/t/tokenization.t
@@ -28,6 +28,9 @@
$cons->reset->tokenize("Der alte bzw. der grau-melierte Mann");
is_deeply($cons, [0,3,4,8,9,12,12,13,14,17,18,31,32,36]);
+$cons->reset->tokenize(" Der alte bzw. der grau-melierte Mann");
+is_deeply($cons, [2,5,6,10,11,14,14,15,16,19,20,33,34,38]);
+
$cons->reset->tokenize(". Der");
is_deeply($cons, [0,1,2,5]);