Bugfix: internal tokenization
retr_info(): $_GEN_TOK_INT: text starting with 2 blanks was ignored (not tokenized)
Change-Id: I06e8a157bc566fca4b44737230f9b3dc236d0a98
diff --git a/script/tei2korapxml b/script/tei2korapxml
index dde0146..c94e3cb 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -1073,15 +1073,11 @@
$txt = $e->[1];
- if ( substr( $txt, 0, 1 ) ne ' ' || substr( $txt, 1, 1) ne ' ' ){ # $txt has at least 2 chars, if it's not empty or equal to ' '
+ # TODO: implement outside retr_info() (like $ext_tok) on whole $data, instead of on every text-node (more efficient and $offset not needed anymore)
+ $cons_tok->tokenize($txt, $offset);
+ $aggr_tok->tokenize($txt, $offset);
- # TODO: implement outside retr_info() (like $ext_tok) on whole $data, instead on every text-node (more efficient and $offset not needed anymore)
- $cons_tok->tokenize($txt, $offset);
- $aggr_tok->tokenize($txt, $offset);
-
- $offset = $dl;
-
- }
+ $offset = $dl;
#~~~~~
diff --git a/t/data/text_with_blanks.i5.xml b/t/data/text_with_blanks.i5.xml
new file mode 100644
index 0000000..f98e0ee
--- /dev/null
+++ b/t/data/text_with_blanks.i5.xml
@@ -0,0 +1,8 @@
+<idsCorpus>
+ <idsHeader type="text">
+ <textSigle>CORP/DOC.00001</textSigle>
+ </idsHeader>
+ <text>
+ <p> This is a text, that starts with *2* (not "twenty-two" ;) blanks!</p>
+ </text>
+</idsCorpus>
diff --git a/t/script.t b/t/script.t
index 5010789..eaf6348 100644
--- a/t/script.t
+++ b/t/script.t
@@ -32,8 +32,6 @@
my $outzip = tmpnam();
# Generate zip file (unportable!)
-# TODO:
-# Call with aggressive and conservative tokenizations!
stderr_like(
sub { `cat '$file' | perl '$script' > '$outzip'` },
qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
@@ -171,7 +169,7 @@
'Processing'
);
-# Uncompress GOE/AGA/00000/base/tokens_conservative.xml from zip file
+# Uncompress GOE/AGA/00000/base/tokens.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens.xml');
# Read GOE/AGA/00000/base/tokens.xml
@@ -193,4 +191,100 @@
$t->element_count_is('spanList span', 227);
+
+
+# TODO: call $script with the appropriate parameter for internal tokenization (currently '$_GEN_TOK_INT = 1' is hardcoded)
+
+
+# ~ test conservative tokenization ~
+
+$file = catfile($f, 'data', 'text_with_blanks.i5.xml');
+
+stderr_like(
+ sub { `cat '$file' | perl '$script' > '$outzip'` },
+ qr!tei2korapxml: .*? text_id=CORP_DOC.00001!,
+ 'Processing'
+);
+
+ok(-e $outzip, "File $outzip exists");
+
+$zip = IO::Uncompress::Unzip->new($outzip, Name => 'CORP/DOC/00001/base/tokens_conservative.xml');
+
+ok($zip, 'Zip-File is created');
+
+my $cons = '';
+$cons .= $zip->getline while !$zip->eof;
+ok($zip->close, 'Closed');
+
+$t = Test::XML::Loy->new($cons);
+$t->attr_is('spanList span:nth-child(1)', 'to', 6);
+
+$t->attr_is('spanList span#t_1', 'from', 7);
+$t->attr_is('spanList span#t_1', 'to', 9);
+
+$t->attr_is('spanList span#t_3', 'from', 12);
+$t->attr_is('spanList span#t_3', 'to', 16);
+
+$t->attr_is('spanList span#t_9', 'from', 36);
+$t->attr_is('spanList span#t_9', 'to', 37);
+
+$t->attr_is('spanList span#t_13', 'from', 44);
+$t->attr_is('spanList span#t_13', 'to', 45); # "
+
+$t->attr_is('spanList span#t_14', 'from', 45); # twenty-two
+$t->attr_is('spanList span#t_14', 'to', 55);
+
+$t->attr_is('spanList span#t_15', 'from', 55); # "
+$t->attr_is('spanList span#t_15', 'to', 56);
+
+$t->attr_is('spanList span#t_19', 'from', 66);
+$t->attr_is('spanList span#t_19', 'to', 67);
+
+$t->element_count_is('spanList span', 20);
+
+
+# ~ test aggressive tokenization ~
+
+$zip = IO::Uncompress::Unzip->new($outzip, Name => 'CORP/DOC/00001/base/tokens_aggressive.xml');
+
+ok($zip, 'Zip-File is created');
+
+my $aggr = '';
+$aggr .= $zip->getline while !$zip->eof;
+ok($zip->close, 'Closed');
+
+$t = Test::XML::Loy->new($aggr);
+
+$t->attr_is('spanList span:nth-child(1)', 'to', 6);
+
+$t->attr_is('spanList span#t_1', 'from', 7);
+$t->attr_is('spanList span#t_1', 'to', 9);
+
+$t->attr_is('spanList span#t_3', 'from', 12);
+$t->attr_is('spanList span#t_3', 'to', 16);
+
+$t->attr_is('spanList span#t_9', 'from', 36);
+$t->attr_is('spanList span#t_9', 'to', 37);
+
+$t->attr_is('spanList span#t_13', 'from', 44);
+$t->attr_is('spanList span#t_13', 'to', 45); # "
+
+$t->attr_is('spanList span#t_14', 'from', 45); # twenty
+$t->attr_is('spanList span#t_14', 'to', 51);
+
+$t->attr_is('spanList span#t_15', 'from', 51); # -
+$t->attr_is('spanList span#t_15', 'to', 52);
+
+$t->attr_is('spanList span#t_16', 'from', 52); # two
+$t->attr_is('spanList span#t_16', 'to', 55);
+
+$t->attr_is('spanList span#t_17', 'from', 55); # "
+$t->attr_is('spanList span#t_17', 'to', 56);
+
+$t->attr_is('spanList span#t_21', 'from', 66);
+$t->attr_is('spanList span#t_21', 'to', 67);
+
+$t->element_count_is('spanList span', 22);
+
+
done_testing;
diff --git a/t/tokenization.t b/t/tokenization.t
index 5332196..dfe05bb 100644
--- a/t/tokenization.t
+++ b/t/tokenization.t
@@ -29,6 +29,9 @@
$cons->reset->tokenize("Der alte bzw. der grau-melierte Mann");
is_deeply($cons, [0,3,4,8,9,12,12,13,14,17,18,31,32,36]);
+$cons->reset->tokenize(" Der alte bzw. der grau-melierte Mann");
+is_deeply($cons, [2,5,6,10,11,14,14,15,16,19,20,33,34,38]);
+
$cons->reset->tokenize(". Der");
is_deeply($cons, [0,1,2,5]);