Merge "Bugfix: intern tokenization"

commit: 4c6ff5b08659b889aeff9cb7ed77eb968d94cab5 [log] [tgz]
author: Peter Harders <harders@ids-mannheim.de> Mon Jul 20 22:36:29 2020 +0200
committer: Gerrit Code Review <gerrit2@korap.ids-mannheim.de> Mon Jul 20 22:36:29 2020 +0200
tree: 03f0273e22b98b115171f5adaf18088e39e4b2db
parent: 57c884e190823790a0e99c3aa42a6529a48b8349 [diff]
parent: 71f072b3b2874aaba57076c6b34f247d7be5467e [diff]
diff --git a/script/tei2korapxml b/script/tei2korapxml
index dde0146..c94e3cb 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -1073,15 +1073,11 @@
 
         $txt = $e->[1];
 
-        if ( substr( $txt, 0, 1 ) ne ' ' || substr( $txt, 1, 1) ne ' ' ){ # $txt has at least 2 chars, if it's not empty or equal to ' '
+        # TODO: implement outside retr_info() (like $ext_tok) on the whole $data instead of on every text-node (more efficient, and $offset would no longer be needed)
+        $cons_tok->tokenize($txt, $offset);
+        $aggr_tok->tokenize($txt, $offset);
 
-          # TODO: implement outside retr_info() (like $ext_tok) on whole $data, instead on every text-node (more efficient and $offset not needed anymore)
-          $cons_tok->tokenize($txt, $offset);
-          $aggr_tok->tokenize($txt, $offset);
-
-          $offset = $dl;
-
-        }
+        $offset = $dl;
 
 
         #~~~~~

diff --git a/t/data/text_with_blanks.i5.xml b/t/data/text_with_blanks.i5.xml
new file mode 100644
index 0000000..f98e0ee
--- /dev/null
+++ b/t/data/text_with_blanks.i5.xml

@@ -0,0 +1,8 @@
+<idsCorpus>
+  <idsHeader type="text">
+    <textSigle>CORP/DOC.00001</textSigle>
+  </idsHeader>
+  <text>
+   <p>  This is a text, that starts with *2* (not "twenty-two" ;) blanks!</p>
+  </text>
+</idsCorpus>

diff --git a/t/script.t b/t/script.t
index 753326c..85c2cea 100644
--- a/t/script.t
+++ b/t/script.t

@@ -38,8 +38,6 @@
 my ($fh, $outzip) = tempfile("KorAP-XML-TEI_script_XXXXXXXXXX", SUFFIX => ".tmp", TMPDIR => 1, UNLINK => $_UNLINK);
 
 # Generate zip file (unportable!)
-# TODO:
-#   Call with aggressive and conservative tokenizations!
 stderr_like(
   sub { `cat '$file' | perl '$script' > '$outzip'` },
 # approaches for working with $fh (also better use OO interface then)
@@ -191,7 +189,7 @@
   'Processing'
 );
 
-# Uncompress GOE/AGA/00000/base/tokens_conservative.xml from zip file
+# Uncompress GOE/AGA/00000/base/tokens.xml from zip file
 $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens.xml');
 
 # Read GOE/AGA/00000/base/tokens.xml
@@ -213,4 +211,100 @@
 
 $t->element_count_is('spanList span', 227);
 
+
+
+# TODO: call $script with the appropriate parameter for internal tokenization (currently '$_GEN_TOK_INT = 1' is hardcoded)
+
+
+# ~ test conservative tokenization ~
+
+$file = catfile($f, 'data', 'text_with_blanks.i5.xml');
+
+stderr_like(
+  sub { `cat '$file' | perl '$script' > '$outzip'` },
+  qr!tei2korapxml: .*? text_id=CORP_DOC.00001!,
+  'Processing'
+);
+
+ok(-e $outzip, "File $outzip exists");
+
+$zip = IO::Uncompress::Unzip->new($outzip, Name => 'CORP/DOC/00001/base/tokens_conservative.xml');
+
+ok($zip, 'Zip-File is created');
+
+my $cons = '';
+$cons .= $zip->getline while !$zip->eof;
+ok($zip->close, 'Closed');
+
+$t = Test::XML::Loy->new($cons);
+$t->attr_is('spanList span:nth-child(1)', 'to', 6);
+
+$t->attr_is('spanList span#t_1', 'from', 7);
+$t->attr_is('spanList span#t_1', 'to', 9);
+
+$t->attr_is('spanList span#t_3', 'from', 12);
+$t->attr_is('spanList span#t_3', 'to', 16);
+
+$t->attr_is('spanList span#t_9', 'from', 36);
+$t->attr_is('spanList span#t_9', 'to', 37);
+
+$t->attr_is('spanList span#t_13', 'from', 44);
+$t->attr_is('spanList span#t_13', 'to', 45);          # "
+
+$t->attr_is('spanList span#t_14', 'from', 45);        # twenty-two
+$t->attr_is('spanList span#t_14', 'to', 55);
+
+$t->attr_is('spanList span#t_15', 'from', 55);        # "
+$t->attr_is('spanList span#t_15', 'to', 56);
+
+$t->attr_is('spanList span#t_19', 'from', 66);
+$t->attr_is('spanList span#t_19', 'to', 67);
+
+$t->element_count_is('spanList span', 20);
+
+
+# ~ test aggressive tokenization ~
+
+$zip = IO::Uncompress::Unzip->new($outzip, Name => 'CORP/DOC/00001/base/tokens_aggressive.xml');
+
+ok($zip, 'Zip-File is created');
+
+my $aggr = '';
+$aggr .= $zip->getline while !$zip->eof;
+ok($zip->close, 'Closed');
+
+$t = Test::XML::Loy->new($aggr);
+
+$t->attr_is('spanList span:nth-child(1)', 'to', 6);
+
+$t->attr_is('spanList span#t_1', 'from', 7);
+$t->attr_is('spanList span#t_1', 'to', 9);
+
+$t->attr_is('spanList span#t_3', 'from', 12);
+$t->attr_is('spanList span#t_3', 'to', 16);
+
+$t->attr_is('spanList span#t_9', 'from', 36);
+$t->attr_is('spanList span#t_9', 'to', 37);
+
+$t->attr_is('spanList span#t_13', 'from', 44);
+$t->attr_is('spanList span#t_13', 'to', 45);          # "
+
+$t->attr_is('spanList span#t_14', 'from', 45);        # twenty
+$t->attr_is('spanList span#t_14', 'to', 51);
+
+$t->attr_is('spanList span#t_15', 'from', 51);        # -
+$t->attr_is('spanList span#t_15', 'to', 52);
+
+$t->attr_is('spanList span#t_16', 'from', 52);        # two
+$t->attr_is('spanList span#t_16', 'to', 55);
+
+$t->attr_is('spanList span#t_17', 'from', 55);        # "
+$t->attr_is('spanList span#t_17', 'to', 56);
+
+$t->attr_is('spanList span#t_21', 'from', 66);
+$t->attr_is('spanList span#t_21', 'to', 67);
+
+$t->element_count_is('spanList span', 22);
+
+
 done_testing;

diff --git a/t/tokenization.t b/t/tokenization.t
index 7469efd..d063eed 100644
--- a/t/tokenization.t
+++ b/t/tokenization.t

@@ -28,6 +28,9 @@
 $cons->reset->tokenize("Der alte bzw. der grau-melierte Mann");
 is_deeply($cons, [0,3,4,8,9,12,12,13,14,17,18,31,32,36]);
 
+$cons->reset->tokenize("  Der alte bzw. der grau-melierte Mann");
+is_deeply($cons, [2,5,6,10,11,14,14,15,16,19,20,33,34,38]);
+
 $cons->reset->tokenize(". Der");
 is_deeply($cons, [0,1,2,5]);
commit	4c6ff5b08659b889aeff9cb7ed77eb968d94cab5	[log] [tgz]
author	Peter Harders <harders@ids-mannheim.de>	Mon Jul 20 22:36:29 2020 +0200
committer	Gerrit Code Review <gerrit2@korap.ids-mannheim.de>	Mon Jul 20 22:36:29 2020 +0200
tree	03f0273e22b98b115171f5adaf18088e39e4b2db
parent	57c884e190823790a0e99c3aa42a6529a48b8349 [diff]
parent	71f072b3b2874aaba57076c6b34f247d7be5467e [diff]