Merge changes Id3fbb94a,Ib43733cf,I992fe374 * changes: Zip data.xml before tokens.xml Do not escape double quotes inside raw_text elements Add -tk option to use the standard KorAP tokenizer

commit: 41021abd841594bbb18bc8094df6950620daf5df [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Mon Sep 28 10:08:37 2020 +0200
committer: Gerrit Code Review <gerrit2@korap.ids-mannheim.de> Mon Sep 28 10:08:37 2020 +0200
tree: 9b69f7066f2987ab6dca4264d8fb96903cc588dd
parent: 7501ca0ecfa651f0add20fc8e8d959bb35c1fc54 [diff]
parent: 74ed7f349be99f68d36402fa94480c56a447467a [diff]
diff --git a/Makefile.PL b/Makefile.PL
index 918ef02..25eae0a 100644
--- a/Makefile.PL
+++ b/Makefile.PL

@@ -3,6 +3,9 @@
 use strict;
 use warnings;
 use ExtUtils::MakeMaker;
+use File::ShareDir::Install;
+
+install_share dist => 'share';
 
 WriteMakefile(
   NAME         => 'tei2korapxml',
@@ -21,6 +24,7 @@
     'Capture::Tiny' => '0.48'
   },
   PREREQ_PM => {
+    'File::Share' => '0.25',
     'XML::CompactTree::XS'     => '0.03',
     'XML::LibXML::Reader' => '2.0201',
     'IO::Compress::Zip' => '2.091',
@@ -29,3 +33,6 @@
   MIN_PERL_VERSION => '5.016',
   EXE_FILES => ['script/tei2korapxml']
 );
+
+package MY;
+use File::ShareDir::Install 'postamble';

diff --git a/lib/KorAP/XML/TEI.pm b/lib/KorAP/XML/TEI.pm
index 23b6625..8f1678d 100644
--- a/lib/KorAP/XML/TEI.pm
+++ b/lib/KorAP/XML/TEI.pm

@@ -3,16 +3,21 @@
 use warnings;
 
 use Exporter 'import';
-our @EXPORT_OK = qw(remove_xml_comments escape_xml);
+our @EXPORT_OK = qw(remove_xml_comments escape_xml escape_xml_minimal);
 
 # convert '&', '<' and '>' into their corresponding sgml-entities
-my %ent = (
-  '"' => '&quot;',
+my %ent_without_quot = (
   '&' => '&amp;',
   '<' => '&lt;',
   '>' => '&gt;'
 );
 
+my %ent = (
+  %ent_without_quot,
+  '"' => '&quot;'
+);
+
+
 # remove xml comments
 sub remove_xml_comments {
   my ($fh, $html) = @_;
@@ -71,4 +76,12 @@
 };
 
 
+# Escape '&', '<' and '>' only, leaving double quotes untouched
+sub escape_xml_minimal {
+  my $data = shift // '';
+  $data =~ s/([&<>])/$ent_without_quot{$1}/ge;
+  return $data;
+};
+
+
 1;

diff --git a/lib/KorAP/XML/TEI/Tokenizer/External.pm b/lib/KorAP/XML/TEI/Tokenizer/External.pm
index 10a4a41..9a09ec7 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/External.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/External.pm

@@ -5,6 +5,7 @@
 use Log::Any qw($log);
 use IO::Select;
 use IPC::Open2 qw(open2);
+use Encode qw(encode);
 
 # This tokenizer starts an external process for
 # tokenization. It writes the data to tokenize
@@ -22,9 +23,6 @@
 sub new {
   my ($class, $cmd, $sep) = @_;
 
-  # e.g. 'java  -cp '. join(':', '.', glob(dirname(__FILE__) . "/../target/*.jar")).
-  #      " de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl"
-
   unless ($cmd) {
     $log->warn('Tokenizer not established');
     return;
@@ -54,7 +52,7 @@
   my ($self, $txt) = @_;
   return unless $self->{pid};
   my $out = $self->{chld_in};
-  print $out $txt . $self->{sep};
+  print $out encode( "UTF-8", $txt ) . $self->{sep};
   return $self;
 };
 
@@ -128,8 +126,9 @@
 
       if (defined $_ && $_ ne '') {
 
-        # This warning is sometimes thrown, though not yet replicated in the test suite.
-        # See the discussion in gerrit (3123: Establish tokenizer object for external base tokenization)
+        # This warning is sometimes thrown, though not yet replicated
+        # in the test suite. See the discussion in gerrit (3123:
+        # Establish tokenizer object for external base tokenization)
         # for further issues.
         $log->warn("Extra output: $_");
       }

diff --git a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
new file mode 100644
index 0000000..b0ad51e
--- /dev/null
+++ b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm

@@ -0,0 +1,37 @@
+package KorAP::XML::TEI::Tokenizer::KorAP;
+use base 'KorAP::XML::TEI::Tokenizer::External';
+use strict;
+use warnings;
+use File::Share ':all';
+
+use constant {
+  WAIT_SECS => 30
+};
+
+my $java = `sh -c 'command -v java'`;
+chomp $java;
+
+
+if ($java eq '') {
+  warn('No java executable found in PATH. ' . __PACKAGE__ . ' requires a JVM.');
+  return 0;
+};
+
+
+my $tokenizer_jar = dist_file(
+  'tei2korapxml',
+  'KorAP-Tokenizer-1.3-SNAPSHOT-6cc760f-standalone.jar'
+);
+
+
+# Construct a new KorAP Tokenizer
+sub new {
+  my $class = shift;
+  my $self = $class->SUPER::new("$java -jar $tokenizer_jar --no-tokens --positions");
+  $self->{name} = 'korap';
+  $self->{sep} = "\x{04}\n";
+  return bless $self, $class;
+};
+
+
+1;

diff --git a/script/tei2korapxml b/script/tei2korapxml
index 96cddcf..8066b4c 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -20,7 +20,7 @@
   unshift @INC, "$FindBin::Bin/../lib";
 };
 
-use KorAP::XML::TEI qw!remove_xml_comments escape_xml!;
+use KorAP::XML::TEI qw!remove_xml_comments escape_xml escape_xml_minimal!;
 use KorAP::XML::TEI::Tokenizer::External;
 use KorAP::XML::TEI::Tokenizer::Conservative;
 use KorAP::XML::TEI::Tokenizer::Aggressive;
@@ -28,6 +28,10 @@
 use KorAP::XML::TEI::Zipper;
 use KorAP::XML::TEI::Header;
 
+eval {
+  require KorAP::XML::TEI::Tokenizer::KorAP;
+  1;
+};
 
 our $VERSION = '0.01';
 
@@ -39,6 +43,7 @@
   "root|r=s"  => \(my $_root_dir = '.'),  # name of root directory inside zip file
   "input|i=s" => \(my $input_fname = ''), # input file (yet only TEI I5 Format accepted)
   'tokenizer-call|tc=s' => \(my $tokenizer_call), # Temporary argument for testing purposes
+  'tokenizer-korap|tk' => \(my $tokenizer_korap), # use KorAP-tokenizer
   'use-intern-tokenization|ti' => \(my $tokenizer_intern), # use intern tokenization (default = no)
   'log|l=s' => \(my $log_level = 'notice'),
   'help|h'    => sub {
@@ -76,13 +81,18 @@
 #
 
 ## extern tokenization
-my $_GEN_TOK_EXT = $tokenizer_call ? 1 : 0;
+my $_GEN_TOK_EXT = $tokenizer_call || $tokenizer_korap ? 1 : 0;
+
   # TODO:
   #   Read tokenizer call from configuration file.
   #   was 'java  -cp '. join(":", ".", glob(&dirname(__FILE__)."/../target/*.jar")). " de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl";
   my $ext_tok;
   if ($tokenizer_call) {
     $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
+  }
+
+  elsif ($tokenizer_korap) {
+    $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new;
   };
   my $_tok_file_ext  = "tokens.xml";
 ##
@@ -318,6 +328,20 @@
             #
 
 
+            # Encode and escape data
+            my $escaped_data = escape_xml_minimal(encode( "UTF-8", $data ));
+            # note: the index still refers to the 'single character'-versions,
+            # which are counted as 1 (search for '&amp;' in data.xml and see
+            # corresponding indices in $_tokens_file)
+
+            if ($_DEBUG) {
+              $log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
+            };
+
+            $zipper->new_stream("$dir/$_data_file")
+              ->print("$data_prfx1$text_id_esc$data_prfx2$escaped_data$data_sfx");
+
+
             # ~ tokenization ~
 
             if ( $_GEN_TOK_EXT ){
@@ -346,19 +370,6 @@
               $cons_tok->reset;
             };
 
-            # Encode and escape data
-            $data = escape_xml(encode( "UTF-8", $data ));
-            # note: the index still refers to the 'single character'-versions,
-            # which are counted as 1 (search for '&amp;' in data.xml and see
-            # corresponding indices in $_tokens_file)
-
-            if ($_DEBUG) {
-              $log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
-            };
-
-            $zipper->new_stream("$dir/$_data_file")
-              ->print("$data_prfx1$text_id_esc$data_prfx2$data$data_sfx");
-
             # ~ write structures ~
             if (!$structures->empty) {
               $structures->to_zip(
@@ -885,6 +896,10 @@
 Call an external tokenizer process, that will tokenize
 a single line from STDIN and outputs one token per line.
 
+=item B<--tokenizer-korap|-tk>
+
+Use the standard KorAP/DeReKo tokenizer.
+
 =item B<--use-intern-tokenization|-ti>
 
 Tokenize the data using two embedded tokenizers,

diff --git a/share/KorAP-Tokenizer-1.3-SNAPSHOT-6cc760f-standalone.jar b/share/KorAP-Tokenizer-1.3-SNAPSHOT-6cc760f-standalone.jar
new file mode 100644
index 0000000..748b341
--- /dev/null
+++ b/share/KorAP-Tokenizer-1.3-SNAPSHOT-6cc760f-standalone.jar
Binary files differ

diff --git a/t/cmd/tokenizer.pl b/t/cmd/tokenizer.pl
index bf948e8..4e8b2d0 100644
--- a/t/cmd/tokenizer.pl
+++ b/t/cmd/tokenizer.pl

@@ -2,6 +2,7 @@
 use strict;
 use warnings;
 use FindBin;
+use Encode;
 BEGIN {
   unshift @INC, "$FindBin::Bin/../../lib";
 };
@@ -14,7 +15,7 @@
 
 # Read lines from input and return boundaries
 while (!eof(STDIN)) {
-  my $line = <>;
+  my $line = decode_utf8(<>);
   for my $text (split(/\n?\x{04}\n?/, $line)) {
     $tok->tokenize($text);
     print join(' ', $tok->boundaries), "\n";

diff --git a/t/script.t b/t/script.t
index 4254937..9521d73 100644
--- a/t/script.t
+++ b/t/script.t

@@ -66,7 +66,8 @@
 # Uncompress GOE/AGA/00000/data.xml from zip file
   $t->unzip_xml('GOE/AGA/00000/data.xml')
     ->attr_is('raw_text', 'docid', 'GOE_AGA.00000', 'text id')
-    ->text_like('raw_text > text', qr!^Campagne in Frankreich 1792.*?uns allein begl.*cke\.$!, 'text content');
+    ->text_like('raw_text > text', qr!^Campagne in Frankreich 1792.*?uns allein begl.*cke\.$!, 'text content')
+    ->text_like('raw_text > text', qr!unter dem Titel "Kriegstheater"!, 'text content');
 
   $t->unzip_xml('GOE/AGA/00000/struct/structure.xml')
     ->text_is('span[id=s3] *[name=type]', 'Autobiographie', 'text content')

diff --git a/t/tei.t b/t/tei.t
index 94f7577..69b4ee1 100644
--- a/t/tei.t
+++ b/t/tei.t

@@ -9,7 +9,7 @@
 
 use Test::KorAP::XML::TEI qw!korap_tempfile test_tei2korapxml!;
 
-use_ok('KorAP::XML::TEI', 'remove_xml_comments', 'escape_xml');
+use_ok('KorAP::XML::TEI', 'remove_xml_comments', 'escape_xml', 'escape_xml_minimal');
 
 subtest 'remove_xml_comments' => sub {
   my ($fh, $filename) = korap_tempfile('tei');
@@ -87,5 +87,31 @@
   );
 };
 
+subtest 'escape_xml_minimal' => sub {
+  is(
+      escape_xml_minimal('"""'),
+      '"""'
+  );
+
+  is(
+      escape_xml_minimal('&&&'),
+      '&amp;&amp;&amp;'
+  );
+
+  is(
+      escape_xml_minimal('<<<'),
+      '&lt;&lt;&lt;'
+  );
+
+  is(
+      escape_xml_minimal('>>>'),
+      '&gt;&gt;&gt;'
+  );
+
+  is(
+      escape_xml_minimal('<tag att1="foo" att2="bar">C&A</tag>'),
+      '&lt;tag att1="foo" att2="bar"&gt;C&amp;A&lt;/tag&gt;'
+  );
+};
 
 done_testing;

diff --git a/t/tokenization-korap.t b/t/tokenization-korap.t
new file mode 100644
index 0000000..809dd45
--- /dev/null
+++ b/t/tokenization-korap.t

@@ -0,0 +1,66 @@
+use strict;
+use warnings;
+use Test::More;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catfile/;
+use Test::XML::Loy;
+
+use FindBin;
+use utf8;
+
+BEGIN {
+  eval {
+    require KorAP::XML::TEI::Tokenizer::KorAP;
+    1;
+  } or do {
+    plan skip_all => "KorAP::XML::TEI::Tokenizer::KorAP cannot be used";
+  };
+}
+
+require_ok('KorAP::XML::TEI::Tokenizer::KorAP');
+
+my $f = dirname(__FILE__);
+my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
+
+my $ext = KorAP::XML::TEI::Tokenizer::KorAP->new();
+
+$ext->tokenize("Der alte Mann");
+my $str = $ext->to_string('unknown');
+my $t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 3);
+$t->attr_is('layer spanList span:nth-child(2)', 'from', 4);
+$t->attr_is('layer spanList span:nth-child(2)', 'to', 8);
+$t->attr_is('layer spanList span:nth-child(3)', 'from', 9);
+$t->attr_is('layer spanList span:nth-child(3)', 'to', 13);
+$t->element_count_is('layer spanList span', 3);
+
+$ext->tokenize("ging über die Straße");
+$str = $ext->to_string('unknown');
+$t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 4);
+$t->attr_is('layer spanList span:nth-child(2)', 'from', 5);
+$t->attr_is('layer spanList span:nth-child(2)', 'to', 9);
+$t->attr_is('layer spanList span:nth-child(3)', 'from', 10);
+$t->attr_is('layer spanList span:nth-child(3)', 'to', 13);
+$t->attr_is('layer spanList span:nth-child(4)', 'from', 14);
+$t->attr_is('layer spanList span:nth-child(4)', 'to', 20);
+$t->element_count_is('layer spanList span', 4);
+
+$ext->reset;
+$ext->tokenize("Hu aha\x{04}\ndas ist cool");
+$str = $ext->to_string('unknown');
+$t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 2);
+$t->attr_is('layer spanList span:nth-child(2)', 'from', 3);
+$t->attr_is('layer spanList span:nth-child(2)', 'to', 6);
+$t->element_count_is('layer spanList span', 2);
+
+my $string = "Pluto.\"  Eris-Entdecker Mike Brown, der im Kurznachrichtendienst Twitter unter \"\@plutokiller";
+$ext->reset;
+$ext->tokenize($string);
+$str = $ext->to_string('unknown');
+$t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(14)', 'from', 80);
+$t->attr_is('layer spanList span:nth-child(14)', 'to', 92);
+$t->element_count_is('layer spanList span', 14);
+done_testing;
commit	41021abd841594bbb18bc8094df6950620daf5df	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Mon Sep 28 10:08:37 2020 +0200
committer	Gerrit Code Review <gerrit2@korap.ids-mannheim.de>	Mon Sep 28 10:08:37 2020 +0200
tree	9b69f7066f2987ab6dca4264d8fb96903cc588dd
parent	7501ca0ecfa651f0add20fc8e8d959bb35c1fc54 [diff]
parent	74ed7f349be99f68d36402fa94480c56a447467a [diff]