Merge changes Id3fbb94a,Ib43733cf,I992fe374
* changes:
Zip data.xml before tokens.xml
Do not escape double quotes inside raw_text elements
Add -tk option to use the standard KorAP tokenizer
diff --git a/Makefile.PL b/Makefile.PL
index 918ef02..25eae0a 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -3,6 +3,9 @@
use strict;
use warnings;
use ExtUtils::MakeMaker;
+use File::ShareDir::Install;
+
+install_share dist => 'share';
WriteMakefile(
NAME => 'tei2korapxml',
@@ -21,6 +24,7 @@
'Capture::Tiny' => '0.48'
},
PREREQ_PM => {
+ 'File::Share' => '0.25',
'XML::CompactTree::XS' => '0.03',
'XML::LibXML::Reader' => '2.0201',
'IO::Compress::Zip' => '2.091',
@@ -29,3 +33,6 @@
MIN_PERL_VERSION => '5.016',
EXE_FILES => ['script/tei2korapxml']
);
+
+package MY;
+use File::ShareDir::Install 'postamble';
diff --git a/lib/KorAP/XML/TEI.pm b/lib/KorAP/XML/TEI.pm
index 23b6625..8f1678d 100644
--- a/lib/KorAP/XML/TEI.pm
+++ b/lib/KorAP/XML/TEI.pm
@@ -3,16 +3,21 @@
use warnings;
use Exporter 'import';
-our @EXPORT_OK = qw(remove_xml_comments escape_xml);
+our @EXPORT_OK = qw(remove_xml_comments escape_xml escape_xml_minimal);
# convert '&', '<' and '>' into their corresponding sgml-entities
-my %ent = (
- '"' => '&quot;',
+my %ent_without_quot = (
'&' => '&amp;',
'<' => '&lt;',
'>' => '&gt;'
);
+my %ent = (
+ %ent_without_quot,
+ '"' => '&quot;'
+);
+
+
# remove xml comments
sub remove_xml_comments {
my ($fh, $html) = @_;
@@ -71,4 +76,12 @@
};
+# Escape '&', '<' and '>' only; double quotes are left untouched
+sub escape_xml_minimal {
+ my $data = shift // ''; # an undefined argument is treated as the empty string
+ $data =~ s/([&<>])/$ent_without_quot{$1}/ge; # replacement is looked up in %ent_without_quot
+ return $data;
+};
+
+
1;
diff --git a/lib/KorAP/XML/TEI/Tokenizer/External.pm b/lib/KorAP/XML/TEI/Tokenizer/External.pm
index 10a4a41..9a09ec7 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/External.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/External.pm
@@ -5,6 +5,7 @@
use Log::Any qw($log);
use IO::Select;
use IPC::Open2 qw(open2);
+use Encode qw(encode);
# This tokenizer starts an external process for
# tokenization. It writes the data to tokenize
@@ -22,9 +23,6 @@
sub new {
my ($class, $cmd, $sep) = @_;
- # e.g. 'java -cp '. join(':', '.', glob(dirname(__FILE__) . "/../target/*.jar")).
- # " de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl"
-
unless ($cmd) {
$log->warn('Tokenizer not established');
return;
@@ -54,7 +52,7 @@
my ($self, $txt) = @_;
return unless $self->{pid};
my $out = $self->{chld_in};
- print $out $txt . $self->{sep};
+ print $out encode( "UTF-8", $txt ) . $self->{sep};
return $self;
};
@@ -128,8 +126,9 @@
if (defined $_ && $_ ne '') {
- # This warning is sometimes thrown, though not yet replicated in the test suite.
- # See the discussion in gerrit (3123: Establish tokenizer object for external base tokenization)
+ # This warning is sometimes thrown, though not yet replicated
+ # in the test suite. See the discussion in gerrit (3123:
+ # Establish tokenizer object for external base tokenization)
# for further issues.
$log->warn("Extra output: $_");
}
diff --git a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
new file mode 100644
index 0000000..b0ad51e
--- /dev/null
+++ b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
@@ -0,0 +1,37 @@
+package KorAP::XML::TEI::Tokenizer::KorAP;
+use base 'KorAP::XML::TEI::Tokenizer::External';
+use strict;
+use warnings;
+use File::Share ':all';
+
+use constant {
+ WAIT_SECS => 30
+};
+
+my $java = `sh -c 'command -v java'`;
+chomp $java;
+
+
+if ($java eq '') {
+ warn('No java executable found in PATH. ' . __PACKAGE__ . ' requires a JVM.');
+ return 0;
+};
+
+
+my $tokenizer_jar = dist_file(
+ 'tei2korapxml',
+ 'KorAP-Tokenizer-1.3-SNAPSHOT-6cc760f-standalone.jar'
+);
+
+
+# Construct a new KorAP tokenizer that runs the bundled tokenizer jar via the java executable located at load time
+sub new {
+ my $class = shift;
+ my $self = $class->SUPER::new("$java -jar $tokenizer_jar --no-tokens --positions");
+ $self->{name} = 'korap';
+ $self->{sep} = "\x{04}\n"; # separator the External base class appends after each text it writes to the process
+ return bless $self, $class;
+};
+
+
+1;
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 96cddcf..8066b4c 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -20,7 +20,7 @@
unshift @INC, "$FindBin::Bin/../lib";
};
-use KorAP::XML::TEI qw!remove_xml_comments escape_xml!;
+use KorAP::XML::TEI qw!remove_xml_comments escape_xml escape_xml_minimal!;
use KorAP::XML::TEI::Tokenizer::External;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
@@ -28,6 +28,10 @@
use KorAP::XML::TEI::Zipper;
use KorAP::XML::TEI::Header;
+eval {
+ require KorAP::XML::TEI::Tokenizer::KorAP;
+ 1;
+};
our $VERSION = '0.01';
@@ -39,6 +43,7 @@
"root|r=s" => \(my $_root_dir = '.'), # name of root directory inside zip file
"input|i=s" => \(my $input_fname = ''), # input file (yet only TEI I5 Format accepted)
'tokenizer-call|tc=s' => \(my $tokenizer_call), # Temporary argument for testing purposes
+ 'tokenizer-korap|tk' => \(my $tokenizer_korap), # use KorAP-tokenizer
'use-intern-tokenization|ti' => \(my $tokenizer_intern), # use intern tokenization (default = no)
'log|l=s' => \(my $log_level = 'notice'),
'help|h' => sub {
@@ -76,13 +81,18 @@
#
## extern tokenization
-my $_GEN_TOK_EXT = $tokenizer_call ? 1 : 0;
+my $_GEN_TOK_EXT = $tokenizer_call || $tokenizer_korap ? 1 : 0;
+
# TODO:
# Read tokenizer call from configuration file.
# was 'java -cp '. join(":", ".", glob(&dirname(__FILE__)."/../target/*.jar")). " de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl";
my $ext_tok;
if ($tokenizer_call) {
$ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
+ }
+
+ elsif ($tokenizer_korap) {
+ $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new;
};
my $_tok_file_ext = "tokens.xml";
##
@@ -318,6 +328,20 @@
#
+ # Encode and escape data
+ my $escaped_data = escape_xml_minimal(encode( "UTF-8", $data ));
+ # note: the index still refers to the 'single character'-versions,
+ # which are counted as 1 (search for '&amp;' in data.xml and see
+ # corresponding indices in $_tokens_file)
+
+ if ($_DEBUG) {
+ $log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
+ };
+
+ $zipper->new_stream("$dir/$_data_file")
+ ->print("$data_prfx1$text_id_esc$data_prfx2$escaped_data$data_sfx");
+
+
# ~ tokenization ~
if ( $_GEN_TOK_EXT ){
@@ -346,19 +370,6 @@
$cons_tok->reset;
};
- # Encode and escape data
- $data = escape_xml(encode( "UTF-8", $data ));
- # note: the index still refers to the 'single character'-versions,
- # which are counted as 1 (search for '&amp;' in data.xml and see
- # corresponding indices in $_tokens_file)
-
- if ($_DEBUG) {
- $log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
- };
-
- $zipper->new_stream("$dir/$_data_file")
- ->print("$data_prfx1$text_id_esc$data_prfx2$data$data_sfx");
-
# ~ write structures ~
if (!$structures->empty) {
$structures->to_zip(
@@ -885,6 +896,10 @@
Call an external tokenizer process, that will tokenize
a single line from STDIN and outputs one token per line.
+=item B<--tokenizer-korap|-tk>
+
+Use the standard KorAP/DeReKo tokenizer.
+
=item B<--use-intern-tokenization|-ti>
Tokenize the data using two embedded tokenizers,
diff --git a/share/KorAP-Tokenizer-1.3-SNAPSHOT-6cc760f-standalone.jar b/share/KorAP-Tokenizer-1.3-SNAPSHOT-6cc760f-standalone.jar
new file mode 100644
index 0000000..748b341
--- /dev/null
+++ b/share/KorAP-Tokenizer-1.3-SNAPSHOT-6cc760f-standalone.jar
Binary files differ
diff --git a/t/cmd/tokenizer.pl b/t/cmd/tokenizer.pl
index bf948e8..4e8b2d0 100644
--- a/t/cmd/tokenizer.pl
+++ b/t/cmd/tokenizer.pl
@@ -2,6 +2,7 @@
use strict;
use warnings;
use FindBin;
+use Encode;
BEGIN {
unshift @INC, "$FindBin::Bin/../../lib";
};
@@ -14,7 +15,7 @@
# Read lines from input and return boundaries
while (!eof(STDIN)) {
- my $line = <>;
+ my $line = decode_utf8(<>);
for my $text (split(/\n?\x{04}\n?/, $line)) {
$tok->tokenize($text);
print join(' ', $tok->boundaries), "\n";
diff --git a/t/script.t b/t/script.t
index 4254937..9521d73 100644
--- a/t/script.t
+++ b/t/script.t
@@ -66,7 +66,8 @@
# Uncompress GOE/AGA/00000/data.xml from zip file
$t->unzip_xml('GOE/AGA/00000/data.xml')
->attr_is('raw_text', 'docid', 'GOE_AGA.00000', 'text id')
- ->text_like('raw_text > text', qr!^Campagne in Frankreich 1792.*?uns allein begl.*cke\.$!, 'text content');
+ ->text_like('raw_text > text', qr!^Campagne in Frankreich 1792.*?uns allein begl.*cke\.$!, 'text content')
+ ->text_like('raw_text > text', qr!unter dem Titel "Kriegstheater"!, 'text content');
$t->unzip_xml('GOE/AGA/00000/struct/structure.xml')
->text_is('span[id=s3] *[name=type]', 'Autobiographie', 'text content')
diff --git a/t/tei.t b/t/tei.t
index 94f7577..69b4ee1 100644
--- a/t/tei.t
+++ b/t/tei.t
@@ -9,7 +9,7 @@
use Test::KorAP::XML::TEI qw!korap_tempfile test_tei2korapxml!;
-use_ok('KorAP::XML::TEI', 'remove_xml_comments', 'escape_xml');
+use_ok('KorAP::XML::TEI', 'remove_xml_comments', 'escape_xml', 'escape_xml_minimal');
subtest 'remove_xml_comments' => sub {
my ($fh, $filename) = korap_tempfile('tei');
@@ -87,5 +87,31 @@
);
};
+subtest 'escape_xml_minimal' => sub {
+ is(
+ escape_xml_minimal('"""'),
+ '"""'
+ );
+
+ is(
+ escape_xml_minimal('&&&'),
+ '&amp;&amp;&amp;'
+ );
+
+ is(
+ escape_xml_minimal('<<<'),
+ '&lt;&lt;&lt;'
+ );
+
+ is(
+ escape_xml_minimal('>>>'),
+ '&gt;&gt;&gt;'
+ );
+
+ is(
+ escape_xml_minimal('<tag att1="foo" att2="bar">C&A</tag>'),
+ '&lt;tag att1="foo" att2="bar"&gt;C&amp;A&lt;/tag&gt;'
+ );
+};
done_testing;
diff --git a/t/tokenization-korap.t b/t/tokenization-korap.t
new file mode 100644
index 0000000..809dd45
--- /dev/null
+++ b/t/tokenization-korap.t
@@ -0,0 +1,66 @@
+use strict;
+use warnings;
+use Test::More;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catfile/;
+use Test::XML::Loy;
+
+use FindBin;
+use utf8;
+
+BEGIN {
+ eval {
+ require KorAP::XML::TEI::Tokenizer::KorAP;
+ 1;
+ } or do {
+ plan skip_all => "KorAP::XML::TEI::Tokenizer::KorAP cannot be used";
+ };
+}
+
+require_ok('KorAP::XML::TEI::Tokenizer::KorAP');
+
+my $f = dirname(__FILE__);
+my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
+
+my $ext = KorAP::XML::TEI::Tokenizer::KorAP->new();
+
+$ext->tokenize("Der alte Mann");
+my $str = $ext->to_string('unknown');
+my $t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 3);
+$t->attr_is('layer spanList span:nth-child(2)', 'from', 4);
+$t->attr_is('layer spanList span:nth-child(2)', 'to', 8);
+$t->attr_is('layer spanList span:nth-child(3)', 'from', 9);
+$t->attr_is('layer spanList span:nth-child(3)', 'to', 13);
+$t->element_count_is('layer spanList span', 3);
+
+$ext->tokenize("ging über die Straße");
+$str = $ext->to_string('unknown');
+$t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 4);
+$t->attr_is('layer spanList span:nth-child(2)', 'from', 5);
+$t->attr_is('layer spanList span:nth-child(2)', 'to', 9);
+$t->attr_is('layer spanList span:nth-child(3)', 'from', 10);
+$t->attr_is('layer spanList span:nth-child(3)', 'to', 13);
+$t->attr_is('layer spanList span:nth-child(4)', 'from', 14);
+$t->attr_is('layer spanList span:nth-child(4)', 'to', 20);
+$t->element_count_is('layer spanList span', 4);
+
+$ext->reset;
+$ext->tokenize("Hu aha\x{04}\ndas ist cool");
+$str = $ext->to_string('unknown');
+$t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 2);
+$t->attr_is('layer spanList span:nth-child(2)', 'from', 3);
+$t->attr_is('layer spanList span:nth-child(2)', 'to', 6);
+$t->element_count_is('layer spanList span', 2);
+
+my $string = "Pluto.\" Eris-Entdecker Mike Brown, der im Kurznachrichtendienst Twitter unter \"\@plutokiller";
+$ext->reset;
+$ext->tokenize($string);
+$str = $ext->to_string('unknown');
+$t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(14)', 'from', 80);
+$t->attr_is('layer spanList span:nth-child(14)', 'to', 92);
+$t->element_count_is('layer spanList span', 14);
+done_testing;