Added term escaping

commit: 6a2a14b6ba05407874202b9c37099a42b18bbb2a [log] [tgz]
author: Nils Diewald <nils@diewald-online.de> Wed Jun 17 20:34:24 2015 +0000
committer: Nils Diewald <nils@diewald-online.de> Wed Jun 17 20:34:24 2015 +0000
tree: 8453f1aa6a5c09a97598cb68847849e4d253a9ae
parent: 5579179da9281a7d36230b97f569a1970536d06d [diff]
diff --git a/Makefile.PL b/Makefile.PL
index 97fdeac..c683c0b 100644
--- a/Makefile.PL
+++ b/Makefile.PL

@@ -7,10 +7,10 @@
 WriteMakefile(
   NAME         => 'KorAP::Indexer',
   AUTHOR       => 'Nils Diewald',
-  ABSTRACT     => 'Perl Implementation for Generating Multifoundry Lucene Indices',
+  ABSTRACT     => 'Preprocessor for Krill Index preparation',
   VERSION_FROM => 'lib/KorAP/Indexer.pm',
   PREREQ_PM => {
-    'Mojolicious'    => 4.51,
+    'Mojolicious'    => 6.11,
     'Packed::Array'  => 0.01,
     'Log::Log4perl'  => 1.42,
     'JSON::XS'       => 3.01,

diff --git a/lib/KorAP/Document.pm b/lib/KorAP/Document.pm
index 96f5ce5..b9bdfe3 100644
--- a/lib/KorAP/Document.pm
+++ b/lib/KorAP/Document.pm

@@ -502,7 +502,12 @@
       local $SIG{__WARN__} = sub {
 	  $error = 1;
       };
-      $meta = xml2hash($file, text => '#text', attr => '-', array => ['h.title', 'imprint', 'catRef', 'h.author'])->{idsHeader};
+      $meta = xml2hash(
+	$file,
+	text => '#text',
+	attr => '-',
+	array => ['h.title', 'imprint', 'catRef', 'h.author']
+      )->{idsHeader};
   }
   catch  {
       $self->log->warn($unable);

diff --git a/lib/KorAP/Field/MultiTerm.pm b/lib/KorAP/Field/MultiTerm.pm
index 0f93787..4b20027 100644
--- a/lib/KorAP/Field/MultiTerm.pm
+++ b/lib/KorAP/Field/MultiTerm.pm

@@ -45,7 +45,7 @@
   if (defined $_[1]) {
     return $_[0]->[1] = $_[1];
   };
-  $_[0]->[1];
+  $_[0]->[1] // 0;
 };
 
 # 2
@@ -53,7 +53,7 @@
   if (defined $_[1]) {
     return $_[0]->[2] = $_[1];
   };
-  $_[0]->[2];
+  $_[0]->[2] // 0;
 };
 
 # 3
@@ -61,7 +61,7 @@
   if (defined $_[1]) {
     return $_[0]->[3] = $_[1];
   };
-  $_[0]->[3];
+  $_[0]->[3] // 0;
 };
 
 # 4
@@ -69,7 +69,7 @@
   if (defined $_[1]) {
     return $_[0]->[4] = $_[1];
   };
-  $_[0]->[4];
+  $_[0]->[4] // 0;
 };
 
 # 5
@@ -77,7 +77,7 @@
   if (defined $_[1]) {
     return $_[0]->[5] = $_[1];
   };
-  $_[0]->[5];
+  $_[0]->[5] // '';
 };
 
 # 6
@@ -91,7 +91,8 @@
 
 # to string based on array
 sub to_string {
-  my $string = $_[0]->[5];
+  my $string = _escape_term($_[0]->[5]);
+
   if (defined $_[0]->[3]) {
     $string .= '#' .$_[0]->[3] .'-' . $_[0]->[4];
   };
@@ -142,7 +143,11 @@
   return $string;
 };
 
-
+sub _escape_term { # Backslash-escape '#', '$' and '\' in a term string; no prototype, as the sub is called before it is declared
+  my $str = shift // ''; # An undefined term is treated as the empty string instead of triggering a warning
+  $str =~ s/([\#\$\\])/\\$1/g; # '#' introduces offsets in to_string; '$' presumably introduces the payload (see test strings)
+  return $str; # Returns the escaped copy
+};
 
 
 sub to_solr {

diff --git a/lib/KorAP/Index/Connexor/Sentences.pm b/lib/KorAP/Index/Connexor/Sentences.pm
index db95729..2d7d7ca 100644
--- a/lib/KorAP/Index/Connexor/Sentences.pm
+++ b/lib/KorAP/Index/Connexor/Sentences.pm

@@ -16,7 +16,7 @@
 	o_start => $span->o_start,
 	o_end => $span->o_end,
 	p_end => $span->p_end,
-	payload => '<b>2'
+	payload => '<b>0' # Could be 2 as well for t/p/s
       );
       $i++;
     }

diff --git a/lib/KorAP/Index/CoreNLP/Sentences.pm b/lib/KorAP/Index/CoreNLP/Sentences.pm
index cacc2b0..5dad896 100644
--- a/lib/KorAP/Index/CoreNLP/Sentences.pm
+++ b/lib/KorAP/Index/CoreNLP/Sentences.pm

@@ -16,7 +16,7 @@
 	o_start => $span->o_start,
 	o_end => $span->o_end,
 	p_end => $span->p_end,
-	payload => '<b>2'
+	payload => '<b>0' # Could also be 2 for t/p/s
       );
       $i++;
     }

diff --git a/lib/KorAP/Index/Mate/Dependency.pm b/lib/KorAP/Index/Mate/Dependency.pm
index f840d1e..8dc7010 100644
--- a/lib/KorAP/Index/Mate/Dependency.pm
+++ b/lib/KorAP/Index/Mate/Dependency.pm

@@ -6,7 +6,7 @@
   my $self = shift;
 
   # TODO: Create XIP tree here - for indirect dependency
-  # >>:xip_d:SUBJ<i>566<i>789
+  # >>:xip/d:SUBJ<i>566<i>789
 
   $$self->add_tokendata(
     foundry => 'mate',
@@ -53,7 +53,7 @@
 };
 
 sub layer_info {
-    ['mate/d=dep']
+  ['mate/d=rels']
 };
 
 

diff --git a/lib/KorAP/Index/OpenNLP/Sentences.pm b/lib/KorAP/Index/OpenNLP/Sentences.pm
index f4e84e9..1565561 100644
--- a/lib/KorAP/Index/OpenNLP/Sentences.pm
+++ b/lib/KorAP/Index/OpenNLP/Sentences.pm

@@ -16,7 +16,7 @@
 	o_start => $span->o_start,
 	o_end => $span->o_end,
 	p_end => $span->p_end,
-	payload => '<b>2' # t/p/s
+	payload => '<b>0' # t/p/s -> could be 2 as well
       );
       $i++;
     }

diff --git a/lib/KorAP/Index/XIP/Dependency.pm b/lib/KorAP/Index/XIP/Dependency.pm
index d57978b..ce9c40b 100644
--- a/lib/KorAP/Index/XIP/Dependency.pm
+++ b/lib/KorAP/Index/XIP/Dependency.pm

@@ -5,7 +5,6 @@
   my $self = shift;
 
   # Phrase depencies are currently ignored.
-  #
 
   $$self->add_tokendata(
     foundry => 'xip',
@@ -15,6 +14,8 @@
       my ($stream, $token, $tokens) = @_;
       my $mtt = $stream->pos($token->pos);
 
+warn $tokens;
+
       my $content = $token->hash;
 
       my $rel = $content->{rel};
@@ -60,7 +61,7 @@
 };
 
 sub layer_info {
-    ['xip/d=dep']
+  ['xip/d=rels']
 }
 
 

diff --git a/lib/KorAP/Index/XIP/Sentences.pm b/lib/KorAP/Index/XIP/Sentences.pm
index 35ab6f1..9d61825 100644
--- a/lib/KorAP/Index/XIP/Sentences.pm
+++ b/lib/KorAP/Index/XIP/Sentences.pm

@@ -19,7 +19,7 @@
 	o_start => $span->o_start,
 	o_end => $span->o_end,
 	p_end => $span->p_end,
-	payload => '<b>2'
+	payload => '<b>0' # Could be 2 as well for t/p/s
       );
       $i++;
     }

diff --git a/lib/KorAP/Tokenizer.pm b/lib/KorAP/Tokenizer.pm
index 7cecc2b..a75242c 100644
--- a/lib/KorAP/Tokenizer.pm
+++ b/lib/KorAP/Tokenizer.pm

@@ -1,7 +1,6 @@
 package KorAP::Tokenizer;
 use Mojo::Base -base;
 use Mojo::ByteStream 'b';
-use Mojo::Loader;
 use XML::Fast;
 use Try::Tiny;
 use Carp qw/croak/;
@@ -158,7 +157,11 @@
       $old = $to + 1;
 
       # Add position term
-      $mtt->add('_' . $have . '#' . $mtt->o_start . '-' . $mtt->o_end);
+      $mtt->add(
+	term => '_' . $have,
+	o_start => $mtt->o_start,
+	o_end => $mtt->o_end
+      );
 
       $have++;
   };
@@ -343,7 +346,6 @@
 
 sub add {
   my $self = shift;
-  my $loader = Mojo::Loader->new;
   my $foundry = shift;
   my $layer = shift;
 

diff --git a/script/create_example.pl b/script/create_example.pl
index a532bcf..289b946 100755
--- a/script/create_example.pl
+++ b/script/create_example.pl

@@ -16,11 +16,13 @@
 		     02439
 		     05663-unbalanced
 		     07452-deep/) {
-    my $call = 'perl ' . $dir . '/prepare_index.pl -i ' . $dir . '/../examples/WPD/AAA/' . $file . ' -o ' . $dir . '/../' . $file . '.json';
-    print 'Create ' . $file . ".json\n";
-    system($call);
+  my $out = $dir . '/../' . $file . '.json';
 
-    print 'Create ' . $file . ".json.gz\n";
-    $call .= '.gz -z';
-    system($call);
+  my $call = 'perl ' . $dir . '/prepare_index.pl -i ' . $dir . '/../examples/WPD/AAA/' . $file . ' -o ' . $out;
+  print 'Create ' . $out . "\n";
+  system($call);
+
+  print 'Create ' . $out . ".gz\n";
+  $call .= '.gz -z';
+  system($call);
 };

diff --git a/t/artificial.t b/t/artificial.t
index 10d4429..92ebb8a 100644
--- a/t/artificial.t
+++ b/t/artificial.t

@@ -16,7 +16,7 @@
 
 my $path = catdir(dirname(__FILE__), 'artificial');
 ok(my $doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
-is($doc->path, $path . '/', 'Path');
+like($doc->path, qr!$path/$!, 'Path');
 ok($doc->parse, 'Parse document');
 
 sub new_tokenizer {
@@ -105,7 +105,11 @@
 # Add OpenNLP/sentences
 ok($tokens->add('OpenNLP', 'Sentences'), 'Add OpenNLP/Sentences');
 
-is($tokens->stream->pos(0)->to_string, '[(0-3)-:opennlp/sentences$<i>1|-:tokens$<i>18|_0#0-3|i:zum|s:Zum|opennlp/p:APPRART|<>:opennlp/s:s#0-129$<i>17]', 'Correct sentence');
+is($tokens->stream->pos(0)->to_string,
+   '[(0-3)-:opennlp/sentences$<i>1|-:tokens$<i>18|<>:opennlp/s:s#0-129$<i>17<b>0|_0#0-3|i:zum|opennlp/p:APPRART|s:Zum]',
+#   '[(0-3)-:opennlp/sentences$<i>1|-:tokens$<i>18|_0#0-3|i:zum|s:Zum|opennlp/p:APPRART|<>:opennlp/s:s#0-129$<i>17]',
+   'Correct sentence'
+ );
 
 # New instantiation
 ok($tokens = KorAP::Tokenizer->new(
@@ -125,10 +129,10 @@
 ok($tokens->add('Base', 'Paragraphs'), 'Add Base/Paragraphs');
 
 is($tokens->stream->pos(0)->to_string,
-   '[(0-3)-:base/paragraphs$<i>0|-:base/sentences$<i>1|-:tokens$<i>18|_0#0-3|i:zum|s:Zum|<>:base/s:t#0-129$<i>17<b>0|<>:base/s:s#0-129$<i>17<b>0]',
+   '[(0-3)-:base/paragraphs$<i>0|-:base/sentences$<i>1|-:tokens$<i>18|<>:base/s:t#0-129$<i>17<b>0|<>:base/s:s#0-129$<i>17<b>2|_0#0-3|i:zum|s:Zum]',
+#   '[(0-3)-:base/paragraphs$<i>0|-:base/sentences$<i>1|-:tokens$<i>18|_0#0-3|i:zum|s:Zum|<>:base/s:t#0-129$<i>17<b>0|<>:base/s:s#0-129$<i>17<b>0]',
    'Correct base annotation');
 
-
 # New instantiation
 ok($tokens = new_tokenizer->parse, 'Parse');
 
@@ -148,7 +152,8 @@
 ok($tokens->add('CoreNLP', 'Morpho'), 'Add CoreNLP/Morpho');
 
 is($tokens->stream->pos(0)->to_string,
-   '[(0-3)-:tokens$<i>18|_0#0-3|i:zum|s:Zum|corenlp/p:APPRART]',
+   '[(0-3)-:tokens$<i>18|_0#0-3|corenlp/p:APPRART|i:zum|s:Zum]',
+#   '[(0-3)-:tokens$<i>18|_0#0-3|i:zum|s:Zum|corenlp/p:APPRART]',
    'Correct corenlp annotation');
 
 $i = 0;
@@ -158,13 +163,17 @@
        'Annotation (CoreNLP/p) is correct: '. $_);
 };
 
+
+
 # Add CoreNLP/Sentences
 ok($tokens->add('CoreNLP', 'Sentences'), 'Add CoreNLP/Sentences');
 
 is($tokens->stream->pos(0)->to_string,
-   '[(0-3)-:corenlp/sentences$<i>1|-:tokens$<i>18|_0#0-3|i:zum|s:Zum|corenlp/p:APPRART|<>:corenlp/s:s#0-129$<i>17]',
+   '[(0-3)-:corenlp/sentences$<i>1|-:tokens$<i>18|<>:corenlp/s:s#0-129$<i>17<b>0|_0#0-3|corenlp/p:APPRART|i:zum|s:Zum]',
+#   '[(0-3)-:corenlp/sentences$<i>1|-:tokens$<i>18|_0#0-3|i:zum|s:Zum|corenlp/p:APPRART|<>:corenlp/s:s#0-129$<i>17]',
    'Correct corenlp annotation');
 
+
 # New instantiation
 ok($tokens = new_tokenizer->parse, 'New Tokenizer');
 
@@ -172,7 +181,8 @@
 ok($tokens->add('Connexor', 'Sentences'), 'Add Connexor/Sentences');
 
 is($tokens->stream->pos(0)->to_string,
-   '[(0-3)-:cnx/sentences$<i>1|-:tokens$<i>18|_0#0-3|i:zum|s:Zum|<>:cnx/s:s#0-129$<i>17<b>0]',
+   '[(0-3)-:cnx/sentences$<i>1|-:tokens$<i>18|<>:cnx/s:s#0-129$<i>17<b>0|_0#0-3|i:zum|s:Zum]',
+   #   '[(0-3)-:cnx/sentences$<i>1|-:tokens$<i>18|_0#0-3|i:zum|s:Zum|<>:cnx/s:s#0-129$<i>17<b>0]',
    'Correct cnx annotation');
 
 # New instantiation
@@ -213,10 +223,10 @@
 # Add Connexor/Phrase
 ok($tokens->add('Connexor', 'Phrase'), 'Add Connexor/Phrase');
 my $stream = $tokens->stream;
-like($stream->pos(1)->to_string, qr!\|<>:cnx/c:np#4-30\$<i>4<b>0!, 'Annotation (Connexor/c) is correct');
-like($stream->pos(6)->to_string, qr!\|<>:cnx/c:np#40-47\$<i>7<b>0!, 'Annotation (Connexor/c) is correct');
-like($stream->pos(8)->to_string, qr!\|<>:cnx/c:np#52-73\$<i>10<b>0!, 'Annotation (Connexor/c) is correct');
-like($stream->pos(13)->to_string, qr!\|<>:cnx/c:np#89-111\$<i>16<b>0!, 'Annotation (Connexor/c) is correct');
+like($stream->pos(1)->to_string, qr!<>:cnx/c:np#4-30\$<i>4<b>0!, 'Annotation (Connexor/c) is correct');
+like($stream->pos(6)->to_string, qr!<>:cnx/c:np#40-47\$<i>7<b>0!, 'Annotation (Connexor/c) is correct');
+like($stream->pos(8)->to_string, qr!<>:cnx/c:np#52-73\$<i>10<b>0!, 'Annotation (Connexor/c) is correct');
+like($stream->pos(13)->to_string, qr!<>:cnx/c:np#89-111\$<i>16<b>0!, 'Annotation (Connexor/c) is correct');
 
 # New instantiation
 ok($tokens = new_tokenizer->parse, 'New Tokenizer');
@@ -243,7 +253,11 @@
 # Add XIP/Sentences
 ok($tokens->add('XIP', 'Sentences'), 'Add XIP/Sentences');
 
-is($tokens->stream->pos(0)->to_string, '[(0-3)-:tokens$<i>18|_0#0-3|i:zum|s:Zum|-:xip/sentences$<i>1|<>:xip/s:s#0-129$<i>17<b>0]', 'First sentence');
+is($tokens->stream->pos(0)->to_string,
+   '[(0-3)-:tokens$<i>18|-:xip/sentences$<i>1|<>:xip/s:s#0-129$<i>17<b>0|_0#0-3|i:zum|s:Zum]',
+#   '[(0-3)-:tokens$<i>18|_0#0-3|i:zum|s:Zum|-:xip/sentences$<i>1|<>:xip/s:s#0-129$<i>17<b>0]',
+   'First sentence'
+ );
 
 # Add XIP/Morpho
 ok($tokens->add('XIP', 'Morpho'), 'Add XIP/Morpho');
@@ -261,14 +275,14 @@
 };
 
 $i = 0;
-foreach ('zu', 'letzt', 'kulturell', 'Anlass', '=laden:laden', 'die', 'Leitung', 'der', '#schulen:#Heim:schulen#Heim', 'Hofbergli', 'ein', 'bevor', 'der', 'Betrieb', 'Ende', '#schulen:#Jahr:schulen#Jahr') {
+foreach ('zu', 'letzt', 'kulturell', 'Anlass', '=laden:laden', 'die', 'Leitung', 'der', '\#schulen:\#Heim:schulen\#Heim', 'Hofbergli', 'ein', 'bevor', 'der', 'Betrieb', 'Ende', '\#schulen:\#Jahr:schulen\#Jahr') {
   if ($_ eq '!') {
     $i++;
     next;
   };
   foreach my $f (split(':', $_)) {
     like($tokens->stream->pos($i)->to_string,
-	 qr!\|xip/l:$f!,
+	 qr!\|xip\/l:\Q$f\E!,
 	 'Annotation (xip/l) is correct: ' . $f);
   };
   $i++;
@@ -280,6 +294,7 @@
 # Add XIP/Sentences
 ok($tokens->add('XIP', 'Dependency'), 'Add XIP/Dependency');
 
+
 $stream = $tokens->stream;
 like($stream->pos(1)->to_string, qr!\|>:xip/d:NMOD\$<i>3!, 'Dependency fine');
 like($stream->pos(3)->to_string, qr!\|<:xip/d:NMOD\$<i>1!, 'Dependency fine');

diff --git a/t/real_goethe.t b/t/real_goethe.t
index 867542d..878607b 100644
--- a/t/real_goethe.t
+++ b/t/real_goethe.t

@@ -250,7 +250,7 @@
 like($output->{data}->{foundries}, qr!connexor/sentences!, 'Foundries');
 like($output->{data}->{layerInfos}, qr!cnx/s=spans!, 'layerInfos');
 $first_token = join('||', @{$output->{data}->{stream}->[0]});
-like($first_token, qr!<>:cnx/s:s#0-179\$<i>21<b>2!, 'data');
+like($first_token, qr!<>:cnx/s:s#0-179\$<i>21<b>0!, 'data');
 
 $tokens->add('Connexor', 'Morpho');
 $output = decode_json( $tokens->to_json );

diff --git a/t/sort_tokens.t b/t/sort_tokens.t
index d698cac..3ecd5b0 100644
--- a/t/sort_tokens.t
+++ b/t/sort_tokens.t

@@ -9,7 +9,7 @@
 use_ok('KorAP::Field::MultiTermTokenStream');
 
 ok(my $mtt = KorAP::Field::MultiTermToken->new, 'New token');
-ok($mtt->o_start(0), 'Set start character offset');
+ok(defined $mtt->o_start(0), 'Set start character offset');
 ok($mtt->o_end(5), 'Set end character offset');
 ok($mtt->add(term => '@:k=N',
 	     payload =>'<s>9'), 'Add token');
@@ -47,7 +47,7 @@
 is($mtt->to_string,'[(0-5)<>:b=N#0-5$<i>5|<>:f=N#0-5$<i>6<b>5<b>122|<>:e=ADJ#0-5$<i>6<b>6|<>:d=N#0-5$<i>6<b>7|@:i=N$<s>3|@:h=N$<s>5|@:j=N$<s>8|@:k=N$<s>9|a=N$<b>144|c=N$<b>144|g=N$<b>144]', 'Check string');
 
 ok($mtt = KorAP::Field::MultiTermToken->new, 'New token');
-ok($mtt->o_start(0), 'Set start character offset');
+ok(defined $mtt->o_start(0), 'Set start character offset');
 ok($mtt->o_end(5), 'Set end character offset');
 
 # 2-7 to 2-4
commit	6a2a14b6ba05407874202b9c37099a42b18bbb2a	[log] [tgz]
author	Nils Diewald <nils@diewald-online.de>	Wed Jun 17 20:34:24 2015 +0000
committer	Nils Diewald <nils@diewald-online.de>	Wed Jun 17 20:34:24 2015 +0000
tree	8453f1aa6a5c09a97598cb68847849e4d253a9ae
parent	5579179da9281a7d36230b97f569a1970536d06d [diff]