Added term escaping
diff --git a/Makefile.PL b/Makefile.PL
index 97fdeac..c683c0b 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -7,10 +7,10 @@
WriteMakefile(
NAME => 'KorAP::Indexer',
AUTHOR => 'Nils Diewald',
- ABSTRACT => 'Perl Implementation for Generating Multifoundry Lucene Indices',
+ ABSTRACT => 'Preprocessor for Krill Index preparation',
VERSION_FROM => 'lib/KorAP/Indexer.pm',
PREREQ_PM => {
- 'Mojolicious' => 4.51,
+ 'Mojolicious' => 6.11,
'Packed::Array' => 0.01,
'Log::Log4perl' => 1.42,
'JSON::XS' => 3.01,
diff --git a/lib/KorAP/Document.pm b/lib/KorAP/Document.pm
index 96f5ce5..b9bdfe3 100644
--- a/lib/KorAP/Document.pm
+++ b/lib/KorAP/Document.pm
@@ -502,7 +502,12 @@
local $SIG{__WARN__} = sub {
$error = 1;
};
- $meta = xml2hash($file, text => '#text', attr => '-', array => ['h.title', 'imprint', 'catRef', 'h.author'])->{idsHeader};
+ $meta = xml2hash(
+ $file,
+ text => '#text',
+ attr => '-',
+ array => ['h.title', 'imprint', 'catRef', 'h.author']
+ )->{idsHeader};
}
catch {
$self->log->warn($unable);
diff --git a/lib/KorAP/Field/MultiTerm.pm b/lib/KorAP/Field/MultiTerm.pm
index 0f93787..4b20027 100644
--- a/lib/KorAP/Field/MultiTerm.pm
+++ b/lib/KorAP/Field/MultiTerm.pm
@@ -45,7 +45,7 @@
if (defined $_[1]) {
return $_[0]->[1] = $_[1];
};
- $_[0]->[1];
+ $_[0]->[1] // 0;
};
# 2
@@ -53,7 +53,7 @@
if (defined $_[1]) {
return $_[0]->[2] = $_[1];
};
- $_[0]->[2];
+ $_[0]->[2] // 0;
};
# 3
@@ -61,7 +61,7 @@
if (defined $_[1]) {
return $_[0]->[3] = $_[1];
};
- $_[0]->[3];
+ $_[0]->[3] // 0;
};
# 4
@@ -69,7 +69,7 @@
if (defined $_[1]) {
return $_[0]->[4] = $_[1];
};
- $_[0]->[4];
+ $_[0]->[4] // 0;
};
# 5
@@ -77,7 +77,7 @@
if (defined $_[1]) {
return $_[0]->[5] = $_[1];
};
- $_[0]->[5];
+ $_[0]->[5] // '';
};
# 6
@@ -91,7 +91,8 @@
# to string based on array
sub to_string {
- my $string = $_[0]->[5];
+ my $string = _escape_term($_[0]->[5]);
+
if (defined $_[0]->[3]) {
$string .= '#' .$_[0]->[3] .'-' . $_[0]->[4];
};
@@ -142,7 +143,16 @@
return $string;
};
-
+# Backslash-escape the characters '#', '$' and '\' in a term so they
+# can not be confused with serialization markers ('#' precedes the
+# character offsets appended by to_string; '$' presumably precedes
+# the payload - confirm against the rest of to_string).
+# Returns the escaped copy; an undefined term is returned unchanged.
+sub _escape_term {
+ my $str = shift;
+ return $str unless defined $str;
+ $str =~ s/([\#\$\\])/\\$1/g;
+ return $str;
+};
sub to_solr {
diff --git a/lib/KorAP/Index/Connexor/Sentences.pm b/lib/KorAP/Index/Connexor/Sentences.pm
index db95729..2d7d7ca 100644
--- a/lib/KorAP/Index/Connexor/Sentences.pm
+++ b/lib/KorAP/Index/Connexor/Sentences.pm
@@ -16,7 +16,7 @@
o_start => $span->o_start,
o_end => $span->o_end,
p_end => $span->p_end,
- payload => '<b>2'
+ payload => '<b>0' # Could be 2 as well for t/p/s
);
$i++;
}
diff --git a/lib/KorAP/Index/CoreNLP/Sentences.pm b/lib/KorAP/Index/CoreNLP/Sentences.pm
index cacc2b0..5dad896 100644
--- a/lib/KorAP/Index/CoreNLP/Sentences.pm
+++ b/lib/KorAP/Index/CoreNLP/Sentences.pm
@@ -16,7 +16,7 @@
o_start => $span->o_start,
o_end => $span->o_end,
p_end => $span->p_end,
- payload => '<b>2'
+ payload => '<b>0' # Could also be 2 for t/p/s
);
$i++;
}
diff --git a/lib/KorAP/Index/Mate/Dependency.pm b/lib/KorAP/Index/Mate/Dependency.pm
index f840d1e..8dc7010 100644
--- a/lib/KorAP/Index/Mate/Dependency.pm
+++ b/lib/KorAP/Index/Mate/Dependency.pm
@@ -6,7 +6,7 @@
my $self = shift;
# TODO: Create XIP tree here - for indirect dependency
- # >>:xip_d:SUBJ<i>566<i>789
+ # >>:xip/d:SUBJ<i>566<i>789
$$self->add_tokendata(
foundry => 'mate',
@@ -53,7 +53,7 @@
};
sub layer_info {
- ['mate/d=dep']
+ ['mate/d=rels']
};
diff --git a/lib/KorAP/Index/OpenNLP/Sentences.pm b/lib/KorAP/Index/OpenNLP/Sentences.pm
index f4e84e9..1565561 100644
--- a/lib/KorAP/Index/OpenNLP/Sentences.pm
+++ b/lib/KorAP/Index/OpenNLP/Sentences.pm
@@ -16,7 +16,7 @@
o_start => $span->o_start,
o_end => $span->o_end,
p_end => $span->p_end,
- payload => '<b>2' # t/p/s
+ payload => '<b>0' # t/p/s -> could be 2 as well
);
$i++;
}
diff --git a/lib/KorAP/Index/XIP/Dependency.pm b/lib/KorAP/Index/XIP/Dependency.pm
index d57978b..ce9c40b 100644
--- a/lib/KorAP/Index/XIP/Dependency.pm
+++ b/lib/KorAP/Index/XIP/Dependency.pm
@@ -5,7 +5,6 @@
my $self = shift;
# Phrase depencies are currently ignored.
- #
$$self->add_tokendata(
foundry => 'xip',
@@ -60,7 +59,7 @@
};
sub layer_info {
- ['xip/d=dep']
+ ['xip/d=rels']
}
diff --git a/lib/KorAP/Index/XIP/Sentences.pm b/lib/KorAP/Index/XIP/Sentences.pm
index 35ab6f1..9d61825 100644
--- a/lib/KorAP/Index/XIP/Sentences.pm
+++ b/lib/KorAP/Index/XIP/Sentences.pm
@@ -19,7 +19,7 @@
o_start => $span->o_start,
o_end => $span->o_end,
p_end => $span->p_end,
- payload => '<b>2'
+ payload => '<b>0' # Could be 2 as well for t/p/s
);
$i++;
}
diff --git a/lib/KorAP/Tokenizer.pm b/lib/KorAP/Tokenizer.pm
index 7cecc2b..a75242c 100644
--- a/lib/KorAP/Tokenizer.pm
+++ b/lib/KorAP/Tokenizer.pm
@@ -1,7 +1,6 @@
package KorAP::Tokenizer;
use Mojo::Base -base;
use Mojo::ByteStream 'b';
-use Mojo::Loader;
use XML::Fast;
use Try::Tiny;
use Carp qw/croak/;
@@ -158,7 +157,11 @@
$old = $to + 1;
# Add position term
- $mtt->add('_' . $have . '#' . $mtt->o_start . '-' . $mtt->o_end);
+ $mtt->add(
+ term => '_' . $have,
+ o_start => $mtt->o_start,
+ o_end => $mtt->o_end
+ );
$have++;
};
@@ -343,7 +346,6 @@
sub add {
my $self = shift;
- my $loader = Mojo::Loader->new;
my $foundry = shift;
my $layer = shift;
diff --git a/script/create_example.pl b/script/create_example.pl
index a532bcf..289b946 100755
--- a/script/create_example.pl
+++ b/script/create_example.pl
@@ -16,11 +16,13 @@
02439
05663-unbalanced
07452-deep/) {
- my $call = 'perl ' . $dir . '/prepare_index.pl -i ' . $dir . '/../examples/WPD/AAA/' . $file . ' -o ' . $dir . '/../' . $file . '.json';
- print 'Create ' . $file . ".json\n";
- system($call);
+ my $out = $dir . '/../' . $file . '.json';
- print 'Create ' . $file . ".json.gz\n";
- $call .= '.gz -z';
- system($call);
+ my $call = 'perl ' . $dir . '/prepare_index.pl -i ' . $dir . '/../examples/WPD/AAA/' . $file . ' -o ' . $out;
+ print 'Create ' . $out . "\n";
+ system($call);
+
+ print 'Create ' . $out . ".gz\n";
+ $call .= '.gz -z';
+ system($call);
};
diff --git a/t/artificial.t b/t/artificial.t
index 10d4429..92ebb8a 100644
--- a/t/artificial.t
+++ b/t/artificial.t
@@ -16,7 +16,7 @@
my $path = catdir(dirname(__FILE__), 'artificial');
ok(my $doc = KorAP::Document->new( path => $path . '/' ), 'Load Korap::Document');
-is($doc->path, $path . '/', 'Path');
+like($doc->path, qr!\Q$path\E/$!, 'Path');
ok($doc->parse, 'Parse document');
sub new_tokenizer {
@@ -105,7 +105,11 @@
# Add OpenNLP/sentences
ok($tokens->add('OpenNLP', 'Sentences'), 'Add OpenNLP/Sentences');
-is($tokens->stream->pos(0)->to_string, '[(0-3)-:opennlp/sentences$<i>1|-:tokens$<i>18|_0#0-3|i:zum|s:Zum|opennlp/p:APPRART|<>:opennlp/s:s#0-129$<i>17]', 'Correct sentence');
+is($tokens->stream->pos(0)->to_string,
+ '[(0-3)-:opennlp/sentences$<i>1|-:tokens$<i>18|<>:opennlp/s:s#0-129$<i>17<b>0|_0#0-3|i:zum|opennlp/p:APPRART|s:Zum]',
+# '[(0-3)-:opennlp/sentences$<i>1|-:tokens$<i>18|_0#0-3|i:zum|s:Zum|opennlp/p:APPRART|<>:opennlp/s:s#0-129$<i>17]',
+ 'Correct sentence'
+ );
# New instantiation
ok($tokens = KorAP::Tokenizer->new(
@@ -125,10 +129,10 @@
ok($tokens->add('Base', 'Paragraphs'), 'Add Base/Paragraphs');
is($tokens->stream->pos(0)->to_string,
- '[(0-3)-:base/paragraphs$<i>0|-:base/sentences$<i>1|-:tokens$<i>18|_0#0-3|i:zum|s:Zum|<>:base/s:t#0-129$<i>17<b>0|<>:base/s:s#0-129$<i>17<b>0]',
+ '[(0-3)-:base/paragraphs$<i>0|-:base/sentences$<i>1|-:tokens$<i>18|<>:base/s:t#0-129$<i>17<b>0|<>:base/s:s#0-129$<i>17<b>2|_0#0-3|i:zum|s:Zum]',
+# '[(0-3)-:base/paragraphs$<i>0|-:base/sentences$<i>1|-:tokens$<i>18|_0#0-3|i:zum|s:Zum|<>:base/s:t#0-129$<i>17<b>0|<>:base/s:s#0-129$<i>17<b>0]',
'Correct base annotation');
-
# New instantiation
ok($tokens = new_tokenizer->parse, 'Parse');
@@ -148,7 +152,8 @@
ok($tokens->add('CoreNLP', 'Morpho'), 'Add CoreNLP/Morpho');
is($tokens->stream->pos(0)->to_string,
- '[(0-3)-:tokens$<i>18|_0#0-3|i:zum|s:Zum|corenlp/p:APPRART]',
+ '[(0-3)-:tokens$<i>18|_0#0-3|corenlp/p:APPRART|i:zum|s:Zum]',
+# '[(0-3)-:tokens$<i>18|_0#0-3|i:zum|s:Zum|corenlp/p:APPRART]',
'Correct corenlp annotation');
$i = 0;
@@ -158,13 +163,17 @@
'Annotation (CoreNLP/p) is correct: '. $_);
};
+
+
# Add CoreNLP/Sentences
ok($tokens->add('CoreNLP', 'Sentences'), 'Add CoreNLP/Sentences');
is($tokens->stream->pos(0)->to_string,
- '[(0-3)-:corenlp/sentences$<i>1|-:tokens$<i>18|_0#0-3|i:zum|s:Zum|corenlp/p:APPRART|<>:corenlp/s:s#0-129$<i>17]',
+ '[(0-3)-:corenlp/sentences$<i>1|-:tokens$<i>18|<>:corenlp/s:s#0-129$<i>17<b>0|_0#0-3|corenlp/p:APPRART|i:zum|s:Zum]',
+# '[(0-3)-:corenlp/sentences$<i>1|-:tokens$<i>18|_0#0-3|i:zum|s:Zum|corenlp/p:APPRART|<>:corenlp/s:s#0-129$<i>17]',
'Correct corenlp annotation');
+
# New instantiation
ok($tokens = new_tokenizer->parse, 'New Tokenizer');
@@ -172,7 +181,8 @@
ok($tokens->add('Connexor', 'Sentences'), 'Add Connexor/Sentences');
is($tokens->stream->pos(0)->to_string,
- '[(0-3)-:cnx/sentences$<i>1|-:tokens$<i>18|_0#0-3|i:zum|s:Zum|<>:cnx/s:s#0-129$<i>17<b>0]',
+ '[(0-3)-:cnx/sentences$<i>1|-:tokens$<i>18|<>:cnx/s:s#0-129$<i>17<b>0|_0#0-3|i:zum|s:Zum]',
+ # '[(0-3)-:cnx/sentences$<i>1|-:tokens$<i>18|_0#0-3|i:zum|s:Zum|<>:cnx/s:s#0-129$<i>17<b>0]',
'Correct cnx annotation');
# New instantiation
@@ -213,10 +223,10 @@
# Add Connexor/Phrase
ok($tokens->add('Connexor', 'Phrase'), 'Add Connexor/Phrase');
my $stream = $tokens->stream;
-like($stream->pos(1)->to_string, qr!\|<>:cnx/c:np#4-30\$<i>4<b>0!, 'Annotation (Connexor/c) is correct');
-like($stream->pos(6)->to_string, qr!\|<>:cnx/c:np#40-47\$<i>7<b>0!, 'Annotation (Connexor/c) is correct');
-like($stream->pos(8)->to_string, qr!\|<>:cnx/c:np#52-73\$<i>10<b>0!, 'Annotation (Connexor/c) is correct');
-like($stream->pos(13)->to_string, qr!\|<>:cnx/c:np#89-111\$<i>16<b>0!, 'Annotation (Connexor/c) is correct');
+like($stream->pos(1)->to_string, qr!<>:cnx/c:np#4-30\$<i>4<b>0!, 'Annotation (Connexor/c) is correct');
+like($stream->pos(6)->to_string, qr!<>:cnx/c:np#40-47\$<i>7<b>0!, 'Annotation (Connexor/c) is correct');
+like($stream->pos(8)->to_string, qr!<>:cnx/c:np#52-73\$<i>10<b>0!, 'Annotation (Connexor/c) is correct');
+like($stream->pos(13)->to_string, qr!<>:cnx/c:np#89-111\$<i>16<b>0!, 'Annotation (Connexor/c) is correct');
# New instantiation
ok($tokens = new_tokenizer->parse, 'New Tokenizer');
@@ -243,7 +253,11 @@
# Add XIP/Sentences
ok($tokens->add('XIP', 'Sentences'), 'Add XIP/Sentences');
-is($tokens->stream->pos(0)->to_string, '[(0-3)-:tokens$<i>18|_0#0-3|i:zum|s:Zum|-:xip/sentences$<i>1|<>:xip/s:s#0-129$<i>17<b>0]', 'First sentence');
+is($tokens->stream->pos(0)->to_string,
+ '[(0-3)-:tokens$<i>18|-:xip/sentences$<i>1|<>:xip/s:s#0-129$<i>17<b>0|_0#0-3|i:zum|s:Zum]',
+# '[(0-3)-:tokens$<i>18|_0#0-3|i:zum|s:Zum|-:xip/sentences$<i>1|<>:xip/s:s#0-129$<i>17<b>0]',
+ 'First sentence'
+ );
# Add XIP/Morpho
ok($tokens->add('XIP', 'Morpho'), 'Add XIP/Morpho');
@@ -261,14 +275,14 @@
};
$i = 0;
-foreach ('zu', 'letzt', 'kulturell', 'Anlass', '=laden:laden', 'die', 'Leitung', 'der', '#schulen:#Heim:schulen#Heim', 'Hofbergli', 'ein', 'bevor', 'der', 'Betrieb', 'Ende', '#schulen:#Jahr:schulen#Jahr') {
+foreach ('zu', 'letzt', 'kulturell', 'Anlass', '=laden:laden', 'die', 'Leitung', 'der', '\#schulen:\#Heim:schulen\#Heim', 'Hofbergli', 'ein', 'bevor', 'der', 'Betrieb', 'Ende', '\#schulen:\#Jahr:schulen\#Jahr') {
if ($_ eq '!') {
$i++;
next;
};
foreach my $f (split(':', $_)) {
like($tokens->stream->pos($i)->to_string,
- qr!\|xip/l:$f!,
+ qr!\|xip\/l:\Q$f\E!,
'Annotation (xip/l) is correct: ' . $f);
};
$i++;
@@ -280,6 +294,7 @@
# Add XIP/Sentences
ok($tokens->add('XIP', 'Dependency'), 'Add XIP/Dependency');
+
$stream = $tokens->stream;
like($stream->pos(1)->to_string, qr!\|>:xip/d:NMOD\$<i>3!, 'Dependency fine');
like($stream->pos(3)->to_string, qr!\|<:xip/d:NMOD\$<i>1!, 'Dependency fine');
diff --git a/t/real_goethe.t b/t/real_goethe.t
index 867542d..878607b 100644
--- a/t/real_goethe.t
+++ b/t/real_goethe.t
@@ -250,7 +250,7 @@
like($output->{data}->{foundries}, qr!connexor/sentences!, 'Foundries');
like($output->{data}->{layerInfos}, qr!cnx/s=spans!, 'layerInfos');
$first_token = join('||', @{$output->{data}->{stream}->[0]});
-like($first_token, qr!<>:cnx/s:s#0-179\$<i>21<b>2!, 'data');
+like($first_token, qr!<>:cnx/s:s#0-179\$<i>21<b>0!, 'data');
$tokens->add('Connexor', 'Morpho');
$output = decode_json( $tokens->to_json );
diff --git a/t/sort_tokens.t b/t/sort_tokens.t
index d698cac..3ecd5b0 100644
--- a/t/sort_tokens.t
+++ b/t/sort_tokens.t
@@ -9,7 +9,7 @@
use_ok('KorAP::Field::MultiTermTokenStream');
ok(my $mtt = KorAP::Field::MultiTermToken->new, 'New token');
-ok($mtt->o_start(0), 'Set start character offset');
+ok(defined $mtt->o_start(0), 'Set start character offset');
ok($mtt->o_end(5), 'Set end character offset');
ok($mtt->add(term => '@:k=N',
payload =>'<s>9'), 'Add token');
@@ -47,7 +47,7 @@
is($mtt->to_string,'[(0-5)<>:b=N#0-5$<i>5|<>:f=N#0-5$<i>6<b>5<b>122|<>:e=ADJ#0-5$<i>6<b>6|<>:d=N#0-5$<i>6<b>7|@:i=N$<s>3|@:h=N$<s>5|@:j=N$<s>8|@:k=N$<s>9|a=N$<b>144|c=N$<b>144|g=N$<b>144]', 'Check string');
ok($mtt = KorAP::Field::MultiTermToken->new, 'New token');
-ok($mtt->o_start(0), 'Set start character offset');
+ok(defined $mtt->o_start(0), 'Set start character offset');
ok($mtt->o_end(5), 'Set end character offset');
# 2-7 to 2-4