Made the indexer more robust and made it ignore s**t my parser says
diff --git a/lib/KorAP/Document/Primary.pm b/lib/KorAP/Document/Primary.pm
index 8566896..0e9d2f3 100644
--- a/lib/KorAP/Document/Primary.pm
+++ b/lib/KorAP/Document/Primary.pm
@@ -29,12 +29,13 @@
return $self->[0] unless $to;
my $substr = substr($self->[0], $from, $to - $from);
- if ($substr) {
+ if (defined $substr) {
# return b($substr)->encode;
return $substr;
};
+
# encode 'UTF-8',
- carp 'Unable to find substring';
+ # carp 'Unable to find substring';
return;
};
@@ -51,12 +52,14 @@
return b($self->[0])->decode unless $to;
my $substr = substr($self->[0], $from, $to - $from);
- if ($substr) {
+ if (defined $substr) {
# return b($substr)->encode;
return b($substr)->decode;
};
+
# encode 'UTF-8',
- carp 'Unable to find substring';
+ # carp 'Unable to find substring';
+ return;
};
diff --git a/lib/KorAP/Tokenizer.pm b/lib/KorAP/Tokenizer.pm
index b60e724..3b83b5f 100644
--- a/lib/KorAP/Tokenizer.pm
+++ b/lib/KorAP/Tokenizer.pm
@@ -49,7 +49,7 @@
my $to = $span->attr('to');
my $token = $doc->primary->data($from, $to);
- unless ($token) {
+ unless (defined $token) {
$self->log->error("Unable to find substring [$from-$to] in $path");
return;
};
@@ -138,7 +138,7 @@
%param
);
- my $spanarray = $spans->parse;
+ my $spanarray = $spans->parse or return;
if ($spans->should == $spans->have) {
$self->log->trace('With perfect alignment!');
@@ -180,7 +180,7 @@
%param
);
- my $tokenarray = $tokens->parse;
+ my $tokenarray = $tokens->parse or return;
if ($tokens->should == $tokens->have) {
$self->log->trace('With perfect alignment!');
diff --git a/lib/KorAP/Tokenizer/Spans.pm b/lib/KorAP/Tokenizer/Spans.pm
index 1b69624..7574743 100644
--- a/lib/KorAP/Tokenizer/Spans.pm
+++ b/lib/KorAP/Tokenizer/Spans.pm
@@ -22,22 +22,20 @@
# my $spans = XML::LibXML->load_xml(string => $file);
- my $spans;
-
+ my ($spans, $error);
try {
local $SIG{__WARN__} = sub {
- my $msg = shift;
- $self->log->error('Error in ' . $path . ($msg ? ': ' . $msg : ''));
+ $error = 1;
};
-
$spans = xml2hash($file, text => '#text', attr => '-')->{layer}->{spanList};
-
}
catch {
- $self->log->error('Span error in ' . $path . ($_ ? ': ' . $_ : ''));
- return [];
+ $self->log->warn('Span error in ' . $path . ($_ ? ': ' . $_ : ''));
+ $error = 1;
};
+ return if $error;
+
if (ref $spans && $spans->{span}) {
$spans = $spans->{span};
}
diff --git a/lib/KorAP/Tokenizer/Tokens.pm b/lib/KorAP/Tokenizer/Tokens.pm
index 3e11d86..b62d2bb 100644
--- a/lib/KorAP/Tokenizer/Tokens.pm
+++ b/lib/KorAP/Tokenizer/Tokens.pm
@@ -19,20 +19,20 @@
# my $spans = Mojo::DOM->new($file);
# $spans->xml(1);
- my $spans;
+ my ($spans, $error);
try {
local $SIG{__WARN__} = sub {
- my $msg = shift;
- $self->log->error('Error in ' . $path . ($msg ? ': ' . $msg : ''));
+ $error = 1;
};
-
$spans = xml2hash($file, text => '#text', attr => '-')->{layer}->{spanList};
}
catch {
- $self->log->error('Span error in ' . $path . ($_ ? ': ' . $_ : ''));
- return [];
+ $self->log->warn('Span error in ' . $path . ($_ ? ': ' . $_ : ''));
+ $error = 1;
};
+ return if $error;
+
if (ref $spans && $spans->{span}) {
$spans = $spans->{span};
}
diff --git a/script/create_example.pl b/script/create_example.pl
index 275866e..2b69a4f 100755
--- a/script/create_example.pl
+++ b/script/create_example.pl
@@ -6,7 +6,7 @@
my $dir = $FindBin::Bin;
-foreach my $file (qw/00001 00002 00003 00004 00005 00006 02439/) {
+foreach my $file (qw/00001 00002 00003 00004 00005 00006 02035-substring 02439 05663-unbalanced 07452-deep/) {
my $call = 'perl ' . $dir . '/prepare_index.pl -i ' . $dir . '/../examples/WPD/AAA/' . $file . ' -o ' . $dir . '/../' . $file . '.json';
print 'Create ' . $file . ".json\n";
system($call);