Fix dealing with no-span layers|Improve error messages for bughunting
diff --git a/Makefile.PL b/Makefile.PL
index db22f55..804bcaf 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -15,6 +15,7 @@
'Log::Log4perl' => 1.42,
'JSON::XS' => 3.01,
'Set::Scalar' => 1.26,
+ 'IO::Dir::Recursive' => 0,
'Benchmark' => 0,
'Carp' => 0,
'strict' => 0,
diff --git a/lib/KorAP/Document.pm b/lib/KorAP/Document.pm
index 6153463..d7d3abf 100644
--- a/lib/KorAP/Document.pm
+++ b/lib/KorAP/Document.pm
@@ -113,6 +113,10 @@
my $day = $dom->at("pubDate[type=day]");
$day = $day ? $day->text : 0;
+ $year = 0 if $year !~ /^\d+$/;
+ $month = 0 if $month !~ /^\d+$/;
+ $day = 0 if $day !~ /^\d+$/;
+
my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
$date .= length($month) == 1 ? '0' . $month : $month;
$date .= length($day) == 1 ? '0' . $day : $day;
diff --git a/lib/KorAP/Document/Primary.pm b/lib/KorAP/Document/Primary.pm
index be0a234..8566896 100644
--- a/lib/KorAP/Document/Primary.pm
+++ b/lib/KorAP/Document/Primary.pm
@@ -1,7 +1,7 @@
package KorAP::Document::Primary;
use strict;
use warnings;
-use Carp qw/croak/;
+use Carp qw/croak carp/;
use Mojo::ByteStream 'b';
use feature 'state';
use Packed::Array;
@@ -34,7 +34,8 @@
return $substr;
};
# encode 'UTF-8',
- croak 'Unable to find substring';
+ carp 'Unable to find substring';
+ return;
};
@@ -55,7 +56,7 @@
return b($substr)->decode;
};
# encode 'UTF-8',
- croak 'Unable to find substring';
+ carp 'Unable to find substring';
};
diff --git a/lib/KorAP/Index/Base/Paragraphs.pm b/lib/KorAP/Index/Base/Paragraphs.pm
index 338a748..6b45f63 100644
--- a/lib/KorAP/Index/Base/Paragraphs.pm
+++ b/lib/KorAP/Index/Base/Paragraphs.pm
@@ -22,7 +22,7 @@
}
) or return;
- $$self->stream->add_meta('paragraph', '<i>' . $i);
+ $$self->stream->add_meta('paragraphs', '<i>' . $i);
return 1;
};
diff --git a/lib/KorAP/Index/Mate/Morpho.pm b/lib/KorAP/Index/Mate/Morpho.pm
index d6d673e..55ecbbc 100644
--- a/lib/KorAP/Index/Mate/Morpho.pm
+++ b/lib/KorAP/Index/Mate/Morpho.pm
@@ -40,7 +40,7 @@
foreach (split '\|', $found) {
my ($x, $y) = split "=", $_;
# case, tense, number, mood, person, degree, gender
- $mtt->add(term => 'mate/m:' . $x . ':' . $y);
+ $mtt->add(term => 'mate/m:' . $x . ($y ? ':' . $y : ''));
};
};
};
diff --git a/lib/KorAP/Index/OpenNLP/Sentences.pm b/lib/KorAP/Index/OpenNLP/Sentences.pm
index 0b774f7..3e60d9d 100644
--- a/lib/KorAP/Index/OpenNLP/Sentences.pm
+++ b/lib/KorAP/Index/OpenNLP/Sentences.pm
@@ -21,7 +21,7 @@
}
) or return;
- $$self->stream->add_meta('sentence', '<i>' . $i);
+ $$self->stream->add_meta('sentences', '<i>' . $i);
return 1;
};
diff --git a/lib/KorAP/Tokenizer.pm b/lib/KorAP/Tokenizer.pm
index 3b29424..b60e724 100644
--- a/lib/KorAP/Tokenizer.pm
+++ b/lib/KorAP/Tokenizer.pm
@@ -24,7 +24,8 @@
# Create new token stream
my $mtts = KorAP::Field::MultiTermTokenStream->new;
- my $file = b($self->path . lc($self->foundry) . '/' . lc($self->layer) . '.xml')->slurp;
+ my $path = $self->path . lc($self->foundry) . '/' . lc($self->layer) . '.xml';
+ my $file = b($path)->slurp;
my $tokens = Mojo::DOM->new($file);
$tokens->xml(1);
@@ -48,6 +49,11 @@
my $to = $span->attr('to');
my $token = $doc->primary->data($from, $to);
+ unless ($token) {
+ $self->log->error("Unable to find substring [$from-$to] in $path");
+ return;
+ };
+
$should++;
# Ignore non-word tokens
@@ -81,7 +87,7 @@
});
# Add token count
- $mtts->add_meta('token', '<i>' . $have);
+ $mtts->add_meta('tokens', '<i>' . $have);
$range->gap($old, $doc->primary->data_length, $have-1) if $doc->primary->data_length >= $old;
diff --git a/lib/KorAP/Tokenizer/Match.pm b/lib/KorAP/Tokenizer/Match.pm
index 2a06aea..bae1edf 100644
--- a/lib/KorAP/Tokenizer/Match.pm
+++ b/lib/KorAP/Tokenizer/Match.pm
@@ -13,14 +13,17 @@
};
sub lookup {
+ return undef unless $_[1] && $_[2];
$_[0]->{$_[1] . ':' . $_[2]} // undef;
};
sub startswith {
+ return undef unless $_[1];
$_[0]->{'[' . $_[1]} // undef;
};
sub endswith {
+ return undef unless $_[1];
$_[0]->{$_[1] . ']'} // undef;
};
diff --git a/lib/KorAP/Tokenizer/Spans.pm b/lib/KorAP/Tokenizer/Spans.pm
index 5e5cec6..1b69624 100644
--- a/lib/KorAP/Tokenizer/Spans.pm
+++ b/lib/KorAP/Tokenizer/Spans.pm
@@ -4,19 +4,47 @@
use Mojo::DOM;
use Mojo::ByteStream 'b';
use XML::Fast;
+use Try::Tiny;
has 'range';
+has 'log' => sub {
+ Log::Log4perl->get_logger(__PACKAGE__)
+};
+
sub parse {
my $self = shift;
- my $file = b($self->path . $self->foundry . '/' . $self->layer . '.xml')->slurp;
+ my $path = $self->path . $self->foundry . '/' . $self->layer . '.xml';
+ my $file = b($path)->slurp;
# my $spans = Mojo::DOM->new($file);
# $spans->xml(1);
# my $spans = XML::LibXML->load_xml(string => $file);
- my $spans = xml2hash($file, text => '#text', attr => '-')->{layer}->{spanList}->{span};
+ my $spans;
+
+ try {
+ local $SIG{__WARN__} = sub {
+ my $msg = shift;
+ $self->log->error('Error in ' . $path . ($msg ? ': ' . $msg : ''));
+ };
+
+ $spans = xml2hash($file, text => '#text', attr => '-')->{layer}->{spanList};
+
+ }
+ catch {
+ $self->log->error('Span error in ' . $path . ($_ ? ': ' . $_ : ''));
+ return [];
+ };
+
+ if (ref $spans && $spans->{span}) {
+ $spans = $spans->{span};
+ }
+ else {
+ return [];
+ };
+
$spans = [$spans] if ref $spans ne 'ARRAY';
my ($should, $have) = (0,0);
diff --git a/lib/KorAP/Tokenizer/Tokens.pm b/lib/KorAP/Tokenizer/Tokens.pm
index ceb2e4c..3e11d86 100644
--- a/lib/KorAP/Tokenizer/Tokens.pm
+++ b/lib/KorAP/Tokenizer/Tokens.pm
@@ -5,17 +5,42 @@
use KorAP::Tokenizer::Token;
use Carp qw/croak carp/;
use XML::Fast;
+use Try::Tiny;
+has 'log' => sub {
+ Log::Log4perl->get_logger(__PACKAGE__)
+};
sub parse {
my $self = shift;
- my $file = b($self->path . $self->foundry . '/' . $self->layer . '.xml')->slurp;
+ my $path = $self->path . $self->foundry . '/' . $self->layer . '.xml';
+ my $file = b($path)->slurp;
# my $spans = Mojo::DOM->new($file);
# $spans->xml(1);
- my $spans = xml2hash($file, text => '#text', attr => '-')->{layer}->{spanList}->{span};
- $spans = [$spans] if ref $spans ne 'ARRAY';
+ my $spans;
+ try {
+ local $SIG{__WARN__} = sub {
+ my $msg = shift;
+ $self->log->error('Error in ' . $path . ($msg ? ': ' . $msg : ''));
+ };
+
+ $spans = xml2hash($file, text => '#text', attr => '-')->{layer}->{spanList};
+ }
+ catch {
+ $self->log->error('Span error in ' . $path . ($_ ? ': ' . $_ : ''));
+ return [];
+ };
+
+ if (ref $spans && $spans->{span}) {
+ $spans = $spans->{span};
+ }
+ else {
+ return [];
+ };
+
+ $spans = [$spans] if ref $spans ne 'ARRAY';
my ($should, $have) = (0,0);
diff --git a/lib/KorAP/Tokenizer/Units.pm b/lib/KorAP/Tokenizer/Units.pm
index 74bafa9..5deead0 100644
--- a/lib/KorAP/Tokenizer/Units.pm
+++ b/lib/KorAP/Tokenizer/Units.pm
@@ -3,7 +3,9 @@
use KorAP::Tokenizer::Token;
use Mojo::Base -base;
-has [qw/path foundry layer match range primary should have/];
+has [qw/path foundry layer match range primary/];
+has 'should' => 0;
+has 'have' => 0;
has 'encoding' => 'utf-8';
sub span {
diff --git a/script/prepare_index.pl b/script/prepare_index.pl
index 21d8dd9..8acdc80 100644
--- a/script/prepare_index.pl
+++ b/script/prepare_index.pl
@@ -20,7 +20,7 @@
Merge foundry data based on a tokenization and create indexer friendly documents.
Call:
-prepare_index.pl -z --input <directory> --outputfile <filename>
+prepare_index.pl -z --input <directory> --output <filename>
--input|-i <directory> Directory of the document to index
--output|-o <filename> Document name for output (optional),
@@ -54,7 +54,7 @@
# Options from the command line
my ($input, $output, $text, $gzip, $log_level, @skip, $token_base, $primary, @allow, $pretty);
GetOptions(
- 'input|x=s' => \$input,
+ 'input|i=s' => \$input,
'output|o=s' => \$output,
'human|m' => \$text,
'token|t=s' => \$token_base,
diff --git a/script/wrap_folders.pl b/script/wrap_folders.pl
new file mode 100644
index 0000000..979e1bd
--- /dev/null
+++ b/script/wrap_folders.pl
@@ -0,0 +1,98 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use FindBin;
+use v5.16;
+use Getopt::Long;
+use Directory::Iterator;
+
+my $local = $FindBin::Bin;
+
+sub printhelp {
+ print <<'EOHELP';
+
+Merge foundry data based on a tokenization and create indexer friendly documents
+for whole directories.
+
+Call:
+wrap_folders.pl -z --input <directory> --output <directory>
+
+--input|-i <directory> Directory of documents to index
+--output|-o <directory> Name of output folder
+--token|-t <foundry>[#<layer>] Define the default tokenization by specifying
+ the name of the foundry and optionally the name
+ of the layer. Defaults to OpenNLP#tokens.
+--skip|-s <foundry>[#<layer>] Skip specific foundries by specifying the name
+ or specific layers by defining the name
+ with a # in front of the foundry,
+ e.g. Mate#Morpho. Alternatively you can skip #ALL.
+ Can be set multiple times.
+--allow|-a <foundry>#<layer> Allow specific foundries and layers by defining them
+ combining the foundry name with a # and the layer name.
+--primary|-p Output primary data or not. Defaults to true.
+ Can be flagged using --no-primary as well.
+--human|-m Represent the data human friendly,
+ while the output defaults to JSON
+--pretty|-y Pretty print json output
+--gzip|-z Compress the output
+ (expects a defined output file)
+--log|-l The Log4perl log level, defaults to ERROR.
+--help|-h Print this document (optional)
+
+diewald@ids-mannheim.de, 2013/11/25
+
+EOHELP
+
+ exit(defined $_[0] ? $_[0] : 0);
+};
+
+my ($input, $output, $text, $gzip, $log_level, @skip, $token_base, $primary, @allow, $pretty);
+GetOptions(
+ 'input|i=s' => \$input,
+ 'output|o=s' => \$output,
+ 'human|m' => \$text,
+ 'token|t=s' => \$token_base,
+ 'gzip|z' => \$gzip,
+ 'skip|s=s' => \@skip,
+ 'log|l=s' => \$log_level,
+ 'allow|a=s' => \@allow,
+ 'primary|p!' => \$primary,
+ 'pretty|y' => \$pretty,
+ 'help|h' => sub { printhelp }
+);
+
+printhelp(1) if !$input || !$output;
+
+
+sub write_file {
+ my $anno = shift;
+ my $file = $anno;
+ $file =~ s/^?\/?$input//;
+ $file =~ tr/\//-/;
+ $file =~ s{^-+}{};
+
+ my $call = 'perl ' . $local . '/prepare_index.pl -i ' . $anno . ' -o ' . $output . '/' . $file . '.json';
+ $call .= '.gz -z' if $gzip;
+ $call .= ' -m' if $text;
+ $call .= ' -l ' . $log_level if $log_level;
+ $call .= ' --no-primary ' if $primary;
+ $call .= ' -y ' . $pretty if $pretty;
+ $call .= ' -a ' . $_ foreach @allow;
+ $call .= ' -s ' . $_ foreach @skip;
+ system($call);
+};
+
+
+my $it = Directory::Iterator->new($input);
+my $dir;
+while (1) {
+
+ if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
+ write_file($dir);
+ $it->prune;
+ };
+ last unless $it->next;
+};
+
+
+__END__