Update scripts and sgbr test suite
Change-Id: I3d584fd559d84eca80da6c5fbe257edab79b1017
diff --git a/.gitignore b/.gitignore
index 8ab84ac..832d443 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,11 +4,11 @@
benchmark
docs
todo.org
+tools
fixtures.txt
node_modules
log
blib
-script*
MYMETA*
Makefile
pm_to_blib
diff --git a/Changes b/Changes
index 5f7e5ea..7b581b8 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,6 @@
-0.05 2016-01-28
+0.05 2016-02-04
- Changed KorAP::Document to KorAP::XML::Krill.
+ - Renamed "Schreibgebrauch" to "Sgbr".
- Preparation for GitHub release.
0.04 2016-01-28
diff --git a/MANIFEST b/MANIFEST
index dcba338..b37db9c 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -75,10 +75,10 @@
t/real/bzk_2.t
t/real/goethe.t
t/real/wdd.t
-t/sgbr/sgbr_lemma.t
-t/sgbr/sgbr_meta.t
-t/sgbr/sgbr_pos.t
-t/sgbr/sgbr_token.t
+t/sgbr/lemma.t
+t/sgbr/meta.t
+t/sgbr/pos.t
+t/sgbr/token.t
t/corpus/BZK/header.xml
t/corpus/GOE/header.xml
t/corpus/VDI/header.xml
diff --git a/lib/KorAP/XML/Field/MultiTermToken.pm b/lib/KorAP/XML/Field/MultiTermToken.pm
index cf1e5c3..5019e2f 100644
--- a/lib/KorAP/XML/Field/MultiTermToken.pm
+++ b/lib/KorAP/XML/Field/MultiTermToken.pm
@@ -198,8 +198,8 @@
# Check depth
else {
- my ($a_depth) = ($a->[0] =~ m/^<b>(\d+)/);
- my ($b_depth) = ($b->[0] =~ m/^<b>(\d+)/);
+ my ($a_depth) = ($a->[0] ? $a->[0] =~ m/<b>(\d+)$/ : 0);
+ my ($b_depth) = ($b->[0] ? $b->[0] =~ m/<b>(\d+)$/ : 0);
$a_depth //= 0;
$b_depth //= 0;
diff --git a/lib/KorAP/XML/Index/Sgbr/Lemma.pm b/lib/KorAP/XML/Index/Sgbr/Lemma.pm
index fb2fee4..81726be 100644
--- a/lib/KorAP/XML/Index/Sgbr/Lemma.pm
+++ b/lib/KorAP/XML/Index/Sgbr/Lemma.pm
@@ -1,4 +1,4 @@
-package KorAP::XML::Index::Schreibgebrauch::Lemma;
+package KorAP::XML::Index::Sgbr::Lemma;
use KorAP::XML::Index::Base;
use Mojo::ByteStream 'b';
diff --git a/lib/KorAP/XML/Index/Sgbr/Morpho.pm b/lib/KorAP/XML/Index/Sgbr/Morpho.pm
index eb49f85..fac17a7 100644
--- a/lib/KorAP/XML/Index/Sgbr/Morpho.pm
+++ b/lib/KorAP/XML/Index/Sgbr/Morpho.pm
@@ -1,4 +1,4 @@
-package KorAP::XML::Index::Schreibgebrauch::Morpho;
+package KorAP::XML::Index::Sgbr::Morpho;
use KorAP::XML::Index::Base;
sub parse {
diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index 47f4379..b1c6f20 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm
@@ -382,11 +382,11 @@
return $retval;
}
else {
- $self->log->error('Unable to parse '.$mod);
+ $self->log->warn('Unable to parse '.$mod);
};
}
else {
- $self->log->error('Unable to load '.$mod . '(' . $@ . ')');
+ $self->log->warn('Unable to load '.$mod . '(' . $@ . ')');
};
return;
diff --git a/script/create_example.pl b/script/create_example.pl
deleted file mode 100755
index 289b946..0000000
--- a/script/create_example.pl
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/env perl
-use strict;
-use warnings;
-use FindBin;
-use v5.16;
-
-my $dir = $FindBin::Bin;
-
-foreach my $file (qw/00001
- 00002
- 00003
- 00004
- 00005
- 00006
- 02035-substring
- 02439
- 05663-unbalanced
- 07452-deep/) {
- my $out = $dir . '/../' . $file . '.json';
-
- my $call = 'perl ' . $dir . '/prepare_index.pl -i ' . $dir . '/../examples/WPD/AAA/' . $file . ' -o ' . $out;
- print 'Create ' . $out . "\n";
- system($call);
-
- print 'Create ' . $out . ".gz\n";
- $call .= '.gz -z';
- system($call);
-};
diff --git a/script/prepare_index.pl b/script/korapxml2krill
similarity index 91%
rename from script/prepare_index.pl
rename to script/korapxml2krill
index 486032a..96c3b23 100644
--- a/script/prepare_index.pl
+++ b/script/korapxml2krill
@@ -6,21 +6,26 @@
use Benchmark qw/:hireswallclock/;
use IO::Compress::Gzip qw/$GzipError/;
use Log::Log4perl;
-use KorAP::Document;
-use KorAP::Tokenizer;
-our $VERSION = 0.03;
+use KorAP::XML::Krill;
+use KorAP::XML::Tokenizer;
+
+our $VERSION = 0.04;
# Merges foundry data to create indexer friendly documents
# ndiewald, 2014/10/29
+# 2016/02/04
+# - renamed to korapxml2krill
+# - added Sgbr (Schreibgebrauch) support
+
sub printhelp {
print <<'EOHELP';
Merge foundry data based on a tokenization and create indexer friendly documents.
Call:
-prepare_index.pl -z --input <directory> --output <filename>
+korapxml2krill -z --input <directory> --output <filename>
--input|-i <directory> Directory of the document to index
--output|-o <filename> Document name for output (optional),
@@ -46,7 +51,7 @@
--log|-l The Log4perl log level, defaults to ERROR.
--help|-h Print this document (optional)
-diewald@ids-mannheim.de, 2014/11/05
+diewald@ids-mannheim.de, 2016/02/04
EOHELP
exit(defined $_[0] ? $_[0] : 0);
@@ -107,11 +112,11 @@
$main::LAST_STOP = $new;
};
-# Call perl script/prepare_index.pl WPD/AAA/00001
+# Call perl script/korapxml2krill WPD/AAA/00001
# Create and parse new document
$input =~ s{([^/])$}{$1/};
-my $doc = KorAP::Document->new( path => $input );
+my $doc = KorAP::XML::Krill->new( path => $input );
unless ($doc->parse) {
$log->warn($output . " can't be processed - no document data");
@@ -124,7 +129,7 @@
};
# Get tokenization
-my $tokens = KorAP::Tokenizer->new(
+my $tokens = KorAP::XML::Tokenizer->new(
path => $doc->path,
doc => $doc,
foundry => $token_base_foundry,
@@ -161,7 +166,7 @@
push(@layers, ['Glemm', 'Morpho']);
# Malt
-push(@layers, ['Malt', 'Dependency']);
+# push(@layers, ['Malt', 'Dependency']);
# Mate
push(@layers, ['Mate', 'Morpho']);
@@ -172,14 +177,13 @@
push(@layers, ['OpenNLP', 'Sentences']);
# Schreibgebrauch
-push(@layers, ['Schreibgebrauch', 'Lemma']);
-push(@layers, ['Schreibgebrauch', 'Morpho']);
+push(@layers, ['Sgbr', 'Lemma']);
+push(@layers, ['Sgbr', 'Morpho']);
# TreeTagger
push(@layers, ['TreeTagger', 'Morpho']);
push(@layers, ['TreeTagger', 'Sentences']);
-
# XIP
push(@layers, ['XIP', 'Morpho']);
push(@layers, ['XIP', 'Constituency']);
diff --git a/script/wrap_folders.pl b/script/korapxml2krill_dir
similarity index 89%
rename from script/wrap_folders.pl
rename to script/korapxml2krill_dir
index b3f8aad..7c38efc 100644
--- a/script/wrap_folders.pl
+++ b/script/korapxml2krill_dir
@@ -7,6 +7,13 @@
my $local = $FindBin::Bin;
+# Changes
+# 2013/11/25
+# - Initial release
+#
+# 2016/02/04
+# - Rename to korapxml2krill_dir
+
sub printhelp {
print <<'EOHELP';
@@ -14,7 +21,7 @@
for whole directories.
Call:
-wrap_folders.pl -z --input <directory> --output <directory>
+korapxml2krill_dir -z --input <directory> --output <directory>
--input|-i <directory> Directory of documents to index
--output|-o <directory> Name of output folder
@@ -38,7 +45,7 @@
--log|-l The Log4perl log level, defaults to ERROR.
--help|-h Print this document (optional)
-diewald@ids-mannheim.de, 2013/11/25
+diewald@ids-mannheim.de, 2016/02/04
EOHELP
@@ -70,7 +77,7 @@
$file =~ tr/\//-/;
$file =~ s{^-+}{};
- my $call = 'perl ' . $local . '/prepare_index.pl -i ' . $anno . ' -o ' . $output . '/' . $file . '.json';
+ my $call = 'perl ' . $local . '/korapxml2krill -i ' . $anno . ' -o ' . $output . '/' . $file . '.json';
$call .= '.gz -z' if $gzip;
$call .= ' -m' if $text;
$call .= ' -l ' . $log_level if $log_level;
@@ -78,6 +85,7 @@
$call .= ' -y ' . $pretty if $pretty;
$call .= ' -a ' . $_ foreach @allow;
$call .= ' -s ' . $_ foreach @skip;
+ print "Convert $file\n";
system($call);
};
diff --git a/script/strip_commercial_annotations b/script/strip_commercial_annotations
deleted file mode 100644
index 7039b04..0000000
--- a/script/strip_commercial_annotations
+++ /dev/null
@@ -1,92 +0,0 @@
-#!/usr/bin/env perl
-use Mojo::Base -strict;
-use Mojo::JSON 'j';
-use Mojo::Util qw/slurp spurt/;
-use Pod::Usage;
-
-####
-# Remove xip and cnx foundries from legacy index files
-# This needs the installation of the Mojolicious package:
-# $ cpan install Mojolicious
-####
-
-our @ARGV;
-my $COMM_FOUNDRIES = qr!(?:xip|cnx|connexor)!;
-
-# Get file info from command line
-my $file = $ARGV[0] or die pod2usage(1);
-my $out_file = $ARGV[1] || 'clean_' . $file;
-
-# Load file and jsonify
-my $j = j(slurp $file);
-
-# Read fields
-my ($tokens, $stream);
-
-# Clean tokens
-sub _clean ($) {
- return join ' ', grep { $_ !~ $COMM_FOUNDRIES }
- split / /, $_
-};
-
-# Legacy index file
-if ($tokens = $j->{fields}->[1]) {
-
- # Strip annotation info
- foreach (qw/layerInfo foundries/) {
- $tokens->{$_} = _clean $tokens->{$_} if $tokens->{$_};
- };
-
- # Read data
- $stream = $tokens->{data};
-}
-
-# New index file
-elsif ($tokens = $j->{data}) {
- # Strip annotation info
- foreach (qw/layerInfos foundries/) {
- $tokens->{$_} = _clean $tokens->{$_} if $tokens->{$_};
- };
-
- # Read data
- $stream = $tokens->{stream};
-};
-
-# Clean data from xip and cnx
-my $clean_data = [];
-foreach my $token (@$stream) {
- my $clean_token = [];
- foreach my $term (@$token) {
- if ($term !~ /^(?:(?:<>|<|>|@|-):)?$COMM_FOUNDRIES/o) {
- push @$clean_token, $term;
- };
- };
- push @$clean_data, $clean_token;
-};
-
-# Legacy index file
-if ($tokens->{data}) {
- $tokens->{data} = $clean_data;
-}
-
-# New index file
-elsif ($tokens->{stream}) {
- $tokens->{stream} = $clean_data;
-};
-
-# Write file
-spurt j($j), $out_file;
-
-__END__
-
-=pod
-
-=head1 NAME
-
-strip_commercial_annotations
-
-=head1 SYNOPSIS
-
-perl strip_commercial_annotations my_file.json [my_clean_file.json]
-
-=cut
diff --git a/t/sgbr/lemma.t b/t/sgbr/lemma.t
index 27fe073..3971b88 100644
--- a/t/sgbr/lemma.t
+++ b/t/sgbr/lemma.t
@@ -26,7 +26,7 @@
ok($tokens->parse, 'Parse tokenization based on lemmata');
-ok($tokens->add('Schreibgebrauch', 'Lemma'), 'Add Structure');
+ok($tokens->add('Sgbr', 'Lemma'), 'Add Structure');
my $data = $tokens->to_data->{data};
diff --git a/t/sgbr/pos.t b/t/sgbr/pos.t
index 417af0c..feb357a 100644
--- a/t/sgbr/pos.t
+++ b/t/sgbr/pos.t
@@ -26,7 +26,7 @@
ok($tokens->parse, 'Parse tokenization based on lemmata');
-ok($tokens->add('Schreibgebrauch', 'Morpho'), 'Add Structure');
+ok($tokens->add('Sgbr', 'Morpho'), 'Add Structure');
my $data = $tokens->to_data->{data};
diff --git a/t/transform.t b/t/transform.t
index fd751f7..b2620f3 100644
--- a/t/transform.t
+++ b/t/transform.t
@@ -25,7 +25,7 @@
};
my @layers;
-# push(@layers, ['Base', 'Sentences']);
+push(@layers, ['Base', 'Sentences']);
push(@layers, ['Base', 'Paragraphs']);
# OpenNLP
@@ -35,8 +35,18 @@
# CoreNLP
push(@layers, ['CoreNLP', 'NamedEntities', 'ne_dewac_175m_600']);
push(@layers, ['CoreNLP', 'NamedEntities', 'ne_hgc_175m_600']);
+push(@layers, ['CoreNLP', 'NamedEntities']);
push(@layers, ['CoreNLP', 'Sentences']);
+push(@layers, ['DeReKo', 'Structure']);
+
+push(@layers, ['Glemm', 'Morpho']);
+
+push(@layers, ['Mate', 'Morpho']);
+push(@layers, ['Mate', 'Dependency']);
+
+push(@layers, ['Malt', 'Dependency']);
+
# Connexor
push(@layers, ['Connexor', 'Morpho']);
push(@layers, ['Connexor', 'Syntax']);
@@ -48,7 +58,7 @@
push(@layers, ['TreeTagger', 'Sentences']);
# Mate
-# push(@layers, ['Mate', 'Morpho']);
+push(@layers, ['Mate', 'Morpho']);
push(@layers, ['Mate', 'Dependency']);
# XIP