Update scripts and sgbr test suite

Change-Id: I3d584fd559d84eca80da6c5fbe257edab79b1017

commit: 93d620e46dba1a790845408bd92571eb22c854e3 [log] [tgz]
author: Akron <nils@diewald-online.de> Fri Feb 05 19:40:05 2016 +0100
committer: Akron <nils@diewald-online.de> Fri Feb 05 19:40:05 2016 +0100
tree: 2f4846549357ce96ed94e14083ac5248177cda38
parent: 6764d55ed817f9f70ce83be86c56f16525cbb3a1 [diff]
diff --git a/.gitignore b/.gitignore
index 8ab84ac..832d443 100644
--- a/.gitignore
+++ b/.gitignore

@@ -4,11 +4,11 @@
 benchmark
 docs
 todo.org
+tools
 fixtures.txt
 node_modules
 log
 blib
-script*
 MYMETA*
 Makefile
 pm_to_blib

diff --git a/Changes b/Changes
index 5f7e5ea..7b581b8 100644
--- a/Changes
+++ b/Changes

@@ -1,5 +1,6 @@
-0.05 2016-01-28
+0.05 2016-02-04
         - Changed KorAP::Document to KorAP::XML::Krill.
+	- Renamed "Schreibgebrauch" to "Sgbr".
 	- Preparation for GitHub release.
 
 0.04 2016-01-28

diff --git a/MANIFEST b/MANIFEST
index dcba338..b37db9c 100755
--- a/MANIFEST
+++ b/MANIFEST

@@ -75,10 +75,10 @@
 t/real/bzk_2.t
 t/real/goethe.t
 t/real/wdd.t
-t/sgbr/sgbr_lemma.t
-t/sgbr/sgbr_meta.t
-t/sgbr/sgbr_pos.t
-t/sgbr/sgbr_token.t
+t/sgbr/lemma.t
+t/sgbr/meta.t
+t/sgbr/pos.t
+t/sgbr/token.t
 t/corpus/BZK/header.xml
 t/corpus/GOE/header.xml
 t/corpus/VDI/header.xml

diff --git a/lib/KorAP/XML/Field/MultiTermToken.pm b/lib/KorAP/XML/Field/MultiTermToken.pm
index cf1e5c3..5019e2f 100644
--- a/lib/KorAP/XML/Field/MultiTermToken.pm
+++ b/lib/KorAP/XML/Field/MultiTermToken.pm

@@ -198,8 +198,8 @@
 
     # Check depth
     else {
-      my ($a_depth) = ($a->[0] =~ m/^<b>(\d+)/);
-      my ($b_depth) = ($b->[0] =~ m/^<b>(\d+)/);
+      my ($a_depth) = ($a->[0] ? $a->[0] =~ m/<b>(\d+)$/ : 0);
+      my ($b_depth) = ($b->[0] ? $b->[0] =~ m/<b>(\d+)$/ : 0);
 
       $a_depth //= 0;
       $b_depth //= 0;

diff --git a/lib/KorAP/XML/Index/Sgbr/Lemma.pm b/lib/KorAP/XML/Index/Sgbr/Lemma.pm
index fb2fee4..81726be 100644
--- a/lib/KorAP/XML/Index/Sgbr/Lemma.pm
+++ b/lib/KorAP/XML/Index/Sgbr/Lemma.pm

@@ -1,4 +1,4 @@
-package KorAP::XML::Index::Schreibgebrauch::Lemma;
+package KorAP::XML::Index::Sgbr::Lemma;
 use KorAP::XML::Index::Base;
 use Mojo::ByteStream 'b';
 

diff --git a/lib/KorAP/XML/Index/Sgbr/Morpho.pm b/lib/KorAP/XML/Index/Sgbr/Morpho.pm
index eb49f85..fac17a7 100644
--- a/lib/KorAP/XML/Index/Sgbr/Morpho.pm
+++ b/lib/KorAP/XML/Index/Sgbr/Morpho.pm

@@ -1,4 +1,4 @@
-package KorAP::XML::Index::Schreibgebrauch::Morpho;
+package KorAP::XML::Index::Sgbr::Morpho;
 use KorAP::XML::Index::Base;
 
 sub parse {

diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index 47f4379..b1c6f20 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm

@@ -382,11 +382,11 @@
       return $retval;
     }
     else {
-      $self->log->error('Unable to parse '.$mod);
+      $self->log->warn('Unable to parse '.$mod);
     };
   }
   else {
-    $self->log->error('Unable to load '.$mod . '(' . $@ . ')');
+    $self->log->warn('Unable to load '.$mod . '(' . $@ . ')');
   };
 
   return;

diff --git a/script/create_example.pl b/script/create_example.pl
deleted file mode 100755
index 289b946..0000000
--- a/script/create_example.pl
+++ /dev/null

@@ -1,28 +0,0 @@
-#!/usr/bin/env perl
-use strict;
-use warnings;
-use FindBin;
-use v5.16;
-
-my $dir = $FindBin::Bin;
-
-foreach my $file (qw/00001
-		     00002
-		     00003
-		     00004
-		     00005
-		     00006
-		     02035-substring
-		     02439
-		     05663-unbalanced
-		     07452-deep/) {
-  my $out = $dir . '/../' . $file . '.json';
-
-  my $call = 'perl ' . $dir . '/prepare_index.pl -i ' . $dir . '/../examples/WPD/AAA/' . $file . ' -o ' . $out;
-  print 'Create ' . $out . "\n";
-  system($call);
-
-  print 'Create ' . $out . ".gz\n";
-  $call .= '.gz -z';
-  system($call);
-};

diff --git a/script/prepare_index.pl b/script/korapxml2krill
similarity index 91%
rename from script/prepare_index.pl
rename to script/korapxml2krill
index 486032a..96c3b23 100644
--- a/script/prepare_index.pl
+++ b/script/korapxml2krill

@@ -6,21 +6,26 @@
 use Benchmark qw/:hireswallclock/;
 use IO::Compress::Gzip qw/$GzipError/;
 use Log::Log4perl;
-use KorAP::Document;
-use KorAP::Tokenizer;
 
-our $VERSION = 0.03;
+use KorAP::XML::Krill;
+use KorAP::XML::Tokenizer;
+
+our $VERSION = 0.04;
 
 # Merges foundry data to create indexer friendly documents
 # ndiewald, 2014/10/29
 
+# 2016/02/04
+# - renamed to korapxml2krill
+# - added Schreibgebrauch support
+
 sub printhelp {
   print <<'EOHELP';
 
 Merge foundry data based on a tokenization and create indexer friendly documents.
 
 Call:
-prepare_index.pl -z --input <directory> --output <filename>
+korapxml2krill -z --input <directory> --output <filename>
 
 --input|-i <directory>          Directory of the document to index
 --output|-o <filename>          Document name for output (optional),
@@ -46,7 +51,7 @@
 --log|-l                        The Log4perl log level, defaults to ERROR.
 --help|-h                       Print this document (optional)
 
-diewald@ids-mannheim.de, 2014/11/05
+diewald@ids-mannheim.de, 2016/02/04
 
 EOHELP
   exit(defined $_[0] ? $_[0] : 0);
@@ -107,11 +112,11 @@
   $main::LAST_STOP = $new;
 };
 
-# Call perl script/prepare_index.pl WPD/AAA/00001
+# Call perl script/korapxml2krill WPD/AAA/00001
 
 # Create and parse new document
 $input =~ s{([^/])$}{$1/};
-my $doc = KorAP::Document->new( path => $input );
+my $doc = KorAP::XML::Krill->new( path => $input );
 
 unless ($doc->parse) {
   $log->warn($output . " can't be processed - no document data");
@@ -124,7 +129,7 @@
 };
 
 # Get tokenization
-my $tokens = KorAP::Tokenizer->new(
+my $tokens = KorAP::XML::Tokenizer->new(
   path => $doc->path,
   doc => $doc,
   foundry => $token_base_foundry,
@@ -161,7 +166,7 @@
 push(@layers, ['Glemm', 'Morpho']);
 
 # Malt
-push(@layers, ['Malt', 'Dependency']);
+# push(@layers, ['Malt', 'Dependency']);
 
 # Mate
 push(@layers, ['Mate', 'Morpho']);
@@ -172,14 +177,13 @@
 push(@layers, ['OpenNLP', 'Sentences']);
 
 # Schreibgebrauch
-push(@layers, ['Schreibgebrauch', 'Lemma']);
-push(@layers, ['Schreibgebrauch', 'Morpho']);
+push(@layers, ['Sgbr', 'Lemma']);
+push(@layers, ['Sgbr', 'Morpho']);
 
 # TreeTagger
 push(@layers, ['TreeTagger', 'Morpho']);
 push(@layers, ['TreeTagger', 'Sentences']);
 
-
 # XIP
 push(@layers, ['XIP', 'Morpho']);
 push(@layers, ['XIP', 'Constituency']);

diff --git a/script/wrap_folders.pl b/script/korapxml2krill_dir
similarity index 89%
rename from script/wrap_folders.pl
rename to script/korapxml2krill_dir
index b3f8aad..7c38efc 100644
--- a/script/wrap_folders.pl
+++ b/script/korapxml2krill_dir

@@ -7,6 +7,13 @@
 
 my $local = $FindBin::Bin;
 
+# Changes
+# 2013/11/25
+# - Initial release
+#
+# 2016/02/04
+# - Rename to korapxml2krill_dir
+
 sub printhelp {
   print <<'EOHELP';
 
@@ -14,7 +21,7 @@
 for whole directories.
 
 Call:
-wrap_folders.pl -z --input <directory> --output <directory>
+korapxml2krill_dir -z --input <directory> --output <directory>
 
 --input|-i <directory>          Directory of documents to index
 --output|-o <directory>         Name of output folder
@@ -38,7 +45,7 @@
 --log|-l                        The Log4perl log level, defaults to ERROR.
 --help|-h                       Print this document (optional)
 
-diewald@ids-mannheim.de, 2013/11/25
+diewald@ids-mannheim.de, 2016/02/04
 
 EOHELP
 
@@ -70,7 +77,7 @@
   $file =~ tr/\//-/;
   $file =~ s{^-+}{};
 
-  my $call = 'perl ' . $local . '/prepare_index.pl -i ' . $anno . ' -o ' . $output . '/' . $file . '.json';
+  my $call = 'perl ' . $local . '/korapxml2krill -i ' . $anno . ' -o ' . $output . '/' . $file . '.json';
   $call .= '.gz -z' if $gzip;
   $call .= ' -m' if $text;
   $call .= ' -l ' . $log_level if $log_level;
@@ -78,6 +85,7 @@
   $call .= ' -y ' . $pretty if $pretty;
   $call .= ' -a ' . $_ foreach @allow;
   $call .= ' -s ' . $_ foreach @skip;
+  print "Convert $file\n";
   system($call);
 };
 

diff --git a/script/strip_commercial_annotations b/script/strip_commercial_annotations
deleted file mode 100644
index 7039b04..0000000
--- a/script/strip_commercial_annotations
+++ /dev/null

@@ -1,92 +0,0 @@
-#!/usr/bin/env perl
-use Mojo::Base -strict;
-use Mojo::JSON 'j';
-use Mojo::Util qw/slurp spurt/;
-use Pod::Usage;
-
-####
-# Remove xip and cnx foundries from legacy index files
-# This needs the installation of the Mojolicious package:
-# $ cpan install Mojolicious
-####
-
-our @ARGV;
-my $COMM_FOUNDRIES = qr!(?:xip|cnx|connexor)!;
-
-# Get file info from command line
-my $file     = $ARGV[0] or die pod2usage(1);
-my $out_file = $ARGV[1] || 'clean_' . $file;
-
-# Load file and jsonify
-my $j = j(slurp $file);
-
-# Read fields
-my ($tokens, $stream);
-
-# Clean tokens
-sub _clean ($) {
-  return join ' ', grep { $_ !~ $COMM_FOUNDRIES }
-    split / /, $_
-};
-
-# Legacy index file
-if ($tokens = $j->{fields}->[1]) {
-
-  # Strip annotation info
-  foreach (qw/layerInfo foundries/) {
-    $tokens->{$_} = _clean $tokens->{$_} if $tokens->{$_};
-  };
-
-  # Read data
-  $stream = $tokens->{data};
-}
-
-# New index file
-elsif ($tokens = $j->{data}) {
-  # Strip annotation info
-  foreach (qw/layerInfos foundries/) {
-    $tokens->{$_} = _clean $tokens->{$_} if $tokens->{$_};
-  };
-
-  # Read data
-  $stream = $tokens->{stream};
-};
-
-# Clean data from xip and cnx
-my $clean_data = [];
-foreach my $token (@$stream) {
-  my $clean_token = [];
-  foreach my $term (@$token) {
-    if ($term !~ /^(?:(?:<>|<|>|@|-):)?$COMM_FOUNDRIES/o) {
-      push @$clean_token, $term;
-    };
-  };
-  push @$clean_data, $clean_token;
-};
-
-# Legacy index file
-if ($tokens->{data}) {
-  $tokens->{data} = $clean_data;
-}
-
-# New index file
-elsif ($tokens->{stream}) {
-  $tokens->{stream} = $clean_data;
-};
-
-# Write file
-spurt j($j), $out_file;
-
-__END__
-
-=pod
-
-=head1 NAME
-
-strip_commercial_annotations
-
-=head1 SYNOPSIS
-
-perl strip_commercial_annotations my_file.json [my_clean_file.json]
-
-=cut

diff --git a/t/sgbr/lemma.t b/t/sgbr/lemma.t
index 27fe073..3971b88 100644
--- a/t/sgbr/lemma.t
+++ b/t/sgbr/lemma.t

@@ -26,7 +26,7 @@
 
 ok($tokens->parse, 'Parse tokenization based on lemmata');
 
-ok($tokens->add('Schreibgebrauch', 'Lemma'), 'Add Structure');
+ok($tokens->add('Sgbr', 'Lemma'), 'Add Structure');
 
 my $data = $tokens->to_data->{data};
 

diff --git a/t/sgbr/pos.t b/t/sgbr/pos.t
index 417af0c..feb357a 100644
--- a/t/sgbr/pos.t
+++ b/t/sgbr/pos.t

@@ -26,7 +26,7 @@
 
 ok($tokens->parse, 'Parse tokenization based on lemmata');
 
-ok($tokens->add('Schreibgebrauch', 'Morpho'), 'Add Structure');
+ok($tokens->add('Sgbr', 'Morpho'), 'Add Structure');
 
 my $data = $tokens->to_data->{data};
 

diff --git a/t/transform.t b/t/transform.t
index fd751f7..b2620f3 100644
--- a/t/transform.t
+++ b/t/transform.t

@@ -25,7 +25,7 @@
 };
 
 my @layers;
-# push(@layers, ['Base', 'Sentences']);
+push(@layers, ['Base', 'Sentences']);
 push(@layers, ['Base', 'Paragraphs']);
 
 # OpenNLP
@@ -35,8 +35,18 @@
 # CoreNLP
 push(@layers, ['CoreNLP', 'NamedEntities', 'ne_dewac_175m_600']);
 push(@layers, ['CoreNLP', 'NamedEntities', 'ne_hgc_175m_600']);
+push(@layers, ['CoreNLP', 'NamedEntities']);
 push(@layers, ['CoreNLP', 'Sentences']);
 
+push(@layers, ['DeReKo', 'Structure']);
+
+push(@layers, ['Glemm', 'Morpho']);
+
+push(@layers, ['Mate', 'Morpho']);
+push(@layers, ['Mate', 'Dependency']);
+
+push(@layers, ['Malt', 'Dependency']);
+
 # Connexor
 push(@layers, ['Connexor', 'Morpho']);
 push(@layers, ['Connexor', 'Syntax']);
@@ -48,7 +58,7 @@
 push(@layers, ['TreeTagger', 'Sentences']);
 
 # Mate
-# push(@layers, ['Mate', 'Morpho']);
+push(@layers, ['Mate', 'Morpho']);
 push(@layers, ['Mate', 'Dependency']);
 
 # XIP
commit	93d620e46dba1a790845408bd92571eb22c854e3	[log] [tgz]
author	Akron <nils@diewald-online.de>	Fri Feb 05 19:40:05 2016 +0100
committer	Akron <nils@diewald-online.de>	Fri Feb 05 19:40:05 2016 +0100
tree	2f4846549357ce96ed94e14083ac5248177cda38
parent	6764d55ed817f9f70ce83be86c56f16525cbb3a1 [diff]