Update scripts and sgbr test suite

Change-Id: I3d584fd559d84eca80da6c5fbe257edab79b1017

commit: 93d620e46dba1a790845408bd92571eb22c854e3 [log] [tgz]
author: Akron <nils@diewald-online.de> Fri Feb 05 19:40:05 2016 +0100
committer: Akron <nils@diewald-online.de> Fri Feb 05 19:40:05 2016 +0100
tree: 2f4846549357ce96ed94e14083ac5248177cda38
parent: 6764d55ed817f9f70ce83be86c56f16525cbb3a1 [diff]
diff --git a/script/create_example.pl b/script/create_example.pl
deleted file mode 100755
index 289b946..0000000
--- a/script/create_example.pl
+++ /dev/null

@@ -1,28 +0,0 @@
-#!/usr/bin/env perl
-use strict;
-use warnings;
-use FindBin;
-use v5.16;
-
-my $dir = $FindBin::Bin;
-
-foreach my $file (qw/00001
-		     00002
-		     00003
-		     00004
-		     00005
-		     00006
-		     02035-substring
-		     02439
-		     05663-unbalanced
-		     07452-deep/) {
-  my $out = $dir . '/../' . $file . '.json';
-
-  my $call = 'perl ' . $dir . '/prepare_index.pl -i ' . $dir . '/../examples/WPD/AAA/' . $file . ' -o ' . $out;
-  print 'Create ' . $out . "\n";
-  system($call);
-
-  print 'Create ' . $out . ".gz\n";
-  $call .= '.gz -z';
-  system($call);
-};

diff --git a/script/prepare_index.pl b/script/korapxml2krill
similarity index 91%
rename from script/prepare_index.pl
rename to script/korapxml2krill
index 486032a..96c3b23 100644
--- a/script/prepare_index.pl
+++ b/script/korapxml2krill

@@ -6,21 +6,26 @@
 use Benchmark qw/:hireswallclock/;
 use IO::Compress::Gzip qw/$GzipError/;
 use Log::Log4perl;
-use KorAP::Document;
-use KorAP::Tokenizer;
 
-our $VERSION = 0.03;
+use KorAP::XML::Krill;
+use KorAP::XML::Tokenizer;
+
+our $VERSION = 0.04;
 
 # Merges foundry data to create indexer friendly documents
 # ndiewald, 2014/10/29
 
+# 2016/02/04
+# - renamed to korapxml2krill
+# - added Schreibgebrauch support
+
 sub printhelp {
   print <<'EOHELP';
 
 Merge foundry data based on a tokenization and create indexer friendly documents.
 
 Call:
-prepare_index.pl -z --input <directory> --output <filename>
+korapxml2krill -z --input <directory> --output <filename>
 
 --input|-i <directory>          Directory of the document to index
 --output|-o <filename>          Document name for output (optional),
@@ -46,7 +51,7 @@
 --log|-l                        The Log4perl log level, defaults to ERROR.
 --help|-h                       Print this document (optional)
 
-diewald@ids-mannheim.de, 2014/11/05
+diewald@ids-mannheim.de, 2016/02/04
 
 EOHELP
   exit(defined $_[0] ? $_[0] : 0);
@@ -107,11 +112,11 @@
   $main::LAST_STOP = $new;
 };
 
-# Call perl script/prepare_index.pl WPD/AAA/00001
+# Call perl script/korapxml2krill WPD/AAA/00001
 
 # Create and parse new document
 $input =~ s{([^/])$}{$1/};
-my $doc = KorAP::Document->new( path => $input );
+my $doc = KorAP::XML::Krill->new( path => $input );
 
 unless ($doc->parse) {
   $log->warn($output . " can't be processed - no document data");
@@ -124,7 +129,7 @@
 };
 
 # Get tokenization
-my $tokens = KorAP::Tokenizer->new(
+my $tokens = KorAP::XML::Tokenizer->new(
   path => $doc->path,
   doc => $doc,
   foundry => $token_base_foundry,
@@ -161,7 +166,7 @@
 push(@layers, ['Glemm', 'Morpho']);
 
 # Malt
-push(@layers, ['Malt', 'Dependency']);
+# push(@layers, ['Malt', 'Dependency']);
 
 # Mate
 push(@layers, ['Mate', 'Morpho']);
@@ -172,14 +177,13 @@
 push(@layers, ['OpenNLP', 'Sentences']);
 
 # Schreibgebrauch
-push(@layers, ['Schreibgebrauch', 'Lemma']);
-push(@layers, ['Schreibgebrauch', 'Morpho']);
+push(@layers, ['Sgbr', 'Lemma']);
+push(@layers, ['Sgbr', 'Morpho']);
 
 # TreeTagger
 push(@layers, ['TreeTagger', 'Morpho']);
 push(@layers, ['TreeTagger', 'Sentences']);
 
-
 # XIP
 push(@layers, ['XIP', 'Morpho']);
 push(@layers, ['XIP', 'Constituency']);

diff --git a/script/wrap_folders.pl b/script/korapxml2krill_dir
similarity index 89%
rename from script/wrap_folders.pl
rename to script/korapxml2krill_dir
index b3f8aad..7c38efc 100644
--- a/script/wrap_folders.pl
+++ b/script/korapxml2krill_dir

@@ -7,6 +7,13 @@
 
 my $local = $FindBin::Bin;
 
+# Changes
+# 2013/11/25
+# - Initial release
+#
+# 2016/02/04
+# - Rename to korapxml2krill_dir
+
 sub printhelp {
   print <<'EOHELP';
 
@@ -14,7 +21,7 @@
 for whole directories.
 
 Call:
-wrap_folders.pl -z --input <directory> --output <directory>
+korapxml2krill_dir -z --input <directory> --output <directory>
 
 --input|-i <directory>          Directory of documents to index
 --output|-o <directory>         Name of output folder
@@ -38,7 +45,7 @@
 --log|-l                        The Log4perl log level, defaults to ERROR.
 --help|-h                       Print this document (optional)
 
-diewald@ids-mannheim.de, 2013/11/25
+diewald@ids-mannheim.de, 2016/02/04
 
 EOHELP
 
@@ -70,7 +77,7 @@
   $file =~ tr/\//-/;
   $file =~ s{^-+}{};
 
-  my $call = 'perl ' . $local . '/prepare_index.pl -i ' . $anno . ' -o ' . $output . '/' . $file . '.json';
+  my $call = 'perl ' . $local . '/korapxml2krill -i ' . $anno . ' -o ' . $output . '/' . $file . '.json';
   $call .= '.gz -z' if $gzip;
   $call .= ' -m' if $text;
   $call .= ' -l ' . $log_level if $log_level;
@@ -78,6 +85,7 @@
   $call .= ' -y ' . $pretty if $pretty;
   $call .= ' -a ' . $_ foreach @allow;
   $call .= ' -s ' . $_ foreach @skip;
+  print "Convert $file\n";
   system($call);
 };
 

diff --git a/script/strip_commercial_annotations b/script/strip_commercial_annotations
deleted file mode 100644
index 7039b04..0000000
--- a/script/strip_commercial_annotations
+++ /dev/null

@@ -1,92 +0,0 @@
-#!/usr/bin/env perl
-use Mojo::Base -strict;
-use Mojo::JSON 'j';
-use Mojo::Util qw/slurp spurt/;
-use Pod::Usage;
-
-####
-# Remove xip and cnx foundries from legacy index files
-# This needs the installation of the Mojolicious package:
-# $ cpan install Mojolicious
-####
-
-our @ARGV;
-my $COMM_FOUNDRIES = qr!(?:xip|cnx|connexor)!;
-
-# Get file info from command line
-my $file     = $ARGV[0] or die pod2usage(1);
-my $out_file = $ARGV[1] || 'clean_' . $file;
-
-# Load file and jsonify
-my $j = j(slurp $file);
-
-# Read fields
-my ($tokens, $stream);
-
-# Clean tokens
-sub _clean ($) {
-  return join ' ', grep { $_ !~ $COMM_FOUNDRIES }
-    split / /, $_
-};
-
-# Legacy index file
-if ($tokens = $j->{fields}->[1]) {
-
-  # Strip annotation info
-  foreach (qw/layerInfo foundries/) {
-    $tokens->{$_} = _clean $tokens->{$_} if $tokens->{$_};
-  };
-
-  # Read data
-  $stream = $tokens->{data};
-}
-
-# New index file
-elsif ($tokens = $j->{data}) {
-  # Strip annotation info
-  foreach (qw/layerInfos foundries/) {
-    $tokens->{$_} = _clean $tokens->{$_} if $tokens->{$_};
-  };
-
-  # Read data
-  $stream = $tokens->{stream};
-};
-
-# Clean data from xip and cnx
-my $clean_data = [];
-foreach my $token (@$stream) {
-  my $clean_token = [];
-  foreach my $term (@$token) {
-    if ($term !~ /^(?:(?:<>|<|>|@|-):)?$COMM_FOUNDRIES/o) {
-      push @$clean_token, $term;
-    };
-  };
-  push @$clean_data, $clean_token;
-};
-
-# Legacy index file
-if ($tokens->{data}) {
-  $tokens->{data} = $clean_data;
-}
-
-# New index file
-elsif ($tokens->{stream}) {
-  $tokens->{stream} = $clean_data;
-};
-
-# Write file
-spurt j($j), $out_file;
-
-__END__
-
-=pod
-
-=head1 NAME
-
-strip_commercial_annotations
-
-=head1 SYNOPSIS
-
-perl strip_commercial_annotations my_file.json [my_clean_file.json]
-
-=cut
commit	93d620e46dba1a790845408bd92571eb22c854e3	[log] [tgz]
author	Akron <nils@diewald-online.de>	Fri Feb 05 19:40:05 2016 +0100
committer	Akron <nils@diewald-online.de>	Fri Feb 05 19:40:05 2016 +0100
tree	2f4846549357ce96ed94e14083ac5248177cda38
parent	6764d55ed817f9f70ce83be86c56f16525cbb3a1 [diff]