Rename scripts to korapxml2krill(_dir) and update sgbr test suite
Change-Id: I3d584fd559d84eca80da6c5fbe257edab79b1017
diff --git a/script/create_example.pl b/script/create_example.pl
deleted file mode 100755
index 289b946..0000000
--- a/script/create_example.pl
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/usr/bin/env perl
-use strict;
-use warnings;
-use FindBin;
-use v5.16;
-
-my $dir = $FindBin::Bin;
-
-foreach my $file (qw/00001
- 00002
- 00003
- 00004
- 00005
- 00006
- 02035-substring
- 02439
- 05663-unbalanced
- 07452-deep/) {
- my $out = $dir . '/../' . $file . '.json';
-
- my $call = 'perl ' . $dir . '/prepare_index.pl -i ' . $dir . '/../examples/WPD/AAA/' . $file . ' -o ' . $out;
- print 'Create ' . $out . "\n";
- system($call);
-
- print 'Create ' . $out . ".gz\n";
- $call .= '.gz -z';
- system($call);
-};
diff --git a/script/prepare_index.pl b/script/korapxml2krill
similarity index 91%
rename from script/prepare_index.pl
rename to script/korapxml2krill
index 486032a..96c3b23 100644
--- a/script/prepare_index.pl
+++ b/script/korapxml2krill
@@ -6,21 +6,26 @@
use Benchmark qw/:hireswallclock/;
use IO::Compress::Gzip qw/$GzipError/;
use Log::Log4perl;
-use KorAP::Document;
-use KorAP::Tokenizer;
-our $VERSION = 0.03;
+use KorAP::XML::Krill;
+use KorAP::XML::Tokenizer;
+
+our $VERSION = 0.04;
# Merges foundry data to create indexer friendly documents
# ndiewald, 2014/10/29
+# 2016/02/04
+# - renamed to korapxml2krill
+# - added Schreibgebrauch support
+
sub printhelp {
print <<'EOHELP';
Merge foundry data based on a tokenization and create indexer friendly documents.
Call:
-prepare_index.pl -z --input <directory> --output <filename>
+korapxml2krill -z --input <directory> --output <filename>
--input|-i <directory> Directory of the document to index
--output|-o <filename> Document name for output (optional),
@@ -46,7 +51,7 @@
--log|-l The Log4perl log level, defaults to ERROR.
--help|-h Print this document (optional)
-diewald@ids-mannheim.de, 2014/11/05
+diewald@ids-mannheim.de, 2016/02/04
EOHELP
exit(defined $_[0] ? $_[0] : 0);
@@ -107,11 +112,11 @@
$main::LAST_STOP = $new;
};
-# Call perl script/prepare_index.pl WPD/AAA/00001
+# Call perl script/korapxml2krill WPD/AAA/00001
# Create and parse new document
$input =~ s{([^/])$}{$1/};
-my $doc = KorAP::Document->new( path => $input );
+my $doc = KorAP::XML::Krill->new( path => $input );
unless ($doc->parse) {
$log->warn($output . " can't be processed - no document data");
@@ -124,7 +129,7 @@
};
# Get tokenization
-my $tokens = KorAP::Tokenizer->new(
+my $tokens = KorAP::XML::Tokenizer->new(
path => $doc->path,
doc => $doc,
foundry => $token_base_foundry,
@@ -161,7 +166,7 @@
push(@layers, ['Glemm', 'Morpho']);
# Malt
-push(@layers, ['Malt', 'Dependency']);
+# push(@layers, ['Malt', 'Dependency']);
# Mate
push(@layers, ['Mate', 'Morpho']);
@@ -172,14 +177,13 @@
push(@layers, ['OpenNLP', 'Sentences']);
# Schreibgebrauch
-push(@layers, ['Schreibgebrauch', 'Lemma']);
-push(@layers, ['Schreibgebrauch', 'Morpho']);
+push(@layers, ['Sgbr', 'Lemma']);
+push(@layers, ['Sgbr', 'Morpho']);
# TreeTagger
push(@layers, ['TreeTagger', 'Morpho']);
push(@layers, ['TreeTagger', 'Sentences']);
-
# XIP
push(@layers, ['XIP', 'Morpho']);
push(@layers, ['XIP', 'Constituency']);
diff --git a/script/wrap_folders.pl b/script/korapxml2krill_dir
similarity index 89%
rename from script/wrap_folders.pl
rename to script/korapxml2krill_dir
index b3f8aad..7c38efc 100644
--- a/script/wrap_folders.pl
+++ b/script/korapxml2krill_dir
@@ -7,6 +7,13 @@
my $local = $FindBin::Bin;
+# Changes
+# 2013/11/25
+# - Initial release
+#
+# 2016/02/04
+# - Renamed to korapxml2krill_dir
+
sub printhelp {
print <<'EOHELP';
@@ -14,7 +21,7 @@
for whole directories.
Call:
-wrap_folders.pl -z --input <directory> --output <directory>
+korapxml2krill_dir -z --input <directory> --output <directory>
--input|-i <directory> Directory of documents to index
--output|-o <directory> Name of output folder
@@ -38,7 +45,7 @@
--log|-l The Log4perl log level, defaults to ERROR.
--help|-h Print this document (optional)
-diewald@ids-mannheim.de, 2013/11/25
+diewald@ids-mannheim.de, 2016/02/04
EOHELP
@@ -70,7 +77,7 @@
$file =~ tr/\//-/;
$file =~ s{^-+}{};
- my $call = 'perl ' . $local . '/prepare_index.pl -i ' . $anno . ' -o ' . $output . '/' . $file . '.json';
+ my $call = 'perl ' . $local . '/korapxml2krill -i ' . $anno . ' -o ' . $output . '/' . $file . '.json';
$call .= '.gz -z' if $gzip;
$call .= ' -m' if $text;
$call .= ' -l ' . $log_level if $log_level;
@@ -78,6 +85,7 @@
$call .= ' -y ' . $pretty if $pretty;
$call .= ' -a ' . $_ foreach @allow;
$call .= ' -s ' . $_ foreach @skip;
+ print "Convert $file\n";
system($call);
};
diff --git a/script/strip_commercial_annotations b/script/strip_commercial_annotations
deleted file mode 100644
index 7039b04..0000000
--- a/script/strip_commercial_annotations
+++ /dev/null
@@ -1,92 +0,0 @@
-#!/usr/bin/env perl
-use Mojo::Base -strict;
-use Mojo::JSON 'j';
-use Mojo::Util qw/slurp spurt/;
-use Pod::Usage;
-
-####
-# Remove xip and cnx foundries from legacy index files
-# This needs the installation of the Mojolicious package:
-# $ cpan install Mojolicious
-####
-
-our @ARGV;
-my $COMM_FOUNDRIES = qr!(?:xip|cnx|connexor)!;
-
-# Get file info from command line
-my $file = $ARGV[0] or die pod2usage(1);
-my $out_file = $ARGV[1] || 'clean_' . $file;
-
-# Load file and jsonify
-my $j = j(slurp $file);
-
-# Read fields
-my ($tokens, $stream);
-
-# Clean tokens
-sub _clean ($) {
- return join ' ', grep { $_ !~ $COMM_FOUNDRIES }
- split / /, $_
-};
-
-# Legacy index file
-if ($tokens = $j->{fields}->[1]) {
-
- # Strip annotation info
- foreach (qw/layerInfo foundries/) {
- $tokens->{$_} = _clean $tokens->{$_} if $tokens->{$_};
- };
-
- # Read data
- $stream = $tokens->{data};
-}
-
-# New index file
-elsif ($tokens = $j->{data}) {
- # Strip annotation info
- foreach (qw/layerInfos foundries/) {
- $tokens->{$_} = _clean $tokens->{$_} if $tokens->{$_};
- };
-
- # Read data
- $stream = $tokens->{stream};
-};
-
-# Clean data from xip and cnx
-my $clean_data = [];
-foreach my $token (@$stream) {
- my $clean_token = [];
- foreach my $term (@$token) {
- if ($term !~ /^(?:(?:<>|<|>|@|-):)?$COMM_FOUNDRIES/o) {
- push @$clean_token, $term;
- };
- };
- push @$clean_data, $clean_token;
-};
-
-# Legacy index file
-if ($tokens->{data}) {
- $tokens->{data} = $clean_data;
-}
-
-# New index file
-elsif ($tokens->{stream}) {
- $tokens->{stream} = $clean_data;
-};
-
-# Write file
-spurt j($j), $out_file;
-
-__END__
-
-=pod
-
-=head1 NAME
-
-strip_commercial_annotations
-
-=head1 SYNOPSIS
-
-perl strip_commercial_annotations my_file.json [my_clean_file.json]
-
-=cut