Added archive support to korapxml2krill_dir
Change-Id: Ib62934a08628db3667891a1562acbf0149c17482
diff --git a/Changes b/Changes
index 7350491..8a28f26 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,7 @@
+0.08 2016-02-14
+ - Added support for archive streaming.
+ - Improved scripts.
+
0.07 2016-02-13
- Improved support for Schreibgebrauch meta data
(IDS flavour).
diff --git a/MANIFEST b/MANIFEST
index a63f2e9..fc4299e 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -1,5 +1,6 @@
lib/KorAP/XML/Krill.pm
lib/KorAP/XML/Log.pm
+lib/KorAP/XML/Archive.pm
lib/KorAP/XML/Tokenizer.pm
lib/KorAP/XML/Tokenizer/Match.pm
lib/KorAP/XML/Tokenizer/Range.pm
@@ -40,6 +41,7 @@
lib/KorAP/XML/Index/XIP/Dependency.pm
lib/KorAP/XML/Index/XIP/Morpho.pm
lib/KorAP/XML/Index/XIP/Sentences.pm
+t/archive.t
t/meta.t
t/primary.t
t/range.t
@@ -84,6 +86,7 @@
t/sgbr/meta_ids.t
t/sgbr/pos.t
t/sgbr/token.t
+t/corpus/archive.zip
t/corpus/BZK/header.xml
t/corpus/GOE/header.xml
t/corpus/VDI/header.xml
diff --git a/Makefile.PL b/Makefile.PL
index 1b99119..7bfcb5e 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -21,6 +21,7 @@
'Array::IntSpan' => 2.003,
'List::MoreUtils' => 0.33,
'IO::Dir::Recursive' => 0.03,
+ 'File::Temp' => 0,
'Directory::Iterator' => 0,
'Benchmark' => 0,
'Carp' => 0,
diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
new file mode 100644
index 0000000..d8dbc27
--- /dev/null
+++ b/lib/KorAP/XML/Archive.pm
@@ -0,0 +1,148 @@
+package KorAP::XML::Archive;
+use Carp qw/carp/;
+use File::Spec::Functions qw(rel2abs);
+use strict;
+use warnings;
+
+# Convert new archive helper
+sub new {
+ my $class = shift;
+ my $file = shift or return;
+ bless \$file, $class;
+};
+
+
+# Check if unzip is installed
+sub test_unzip {
+ return 1 if grep { -x "$_/unzip"} split /:/, $ENV{PATH};
+ return;
+};
+
+# Check the compressed archive
+sub test {
+ my $self = shift;
+ my $file = $$self;
+ my $out = `unzip -t $file`;
+ if ($out =~ /no errors/i) {
+ return 1;
+ };
+ return 0;
+};
+
+
+# List all text paths contained in the file
+sub list_texts {
+ my $self = shift;
+ my $file = $$self;
+ my %texts;
+ foreach (`unzip -l $file *.xml`) {
+ if ($_ =~ m![\t\s]
+ ((?:\./)?
+ [^\t\s/\.]+?/ # Corpus
+ [^\t\s/]+?/ # Document
+ [^\t\s/]+? # Text
+ )/(?:[^/]+?)\.xml$!x) {
+ $texts{$1} = 1;
+ };
+ };
+
+ return sort {$a cmp $b} keys %texts;
+};
+
+
+# Split a text path to prefix, corpus, document, text
+sub split_path {
+ my $self = shift;
+ my $text_path = shift;
+
+ unless ($text_path) {
+ carp('No text path given');
+ return 0;
+ };
+
+ # Check for '.' prefix in text
+ my $prefix = '';
+ if ($text_path =~ s!^\./!!) {
+ $prefix = '.';
+ };
+
+ # Unix form
+ if ($text_path =~ m!^([^/]+?)/([^/]+?)/([^/]+?)$!) {
+ return ($prefix, $1, $2, $3);
+ }
+
+ # Windows form
+ elsif ($text_path =~ m!^([^\\]+?)\\([^\\]+?)\\([^\\]+?)$!) {
+ return ($prefix, $1, $2, $3);
+ };
+
+ # Text has not the expected pattern
+ carp $text_path . ' is not a well-formed text path in ' . $$self;
+ return;
+};
+
+
+# Get the archives path
+sub path {
+ return rel2abs(${$_[0]});
+};
+
+
+# Extract files to a directory
+sub extract {
+ my $self = shift;
+ my $text_path = shift;
+ my $target_dir = shift;
+
+ my @cmd = (
+ 'unzip', # Use unzip program
+ '-qo', # quietly overwrite all existing files
+ '-d', $target_dir # Extract into target directory
+ );
+
+ push(@cmd, $$self); # Extract from zip
+
+ my ($prefix, $corpus, $doc, $text) = $self->split_path($text_path) or return;
+
+ # Add some interesting files for extraction
+ # Can't use catfile(), as this removes the '.' prefix
+ push(@cmd, join('/', $prefix, $corpus, 'header.xml'));
+ push(@cmd, join('/', $prefix, $corpus, $doc, 'header.xml'));
+ push(@cmd, join('/', $prefix, $corpus, $doc, $text, '*'));
+
+ # Run system call
+ system(@cmd);
+
+ # Check for return code
+ if ($? != 0) {
+ carp("System call '" . join(' ', @cmd) . "' errors " . $?);
+ return;
+ };
+
+ # Fine
+ return 1;
+};
+
+
+1;
+
+__END__
+
+=POD
+
+C<KorAP::XML::Archive> expects the unzip tool to be installed.
+
+
+=head1 new
+
+=head1 test
+
+=head1 list_texts
+
+Returns all texts found in the zip file
+
+=head1 extract
+
+ $archive->extract('./GOE/AGU/0004', '~/temp');
+
+Extract all files for the named text to a certain directory.
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 404c2b2..decae35 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -17,7 +17,7 @@
# Due to the kind of processing, processed metadata may be stored in
# a multiprocess cache instead.
-our $VERSION = '0.07';
+our $VERSION = '0.08';
our @ATTR = qw/text_sigle
doc_sigle
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 5ec0805..9e2d1e8 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -10,8 +10,6 @@
use KorAP::XML::Krill;
use KorAP::XML::Tokenizer;
-our $VERSION = 0.04;
-
# Merges foundry data to create indexer friendly documents
# ndiewald, 2014/10/29
@@ -21,7 +19,14 @@
#
# 2016/02/12
# - fixed foundry skipping
+#
+# 2016/02/14
+# - Added version information
+sub printversion {
+ print "Version " . $KorAP::XML::Krill::VERSION . "\n\n";
+ exit(1);
+};
sub printhelp {
print <<'EOHELP';
@@ -54,8 +59,9 @@
(expects a defined output file)
--log|-l The Log4perl log level, defaults to ERROR.
--help|-h Print this document (optional)
+ --version|-v Print version information
-diewald@ids-mannheim.de, 2016/02/12
+diewald@ids-mannheim.de, 2016/02/14
EOHELP
exit(defined $_[0] ? $_[0] : 0);
@@ -76,7 +82,8 @@
'allow|a=s' => \@allow,
'primary|p!' => \$primary,
'pretty|y' => \$pretty,
- 'help|h' => sub { printhelp }
+ 'help|h' => sub { printhelp },
+ 'version|v' => sub { printversion }
);
printhelp(1) if !$input || ($gzip && !$output);
diff --git a/script/korapxml2krill_dir b/script/korapxml2krill_dir
index 7b048cf..5b09566 100644
--- a/script/korapxml2krill_dir
+++ b/script/korapxml2krill_dir
@@ -1,9 +1,14 @@
#!/usr/bin/env perl
use strict;
use warnings;
+use lib 'lib';
use FindBin;
+use File::Temp qw/tempdir/;
+use File::Spec::Functions qw/catfile catdir/;
use Getopt::Long;
use Directory::Iterator;
+use KorAP::XML::Krill;
+use KorAP::XML::Archive;
my $local = $FindBin::Bin;
@@ -16,6 +21,14 @@
#
# 2016/02/12
# - Support overwrite
+#
+# 2016/02/14
+# - Added version information
+
+sub printversion {
+ print "Version " . $KorAP::XML::Krill::VERSION . "\n\n";
+ exit(1);
+};
sub printhelp {
print <<'EOHELP';
@@ -26,7 +39,7 @@
Call:
korapxml2krill_dir -z --input <directory> --output <directory>
- --input|-i <directory> Directory of documents to index
+ --input|-i <directory|file> Directory or archive file of documents to index
--output|-o <directory> Name of output folder
--overwrite|-w Overwrite files that already exist
--token|-t <foundry>[#<layer>] Define the default tokenization by specifying
@@ -48,8 +61,9 @@
(expects a defined output file)
--log|-l The Log4perl log level, defaults to ERROR.
--help|-h Print this document (optional)
+ --version|-v Print version information
-diewald@ids-mannheim.de, 2016/02/12
+diewald@ids-mannheim.de, 2016/02/14
EOHELP
@@ -70,12 +84,14 @@
'allow|a=s' => \@allow,
'primary|p!' => \$primary,
'pretty|y' => \$pretty,
- 'help|h' => sub { printhelp }
+ 'help|h' => sub { printhelp },
+ 'version|v' => sub { printversion }
);
printhelp(1) if !$input || !$output;
+# write file
sub write_file {
my $anno = shift;
my $file = $anno;
@@ -98,23 +114,71 @@
print "\n";
};
-
-my $it = Directory::Iterator->new($input);
-my @dirs;
-my $dir;
-while (1) {
-
+# Input is a directory
+if (-d $input) {
+ my $it = Directory::Iterator->new($input);
+ my @dirs;
+ my $dir;
+ while (1) {
if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
- push @dirs, $dir;
- $it->prune;
+ push @dirs, $dir;
+ $it->prune;
};
- last unless $it->next;
-};
+ last unless $it->next;
+ };
-my $count = scalar @dirs;
-for (my $i = 0; $i < $count; $i++) {
- print 'Convert [' . ($i + 1) . "/$count] ";
- write_file($dirs[$i]);
+ my $count = scalar @dirs;
+ for (my $i = 0; $i < $count; $i++) {
+ print 'Convert [' . ($i + 1) . "/$count] ";
+ write_file($dirs[$i]);
+ };
+}
+
+# Input is a file
+elsif (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {
+ unless ($archive->test_unzip) {
+ print "Unzip is not installed or incompatible.\n\n";
+ exit(1);
+ };
+
+ unless ($archive->test) {
+ print "Zip archive not compatible.\n\n";
+ exit(1);
+ };
+
+ my @dirs = $archive->list_texts;
+ my $count = scalar @dirs;
+ for (my $i = 0; $i < $count; $i++) {
+ print 'Convert [' . ($i + 1) . "/$count] ";
+
+ # Split path information
+ my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
+
+ # Create temporary file
+ my $temp = tempdir(CLEANUP => 1);
+
+ # Extract from archive
+ if ($archive->extract($dirs[$i], $temp)) {
+
+ # Create corpus directory
+ $input = catdir($temp, $corpus);
+
+ # Temporary directory
+ my $dir = catdir($input, $doc, $text);
+
+ # Write file
+ write_file($dir);
+ }
+ else {
+ print "Unable to extract " . $dirs[$i] . "\n";
+ };
+
+ $temp = 0;
+ };
+}
+
+else {
+ print "Input is neither a directory nor an archive.\n\n";
};
diff --git a/t/archive.t b/t/archive.t
new file mode 100644
index 0000000..4a729b6
--- /dev/null
+++ b/t/archive.t
@@ -0,0 +1,47 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Test::More;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catfile catdir/;
+use File::Temp qw/tempdir/;
+
+use_ok('KorAP::XML::Archive');
+
+my $file = catfile(dirname(__FILE__), 'corpus','archive.zip');
+my $archive = KorAP::XML::Archive->new($file);
+
+unless ($archive->test_unzip) {
+ plan skip_all => 'unzip not found';
+};
+
+ok($archive->test, 'Test archive');
+like($archive->path, qr/archive\.zip$/, 'Archive path');
+
+my @list = $archive->list_texts;
+is(scalar @list, 10, 'Found all tests');
+is($list[0], './TEST/BSP/1', 'First document');
+is($list[-1], './TEST/BSP/9', 'First document');
+
+my @path = $archive->split_path('./TEST/BSP/9');
+is($path[0],'.', 'Prefix');
+is($path[1],'TEST', 'Prefix');
+is($path[2],'BSP', 'Prefix');
+is($path[3],'9', 'Prefix');
+
+my $dir = tempdir(CLEANUP => 1);
+
+{
+ local $SIG{__WARN__} = sub {};
+ ok($archive->extract('./TEST/BSP/8', $dir), 'Wrong path');
+};
+
+ok(-d catdir($dir, 'TEST'), 'Test corpus directory exists');
+ok(-f catdir($dir, 'TEST', 'header.xml'), 'Test corpus header exists');
+ok(-d catdir($dir, 'TEST', 'BSP'), 'Test doc directory exists');
+ok(-f catdir($dir, 'TEST', 'BSP', 'header.xml'), 'Test doc header exists');
+
+
+done_testing;
+
+__END__
diff --git a/t/corpus/archive.zip b/t/corpus/archive.zip
new file mode 100644
index 0000000..b88d147
--- /dev/null
+++ b/t/corpus/archive.zip
Binary files differ