Added archive support to korapxml2krill_dir
Change-Id: Ib62934a08628db3667891a1562acbf0149c17482
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 5ec0805..9e2d1e8 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -10,8 +10,6 @@
use KorAP::XML::Krill;
use KorAP::XML::Tokenizer;
-our $VERSION = 0.04;
-
# Merges foundry data to create indexer friendly documents
# ndiewald, 2014/10/29
@@ -21,7 +19,14 @@
#
# 2016/02/12
# - fixed foundry skipping
+#
+# 2016/02/14
+# - Added version information
+sub printversion { # Print the KorAP::XML::Krill version and terminate
+ print "Version " . $KorAP::XML::Krill::VERSION . "\n\n";
+ exit(0); # Answering --version is a success, not an error
+};
sub printhelp {
print <<'EOHELP';
@@ -54,8 +59,9 @@
(expects a defined output file)
--log|-l The Log4perl log level, defaults to ERROR.
--help|-h Print this document (optional)
+ --version|-v Print version information
-diewald@ids-mannheim.de, 2016/02/12
+diewald@ids-mannheim.de, 2016/02/14
EOHELP
exit(defined $_[0] ? $_[0] : 0);
@@ -76,7 +82,8 @@
'allow|a=s' => \@allow,
'primary|p!' => \$primary,
'pretty|y' => \$pretty,
- 'help|h' => sub { printhelp }
+ 'help|h' => sub { printhelp },
+ 'version|v' => sub { printversion }
);
printhelp(1) if !$input || ($gzip && !$output);
diff --git a/script/korapxml2krill_dir b/script/korapxml2krill_dir
index 7b048cf..5b09566 100644
--- a/script/korapxml2krill_dir
+++ b/script/korapxml2krill_dir
@@ -1,9 +1,14 @@
#!/usr/bin/env perl
use strict;
use warnings;
+use lib 'lib';
use FindBin;
+use File::Temp qw/tempdir/;
+use File::Spec::Functions qw/catfile catdir/;
use Getopt::Long;
use Directory::Iterator;
+use KorAP::XML::Krill;
+use KorAP::XML::Archive;
my $local = $FindBin::Bin;
@@ -16,6 +21,14 @@
#
# 2016/02/12
# - Support overwrite
+#
+# 2016/02/14
+# - Added version information and archive support
+
+sub printversion { # Print the KorAP::XML::Krill version and terminate
+ print "Version " . $KorAP::XML::Krill::VERSION . "\n\n";
+ exit(0); # Answering --version is a success, not an error
+};
sub printhelp {
print <<'EOHELP';
@@ -26,7 +39,7 @@
Call:
korapxml2krill_dir -z --input <directory> --output <directory>
- --input|-i <directory> Directory of documents to index
+ --input|-i <directory|file> Directory or archive file of documents to index
--output|-o <directory> Name of output folder
--overwrite|-w Overwrite files that already exist
--token|-t <foundry>[#<layer>] Define the default tokenization by specifying
@@ -48,8 +61,9 @@
(expects a defined output file)
--log|-l The Log4perl log level, defaults to ERROR.
--help|-h Print this document (optional)
+ --version|-v Print version information
-diewald@ids-mannheim.de, 2016/02/12
+diewald@ids-mannheim.de, 2016/02/14
EOHELP
@@ -70,12 +84,14 @@
'allow|a=s' => \@allow,
'primary|p!' => \$primary,
'pretty|y' => \$pretty,
- 'help|h' => sub { printhelp }
+ 'help|h' => sub { printhelp },
+ 'version|v' => sub { printversion }
);
printhelp(1) if !$input || !$output;
+# write file
sub write_file {
my $anno = shift;
my $file = $anno;
@@ -98,23 +114,71 @@
print "\n";
};
-
-my $it = Directory::Iterator->new($input);
-my @dirs;
-my $dir;
-while (1) {
-
+# Input is a directory: every folder holding a data.xml is one text to convert
+if (-d $input) {
+ my $it = Directory::Iterator->new($input);
+ my @dirs;
+ my $dir;
+ while (1) {
if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
- push @dirs, $dir;
- $it->prune;
+ push @dirs, $dir;
+ $it->prune;
};
- last unless $it->next;
-};
+ last unless $it->next;
+ };
-my $count = scalar @dirs;
-for (my $i = 0; $i < $count; $i++) {
- print 'Convert [' . ($i + 1) . "/$count] ";
- write_file($dirs[$i]);
+ my $count = scalar @dirs;
+ for (my $i = 0; $i < $count; $i++) {
+ print 'Convert [' . ($i + 1) . "/$count] ";
+ write_file($dirs[$i]);
+ };
+}
+
+# Input is a file: open it as an archive and convert it text by text
+elsif (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {
+ unless ($archive->test_unzip) {
+ print "Unzip is not installed or incompatible.\n\n";
+ exit(1);
+ };
+
+ unless ($archive->test) {
+ print "Zip archive not compatible.\n\n";
+ exit(1);
+ };
+
+ my @dirs = $archive->list_texts;
+ my $count = scalar @dirs;
+ for (my $i = 0; $i < $count; $i++) {
+ print 'Convert [' . ($i + 1) . "/$count] ";
+
+ # Split path information
+ my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
+
+ # Temporary directory object; the folder is deleted as soon as it is released
+ my $temp = File::Temp->newdir;
+
+ # Extract from archive
+ if ($archive->extract($dirs[$i], $temp->dirname)) {
+
+ # Repoint $input at the extracted corpus (presumably read by write_file - confirm)
+ $input = catdir($temp->dirname, $corpus);
+
+ # Location of the extracted text
+ my $dir = catdir($input, $doc, $text);
+
+ # Write file
+ write_file($dir);
+ }
+ else {
+ print "Unable to extract " . $dirs[$i] . "\n";
+ };
+
+ undef $temp; # Release now, so extracted texts do not pile up until exit
+ };
+}
+else {
+ print "Input is neither a directory nor an archive.\n\n";
+ exit(1); # Unusable input is a failure, like the archive checks above
};