Added 'extract' method support
Change-Id: I624e79f3400b1935f9b96ceaac43553ed2f4c73c
diff --git a/Changes b/Changes
index 78b7966..f4f2d5e 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,8 @@
+0.12 2016-02-27
+ - Added extract method to korapxml2krill.
+ - Fixed Mate/Dependency.
+ - Fixed skip flag in korapxml2krill.
+
0.11 2016-02-23
- Merged korap2krill and korap2krill_dir.
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index a0a8cbe..18a12b1 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -18,7 +18,7 @@
# Due to the kind of processing, processed metadata may be stored in
# a multiprocess cache instead.
-our $VERSION = '0.11';
+our $VERSION = '0.12';
our @ATTR = qw/text_sigle
doc_sigle
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 03a8088..289c2f4 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -42,9 +42,12 @@
#
# 2016/02/23
# - Merge korapxml2krill and korapxml2krill_dir
+#
+# 2016/02/27
+# - Added extract function
# ----------------------------------------------------------
-our $LAST_CHANGE = '2016/02/23';
+our $LAST_CHANGE = '2016/02/27';
our $LOCAL = $FindBin::Bin;
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
@@ -58,6 +61,8 @@
$cmd = shift @ARGV;
};
+my (@skip, @sigle);
+
# Parse options from the command line
GetOptions(
'input|i=s' => \(my $input),
@@ -66,7 +71,8 @@
'human|m' => \(my $text),
'token|t=s' => \(my $token_base),
'gzip|z' => \(my $gzip),
- 'skip|s=s' => \(my @skip),
+ 'skip|s=s' => \@skip,
+ 'sigle|sg=s' => \@sigle,
'log|l=s' => \(my $log_level = 'ERROR'),
'allow|a=s' => \(my @allow),
'primary|p!' => \(my $primary),
@@ -142,6 +148,9 @@
};
+# Rewrite each sigle of the form 'A_B.C' in place to the path 'A/B/C'
+s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;
+
# Process a single file
unless ($cmd) {
@@ -292,9 +301,44 @@
stop_time;
}
+# Extract the texts given by --sigle from the Zip-file $input into the
+# existing directory $output; exits 0 only if every text was extracted.
+elsif ($cmd eq 'extract') {
+  pod2usage(%ERROR_HASH) unless $output;
+
+  # TODO: Support sigles and full archives
+  if ($output && !-d $output) {
+    print "Directory '$output' does not exist.\n\n";
+    exit(1);
+  };
+
+  # Fail loudly instead of silently doing nothing on unusable input
+  my $archive = (-f $input) ? KorAP::XML::Archive->new($input) : undef;
+  unless ($archive) {
+    print "Unable to read archive '$input'.\n\n";
+    exit(1);
+  };
+  unless ($archive->test_unzip) {
+    print "Unzip is not installed or incompatible.\n\n";
+    exit(1);
+  };
+
+  # Iterate over all given sigles and extract (no archive test is run)
+  my $failed = 0;
+  foreach my $path (@sigle) {
+    my $done = $archive->extract('./' . $path, $output);
+    $failed++ unless $done;
+    print "$path " . ($done ? '' : 'not ') . "extracted.\n";
+  };
+  print "\n";
+  exit($failed ? 1 : 0);
+}
+
# Process an archive
elsif ($cmd eq 'archive') {
+ # TODO: Support sigles
+
pod2usage(%ERROR_HASH) unless $output;
if ($output && (!-e $output || !-d $output)) {
@@ -488,7 +532,11 @@
=item B<archive>
-Process an archive as a Zip-File or a folder of KorAP-XML documents.
+Process an archive as a Zip-file or a folder of KorAP-XML documents.
+
+=item B<extract>
+
+Extract the KorAP-XML files of the texts given by C<--sigle> from a Zip-file.
=back
@@ -552,6 +600,12 @@
Compress the output (expects a defined output file in single processing).
+=item B<--sigle|-sg>
+
+Extract the given text sigles.
+Currently only supported on C<extract>.
+Can be set multiple times.
+
=item B<--log|-l>
The L<Log4perl> log level, defaults to C<ERROR>.