Added extraction method for documents in archives

Change-Id: Id4ea7d9801a5750c77f81a2251d389adb6e06d31

commit: 20807585105ae0b14f64d6acdae5ba77f7ed8c44 [log] [tgz]
author: Akron <nils@diewald-online.de> Wed Oct 26 17:11:34 2016 +0200
committer: Akron <nils@diewald-online.de> Wed Oct 26 17:11:34 2016 +0200
tree: cc0bd9b70cad7b6e00ab4e518015168b1d646c82
parent: 7606afa12ef7f5ed91d5e63b578fd446527dffa8 [diff] [blame]
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 2713342..d00ba9e 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill

@@ -332,6 +332,40 @@
         # TODO: Make this OS independent
         push @sigle, join '/', $corpus, $doc, $text;
       };
+    }
+
+    # Check sigle for doc sigles
+    else {
+      my @new_sigle;
+
+      my $prefix_check = 0;
+
+      # Iterate over all sigle
+      foreach (@sigle) {
+
+        # Sigle is a doc sigle
+        if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
+          print "$_ ";
+
+          # Check if a prefix is needed
+          unless ($prefix_check) {
+            $prefix = $archive->check_prefix;
+            $prefix_check = 1;
+          };
+
+          # TODO: Make this OS independent
+          print '' . (
+            $archive->extract_doc(
+              ($prefix ? './' : '') . $_, $output
+            ) ? '' : 'not '
+          );
+          print "extracted.\n";
+        }
+        else {
+          push @new_sigle, $_;
+        };
+      };
+      @sigle = @new_sigle;
     };
 
     # Iterate over all given sigles and extract
@@ -340,7 +374,7 @@
 
       # TODO: Make this OS independent
       print '' . (
-        $archive->extract(
+        $archive->extract_text(
           ($prefix ? './' : '') . $_, $output
         ) ? '' : 'not '
       );
@@ -474,7 +508,7 @@
       # because extraction can be horrible slow!
 
       # Extract from archive
-      if ($archive->extract($dirs[$i], $temp)) {
+      if ($archive->extract_text($dirs[$i], $temp)) {
 
         # Create corpus directory
         my $input = catdir("$temp", $corpus);
@@ -533,8 +567,8 @@
 =head1 SYNOPSIS
 
   $ korapxml2krill -z --input <directory> --output <filename>
+  $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
   $ korapxml2krill archive -z --input <directory|archive> --output <directory>
-  $ korapxml2krill extract --input <directory|archive> --output <filename> --sigle <SIGLE>
 
 
 =head1 DESCRIPTION
@@ -684,10 +718,11 @@
 
 =item B<--sigle|-sg>
 
-Extract the given text sigles.
+Extract the given texts.
 Can be set multiple times.
 I<Currently only supported on C<extract>.>
 Sigles have the structure C<Corpus>/C<Document>/C<Text>.
+If the C<Text> part is omitted, the whole document is extracted.
 
 =item B<--log|-l>
commit	20807585105ae0b14f64d6acdae5ba77f7ed8c44	[log] [tgz]
author	Akron <nils@diewald-online.de>	Wed Oct 26 17:11:34 2016 +0200
committer	Akron <nils@diewald-online.de>	Wed Oct 26 17:11:34 2016 +0200
tree	cc0bd9b70cad7b6e00ab4e518015168b1d646c82
parent	7606afa12ef7f5ed91d5e63b578fd446527dffa8 [diff] [blame]