Added extraction method for documents in archives Change-Id: Id4ea7d9801a5750c77f81a2251d389adb6e06d31

commit: 20807585105ae0b14f64d6acdae5ba77f7ed8c44 [log] [tgz]
author: Akron <nils@diewald-online.de> Wed Oct 26 17:11:34 2016 +0200
committer: Akron <nils@diewald-online.de> Wed Oct 26 17:11:34 2016 +0200
tree: cc0bd9b70cad7b6e00ab4e518015168b1d646c82
parent: 7606afa12ef7f5ed91d5e63b578fd446527dffa8 [diff]
diff --git a/lib/KorAP/XML/Archive.pm b/lib/KorAP/XML/Archive.pm
index 56e0622..17a4f53 100644
--- a/lib/KorAP/XML/Archive.pm
+++ b/lib/KorAP/XML/Archive.pm

@@ -59,6 +59,16 @@
 };
 
 
+# Check, if the archive has a prefix
+sub check_prefix {
+  my $self = shift;
+  my $nr = shift // 0;
+  my $file = $self->[$nr]->[0];
+  my ($header) = `unzip -l -UU -qq $file "*/header.xml"`;
+  return $header =~ m![\s\t]\.[/\\]! ? 1 : 0;
+};
+
+
 # Split a text path to prefix, corpus, document, text
 sub split_path {
   my $self = shift;
@@ -76,12 +86,12 @@
   };
 
   # Unix form
-  if ($text_path =~ m!^([^/]+?)/([^/]+?)/([^/]+?)$!) {
+  if ($text_path =~ m!^([^/]+?)/([^/]+?)[\\/]([^/]+?)$!) {
     return ($prefix, $1, $2, $3);
   }
 
   # Windows form
-  elsif ($text_path =~ m!^([^\\]+?)\\([^\\]+?)\\([^\\]+?)$!) {
+  elsif ($text_path =~ m!^([^\\]+?)\\([^\\]+?)[\\/]([^\\]+?)$!) {
     return ($prefix, $1, $2, $3);
   };
 
@@ -127,9 +137,65 @@
 };
 
 
+# Extract document files to a directory
+sub extract_doc {
+  my $self = shift;
+  my $doc_path = shift;
+  my $target_dir = shift;
 
-# Extract files to a directory
-sub extract {
+  my $first = 1;
+
+  my @init_cmd = (
+    'unzip',          # Use unzip program
+    '-qo',            # quietly overwrite all existing files
+    '-d', $target_dir # Extract into target directory
+  );
+
+  my ($prefix, $corpus, $doc) = $self->split_path($doc_path . '/UNKNOWN' ) or return;
+
+  # Iterate over all attached archives
+  foreach my $archive (@$self) {
+
+    # $_ is the zip
+    my @cmd = @init_cmd;
+    push(@cmd, $archive->[0]); # Extract from zip
+
+    # Add some interesting files for extraction
+    # Can't use catfile(), as this removes the '.' prefix
+    my @breadcrumbs = ($corpus);
+
+    # If the prefix is not forbidden - prefix!
+    unshift @breadcrumbs, $prefix if ($prefix && $archive->[1]);
+
+    if ($first) {
+      # Only extract from first file
+      push(@cmd, join('/', @breadcrumbs, 'header.xml'));
+      # push(@cmd, join('/', @breadcrumbs, $doc, 'header.xml'));
+      $first = 0;
+    };
+
+    # With prefix
+    push @breadcrumbs, $doc, '*';
+
+    push(@cmd, join('/', @breadcrumbs));
+
+    # Run system call
+    system(@cmd);
+
+    # Check for return code
+    if ($? != 0) {
+      carp("System call '" . join(' ', @cmd) . "' errors " . $?);
+      return;
+    };
+  };
+
+  # Fine
+  return 1;
+};
+
+
+# Extract text files to a directory
+sub extract_text {
   my $self = shift;
   my $text_path = shift;
   my $target_dir = shift;
@@ -201,12 +267,14 @@
 
 =head1 attach
 
+=head1 check_prefix
+
 =head1 list_texts
 
 Returns all texts found in the zip file
 
-=head1 extract
+=head1 extract_text
 
-  $archive->extract('./GOE/AGU/0004', '~/temp');
+  $archive->extract_text('./GOE/AGU/0004', '~/temp');
 
 Extract all files for the named text to a certain directory.

diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 6d42dbc..6ba77db 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm

@@ -15,7 +15,7 @@
 use Data::Dumper;
 use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/;
 
-our $VERSION = '0.21';
+our $VERSION = '0.22';
 
 has 'path';
 has [qw/text_sigle doc_sigle corpus_sigle/];
commit	20807585105ae0b14f64d6acdae5ba77f7ed8c44	[log] [tgz]
author	Akron <nils@diewald-online.de>	Wed Oct 26 17:11:34 2016 +0200
committer	Akron <nils@diewald-online.de>	Wed Oct 26 17:11:34 2016 +0200
tree	cc0bd9b70cad7b6e00ab4e518015168b1d646c82
parent	7606afa12ef7f5ed91d5e63b578fd446527dffa8 [diff]