Introduced sequential extraction flag to circumvent troubles with parallel extraction Change-Id: I7fb50e60ff527c7e79b59fac3cf957fdf4b989ac

commit: 9ec8887163c4f0ec9ae012948516fdcd2cb6ad3d [log] [tgz]
author: Akron <nils@diewald-online.de> Wed Apr 12 16:29:06 2017 +0200
committer: Akron <nils@diewald-online.de> Wed Apr 12 16:29:06 2017 +0200
tree: a0f2e4efd9e9bbcabbb619a031b11539ed258071
parent: 3a486f8827bff7c9a3f04b328dc87787add9cc21 [diff] [blame]
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 00e9216..1b994c2 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill

@@ -106,12 +106,13 @@
 # - support configuration option
 # - support for temporary extraction
 #
-# 2017/04/10
+# 2017/04/12
 # - support serial processing
 # - support input root
+# - introduced --sequential-extraction flag
 # ----------------------------------------------------------
 
-our $LAST_CHANGE = '2017/04/07';
+our $LAST_CHANGE = '2017/04/12';
 our $LOCAL = $FindBin::Bin;
 our $VERSION_MSG = <<"VERSION";
 Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
@@ -154,6 +155,7 @@
   'primary|p!'  => \(my $primary),
   'pretty|y'    => \(my $pretty),
   'jobs|j=i'    => \(my $jobs),
+  'sequential-extraction|se' => \(my $sequential_extraction),
   'cache-size|cs=s'  => \(my $cache_size),
   'cache-delete|cd!' => \(my $cache_delete),
   'cache-init|ci!'   => \(my $cache_init),
@@ -231,6 +233,11 @@
     $cache_init = $config{'cache-init'} ;
   };
 
+  # Jobs for extraction
+  if (!(defined $sequential_extraction) && defined $config{'sequential-extraction'}) {
+    $sequential_extraction = $config{'sequential-extraction'} ;
+  };
+
   # Meta
   if (!(defined $meta) && defined $config{'meta'}) {
     $meta = $config{'meta'} ;
@@ -279,16 +286,17 @@
 
 
 # Set default token base
-$token_base      //= 'OpenNLP#tokens';
-$cache_file      //= 'korapxml2krill.cache';
-$cache_size      //= '50m';
-$jobs            //= 0;
-$cache_delete    //= 1;
-$cache_init      //= 1;
-$log_level       //= 'ERROR';
-$base_sentences  //= '';
-$base_paragraphs //= '';
-$base_pagebreaks //= '';
+$token_base          //= 'OpenNLP#tokens';
+$cache_file          //= 'korapxml2krill.cache';
+$cache_size          //= '50m';
+$jobs                //= 0;
+$cache_delete        //= 1;
+$cache_init          //= 1;
+$sequential_extraction //= 0;
+$log_level           //= 'ERROR';
+$base_sentences      //= '';
+$base_paragraphs     //= '';
+$base_pagebreaks     //= '';
 
 $base_sentences  = lc $base_sentences;
 $base_paragraphs = lc $base_paragraphs;
@@ -524,13 +532,13 @@
 
 sub get_file_name_from_glob ($) {
   my $glob = shift;
-  $glob =~ s/\.zip$//;          # Remove file extension
   $glob =~ s![\\\/]!-!g;        # Transform paths
   $glob =~ s/[\*\?]//g;         # Remove arbitrary fills
   $glob =~ s/[\{\}\[\]]/-/g;    # Remove class and multiple brackets
   $glob =~ s/\-\-+/-/g;         # Remove sequences of binding characters
   $glob =~ s/^-//;              # Clean beginning
   $glob =~ s/-$//;              # Clean end
+  $glob =~ s/\.zip$//;          # Remove file extension
   return $glob;
 };
 
@@ -660,7 +668,7 @@
 
           print '... ' . (
             $archive->extract_doc(
-              $path, $output, $jobs
+              $path, $output, $sequential_extraction ? 1 : $jobs
             ) ? '' : 'not '
           );
           print "extracted.\n";
@@ -736,7 +744,7 @@
       $extract_dir = catdir($extract_dir, random_string('cccccc'));
 
       # Extract to temporary directory
-      if ($archive->extract_all($extract_dir, $jobs)) {
+      if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
         @input = ($extract_dir);
       }
       else {
@@ -1104,10 +1112,22 @@
 Define the number of concurrent jobs in separate forks
 for archive processing.
 Defaults to C<0> (everything runs in a single process).
+
+If C<sequential-extraction> is not set to true, this will
+also apply to extraction.
+
 Pass -1, and the value will be set automatically to 5
 times the number of available cores.
 This is I<experimental>.
 
+=item B<--sequential-extraction|-se>
+
+Flag to indicate that the C<jobs> value does not apply to extraction.
+Some systems may have problems with extracting multiple archives
+to the same folder at the same time.
+Can be flagged using C<--no-sequential-extraction> as well.
+Defaults to C<false>.
+
 =item B<--meta|-m>
 
 Define the metadata parser to use. Defaults to C<I5>.
@@ -1157,8 +1177,11 @@
 Supported parameters are:
 C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
 C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
-C<output>, C<base-sentences>, C<temp-extract>, C<base-paragraphs>,
-C<base-pagebreaks>, C<skip> (semicolon separated), C<sigle>
+C<output>,
+C<temp-extract>, C<sequential-extraction>,
+C<base-sentences>, C<base-paragraphs>,
+C<base-pagebreaks>,
+C<skip> (semicolon separated), C<sigle>
 (semicolon separated), C<anno> (semicolon separated).
 
 =item B<--temporary-extract|-te>
commit	9ec8887163c4f0ec9ae012948516fdcd2cb6ad3d	[log] [tgz]
author	Akron <nils@diewald-online.de>	Wed Apr 12 16:29:06 2017 +0200
committer	Akron <nils@diewald-online.de>	Wed Apr 12 16:29:06 2017 +0200
tree	a0f2e4efd9e9bbcabbb619a031b11539ed258071
parent	3a486f8827bff7c9a3f04b328dc87787add9cc21 [diff] [blame]