Add wildcard support for inputs

Change-Id: I10bd8c4e8590cbdf61d4c5c77ae547adcaab93ec
diff --git a/Readme.pod b/Readme.pod
index 5fab72d..e1f1d8f 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -74,6 +74,13 @@
 
   -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
 
+Input may also be defined using BSD glob wildcards.
+
+  -i 'file/news*.zip'
+
+The expanded list of inputs will be sorted by path length, so the shortest
+path needs to contain all primary data files and all meta data files.
+
 (The directory structure follows the base directory format,
 that may include a C<.> root folder.
 In this case further archives lacking a C<.> root folder
@@ -122,6 +129,14 @@
  Defaults to unset.
 
 
+=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
+
+Define the layer for base pagebreaks.
+Currently C<DeReKo#Structure> is the only layer supported.
+
+ Defaults to unset.
+
+
 =item B<--skip|-s> <foundry>[#<layer>]
 
 Skip specific annotations by specifying the foundry
@@ -147,6 +162,8 @@
 Define the number of concurrent jobs in seperated forks
 for archive processing.
 Defaults to C<0> (everything runs in a single process).
+Pass -1, and the value will be set automatically to 5
+times the number of available cores.
 This is I<experimental>.
 
 =item B<--meta|-m>
@@ -216,115 +233,61 @@
 The base foundry with paragraphs, sentences, and the text element are mandatory for
 L<Krill|https://github.com/KorAP/Krill>.
 
-=over 2
+  Base
+    #Paragraphs
+    #Sentences
 
-=item B<Base>
+  Connexor
+    #Morpho
+    #Phrase
+    #Sentences
+    #Syntax
 
-=over 4
+  CoreNLP
+    #Constituency
+    #Morpho
+    #NamedEntities
+    #Sentences
 
-=item #Paragraphs
+  DeReKo
+    #Structure
 
-=item #Sentences
+  DRuKoLa
+    #Morpho
 
-=back
+  Glemm
+    #Morpho
 
-=item B<Connexor>
+  Malt
+    #Dependency
 
-=over 4
+  MarMoT
+    #Morpho
 
-=item #Morpho
+  Mate
+    #Dependency
+    #Morpho
 
-=item #Phrase
+  MDParser
+    #Dependency
 
-=item #Sentences
+  OpenNLP
+    #Morpho
+    #Sentences
 
-=item #Syntax
+  Sgbr
+    #Lemma
+    #Morpho
 
-=back
+  TreeTagger
+    #Morpho
+    #Sentences
 
-=item B<CoreNLP>
+  XIP
+    #Constituency
+    #Morpho
+    #Sentences
 
-=over 4
-
-=item #Constituency
-
-=item #Morpho
-
-=item #NamedEntities
-
-=item #Sentences
-
-=back
-
-=item B<DeReKo>
-
-=over 4
-
-=item #Structure
-
-=back
-
-=item B<Glemm>
-
-=over 4
-
-=item #Morpho
-
-=back
-
-=item B<Mate>
-
-=over 4
-
-=item #Dependency
-
-=item #Morpho
-
-=back
-
-=item B<OpenNLP>
-
-=over 4
-
-=item #Morpho
-
-=item #Sentences
-
-=back
-
-=item B<Sgbr>
-
-=over 4
-
-=item #Lemma
-
-=item #Morpho
-
-=back
-
-=item B<TreeTagger>
-
-=over 4
-
-=item #Morpho
-
-=item #Sentences
-
-=back
-
-=item B<XIP>
-
-=over 4
-
-=item #Constituency
-
-=item #Morpho
-
-=item #Sentences
-
-=back
-
-=back
 
 More importers are in preparation.
 New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
@@ -337,7 +300,7 @@
 
 =head1 COPYRIGHT AND LICENSE
 
-Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
+Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
 
 Author: L<Nils Diewald|http://nils-diewald.de/>
 Contributor: Eliza Margaretha
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 08bd56c..1435796 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -16,7 +16,7 @@
 use Data::Dumper;
 use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/;
 
-our $VERSION = '0.25';
+our $VERSION = '0.26';
 
 has 'path';
 has [qw/text_sigle doc_sigle corpus_sigle/];
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 66af16f..af1da02 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -20,6 +20,7 @@
 use v5.10;
 use Sys::Info;
 use Sys::Info::Constants qw( :device_cpu );
+use File::Glob ':bsd_glob';
 
 # use KorAP::XML::ForkPool;
 # TODO: use Parallel::Loops
@@ -93,9 +94,12 @@
 # 2017/02/08
 # - added support for pagebreak annotations
 #
+# 2017/04/06
+# - added support for wildcards in input
+#
 # ----------------------------------------------------------
 
-our $LAST_CHANGE = '2017/02/08';
+our $LAST_CHANGE = '2017/04/06';
 our $LOCAL = $FindBin::Bin;
 our $VERSION_MSG = <<"VERSION";
 Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
@@ -111,7 +115,6 @@
 my (@skip, @sigle, @anno, @input);
 my $text;
 
-
 # Parse options from the command line
 GetOptions(
   'input|i=s'   => \@input,
@@ -151,7 +154,7 @@
   }
 );
 
-$base_sentences = lc $base_sentences;
+$base_sentences  = lc $base_sentences;
 $base_paragraphs = lc $base_paragraphs;
 $base_pagebreaks = lc $base_pagebreaks;
 
@@ -186,6 +189,7 @@
   $log->info("Run using $jobs jobs");
 };
 
+
 my %skip;
 $skip{lc($_)} = 1 foreach @skip;
 
@@ -336,6 +340,22 @@
 };
 
 
+# Expand BSD glob wildcards in the input paths
+if (@input) {
+
+  # Patterns without wildcards are passed through unchanged
+  my @new_input = map { bsd_glob($_) } @input;
+
+  # Rewrite whenever the expansion differs from the given input, even
+  # if the number of entries is equal (a wildcard matching one file).
+  # The shortest path comes first, as it has to hold the primary data.
+  if (@new_input && join("\0", @new_input) ne join("\0", @input)) {
+    @input = sort { length($a) <=> length($b) } @new_input;
+    print 'Input rewritten to ' . join(',', @input) . "\n";
+  };
+};
+
+
 # Process a single file
 unless ($cmd) {
   my $input = $input[0];
@@ -721,6 +741,13 @@
 
   -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
 
+Input may also be defined using BSD glob wildcards.
+
+  -i 'file/news*.zip'
+
+The expanded list of inputs will be sorted by path length, so the shortest
+path needs to contain all primary data files and all meta data files.
+
 (The directory structure follows the base directory format,
 that may include a C<.> root folder.
 In this case further archives lacking a C<.> root folder
@@ -873,123 +900,61 @@
 The base foundry with paragraphs, sentences, and the text element are mandatory for
 L<Krill|https://github.com/KorAP/Krill>.
 
-=over 2
+  Base
+    #Paragraphs
+    #Sentences
 
-=item B<Base>
+  Connexor
+    #Morpho
+    #Phrase
+    #Sentences
+    #Syntax
 
-=over 4
+  CoreNLP
+    #Constituency
+    #Morpho
+    #NamedEntities
+    #Sentences
 
-=item #Paragraphs
+  DeReKo
+    #Structure
 
-=item #Sentences
+  DRuKoLa
+    #Morpho
 
-=back
+  Glemm
+    #Morpho
 
-=item B<Connexor>
+  Malt
+    #Dependency
 
-=over 4
+  MarMoT
+    #Morpho
 
-=item #Morpho
+  Mate
+    #Dependency
+    #Morpho
 
-=item #Phrase
+  MDParser
+    #Dependency
 
-=item #Sentences
+  OpenNLP
+    #Morpho
+    #Sentences
 
-=item #Syntax
+  Sgbr
+    #Lemma
+    #Morpho
 
-=back
+  TreeTagger
+    #Morpho
+    #Sentences
 
-=item B<CoreNLP>
+  XIP
+    #Constituency
+    #Morpho
+    #Sentences
 
-=over 4
-
-=item #Constituency
-
-=item #Morpho
-
-=item #NamedEntities
-
-=item #Sentences
-
-=back
-
-=item B<DeReKo>
-
-=over 4
-
-=item #Structure
-
-=back
-
-=item B<Glemm>
-
-=over 4
-
-=item #Morpho
-
-=back
-
-=item B<Mate>
-
-=over 4
-
-=item #Dependency
-
-=item #Morpho
-
-=back
-
-=item B<OpenNLP>
-
-=over 4
-
-=item #Morpho
-
-=item #Sentences
-
-=back
-
-=item B<Sgbr>
-
-=over 4
-
-=item #Lemma
-
-=item #Morpho
-
-=back
-
-=item B<DRuKoLa>
-
-=over 4
-
-=item #Morpho
-
-=back
-
-=item B<TreeTagger>
-
-=over 4
-
-=item #Morpho
-
-=item #Sentences
-
-=back
-
-=item B<XIP>
-
-=over 4
-
-=item #Constituency
-
-=item #Morpho
-
-=item #Sentences
-
-=back
-
-=back
 
 More importers are in preparation.
 New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
diff --git a/t/script/archive.t b/t/script/archive.t
index d3853f4..17ab672 100644
--- a/t/script/archive.t
+++ b/t/script/archive.t
@@ -5,6 +5,7 @@
 use File::Spec::Functions qw/catdir catfile/;
 use File::Temp qw/tempdir/;
 use Mojo::File;
+use Mojo::Util qw/quote/;
 use Mojo::JSON qw/decode_json/;
 use IO::Uncompress::Gunzip;
 use Test::More;
@@ -47,7 +48,7 @@
   ' ',
   'perl', $script,
   'archive',
-  '--input' => $input,
+  '--input' => '' . $input,
   '--output' => $output,
   '-t' => 'Base#tokens_aggr',
   '-m' => 'Sgbr'
@@ -167,5 +168,26 @@
 
 unlink($output);
 
+
+$input_quotes = "'".catfile($f, '..', 'corpus', 'archives', 'wpd15*.zip') . "'";
+
+$call = join(
+  ' ',
+  'perl', $script,
+  'archive',
+  '--input' => $input_quotes,
+  '--output' => $output,
+  '-t' => 'Base#tokens_aggr'
+);
+
+# Test wildcard expansion of the (shell-quoted) input parameter
+stdout_like(
+  sub {
+    system($call);
+  },
+  qr!Input rewritten to .+?wpd15-single\.zip,.+?wpd15-single\.malt\.zip,.+?wpd15-single\.corenlp\.zip,.+?wpd15-single\.opennlp\.zip,.+?wpd15-single\.mdparser\.zip,.+?wpd15-single\.tree_tagger\.zip!is,
+  $call
+);
+
 done_testing;
 __END__