Add wildcard support for inputs
Change-Id: I10bd8c4e8590cbdf61d4c5c77ae547adcaab93ec
diff --git a/Readme.pod b/Readme.pod
index 5fab72d..e1f1d8f 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -74,6 +74,13 @@
-i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
+Input may also be defined using BSD glob wildcards.
+
+ -i 'file/news*.zip'
+
+The expanded input list is sorted by path length (shortest first), so the input
+with the shortest path needs to contain all primary data files and all meta data files.
+
(The directory structure follows the base directory format,
that may include a C<.> root folder.
In this case further archives lacking a C<.> root folder
@@ -122,6 +129,14 @@
Defaults to unset.
+=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
+
+Define the layer for base pagebreaks.
+Currently C<DeReKo#Structure> is the only layer supported.
+
+ Defaults to unset.
+
+
=item B<--skip|-s> <foundry>[#<layer>]
Skip specific annotations by specifying the foundry
@@ -147,6 +162,8 @@
Define the number of concurrent jobs in seperated forks
for archive processing.
Defaults to C<0> (everything runs in a single process).
+Pass -1, and the value will be set automatically to 5
+times the number of available cores.
This is I<experimental>.
=item B<--meta|-m>
@@ -216,115 +233,61 @@
The base foundry with paragraphs, sentences, and the text element are mandatory for
L<Krill|https://github.com/KorAP/Krill>.
-=over 2
+ Base
+ #Paragraphs
+ #Sentences
-=item B<Base>
+ Connexor
+ #Morpho
+ #Phrase
+ #Sentences
+ #Syntax
-=over 4
+ CoreNLP
+ #Constituency
+ #Morpho
+ #NamedEntities
+ #Sentences
-=item #Paragraphs
+ DeReKo
+ #Structure
-=item #Sentences
+ DRuKoLa
+ #Morpho
-=back
+ Glemm
+ #Morpho
-=item B<Connexor>
+ Malt
+ #Dependency
-=over 4
+ MarMoT
+ #Morpho
-=item #Morpho
+ Mate
+ #Dependency
+ #Morpho
-=item #Phrase
+ MDParser
+ #Dependency
-=item #Sentences
+ OpenNLP
+ #Morpho
+ #Sentences
-=item #Syntax
+ Sgbr
+ #Lemma
+ #Morpho
-=back
+ TreeTagger
+ #Morpho
+ #Sentences
-=item B<CoreNLP>
+ XIP
+ #Constituency
+ #Morpho
+ #Sentences
-=over 4
-
-=item #Constituency
-
-=item #Morpho
-
-=item #NamedEntities
-
-=item #Sentences
-
-=back
-
-=item B<DeReKo>
-
-=over 4
-
-=item #Structure
-
-=back
-
-=item B<Glemm>
-
-=over 4
-
-=item #Morpho
-
-=back
-
-=item B<Mate>
-
-=over 4
-
-=item #Dependency
-
-=item #Morpho
-
-=back
-
-=item B<OpenNLP>
-
-=over 4
-
-=item #Morpho
-
-=item #Sentences
-
-=back
-
-=item B<Sgbr>
-
-=over 4
-
-=item #Lemma
-
-=item #Morpho
-
-=back
-
-=item B<TreeTagger>
-
-=over 4
-
-=item #Morpho
-
-=item #Sentences
-
-=back
-
-=item B<XIP>
-
-=over 4
-
-=item #Constituency
-
-=item #Morpho
-
-=item #Sentences
-
-=back
-
-=back
More importers are in preparation.
New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
@@ -337,7 +300,7 @@
=head1 COPYRIGHT AND LICENSE
-Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
+Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Author: L<Nils Diewald|http://nils-diewald.de/>
Contributor: Eliza Margaretha
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 08bd56c..1435796 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -16,7 +16,7 @@
use Data::Dumper;
use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/;
-our $VERSION = '0.25';
+our $VERSION = '0.26';
has 'path';
has [qw/text_sigle doc_sigle corpus_sigle/];
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 66af16f..af1da02 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -20,6 +20,7 @@
use v5.10;
use Sys::Info;
use Sys::Info::Constants qw( :device_cpu );
+use File::Glob ':bsd_glob';
# use KorAP::XML::ForkPool;
# TODO: use Parallel::Loops
@@ -93,9 +94,12 @@
# 2017/02/08
# - added support for pagebreak annotations
#
+# 2017/04/06
+# - added support for wildcards in input
+#
# ----------------------------------------------------------
-our $LAST_CHANGE = '2017/02/08';
+our $LAST_CHANGE = '2017/04/06';
our $LOCAL = $FindBin::Bin;
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
@@ -111,7 +115,6 @@
my (@skip, @sigle, @anno, @input);
my $text;
-
# Parse options from the command line
GetOptions(
'input|i=s' => \@input,
@@ -151,7 +154,7 @@
}
);
-$base_sentences = lc $base_sentences;
+$base_sentences = lc $base_sentences;
$base_paragraphs = lc $base_paragraphs;
$base_pagebreaks = lc $base_pagebreaks;
@@ -186,6 +189,7 @@
$log->info("Run using $jobs jobs");
};
+
my %skip;
$skip{lc($_)} = 1 foreach @skip;
@@ -336,6 +340,22 @@
};
+# Expand BSD glob wildcards in the input paths
+if (@input) {
+
+  # Patterns without a match are kept literally instead of being dropped
+  my @new_input = map { my @hit = bsd_glob($_); @hit ? @hit : $_ } @input;
+
+  # Rewrite on any difference - comparing the counts alone would miss
+  # a wildcard that matches exactly one file
+  if (join("\0", @new_input) ne join("\0", @input)) {
+    # Shortest path first (it has to hold the primary and meta data)
+    @input = sort { length($a) <=> length($b) } @new_input;
+    print 'Input rewritten to ' . join(',', @input) . "\n";
+  };
+};
+
+
# Process a single file
unless ($cmd) {
my $input = $input[0];
@@ -721,6 +741,13 @@
-i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
+Input may also be defined using BSD glob wildcards.
+
+ -i 'file/news*.zip'
+
+The expanded input list is sorted by path length (shortest first), so the input
+with the shortest path needs to contain all primary data files and all meta data files.
+
(The directory structure follows the base directory format,
that may include a C<.> root folder.
In this case further archives lacking a C<.> root folder
@@ -873,123 +900,61 @@
The base foundry with paragraphs, sentences, and the text element are mandatory for
L<Krill|https://github.com/KorAP/Krill>.
-=over 2
+ Base
+ #Paragraphs
+ #Sentences
-=item B<Base>
+ Connexor
+ #Morpho
+ #Phrase
+ #Sentences
+ #Syntax
-=over 4
+ CoreNLP
+ #Constituency
+ #Morpho
+ #NamedEntities
+ #Sentences
-=item #Paragraphs
+ DeReKo
+ #Structure
-=item #Sentences
+ DRuKoLa
+ #Morpho
-=back
+ Glemm
+ #Morpho
-=item B<Connexor>
+ Malt
+ #Dependency
-=over 4
+ MarMoT
+ #Morpho
-=item #Morpho
+ Mate
+ #Dependency
+ #Morpho
-=item #Phrase
+ MDParser
+ #Dependency
-=item #Sentences
+ OpenNLP
+ #Morpho
+ #Sentences
-=item #Syntax
+ Sgbr
+ #Lemma
+ #Morpho
-=back
+ TreeTagger
+ #Morpho
+ #Sentences
-=item B<CoreNLP>
+ XIP
+ #Constituency
+ #Morpho
+ #Sentences
-=over 4
-
-=item #Constituency
-
-=item #Morpho
-
-=item #NamedEntities
-
-=item #Sentences
-
-=back
-
-=item B<DeReKo>
-
-=over 4
-
-=item #Structure
-
-=back
-
-=item B<Glemm>
-
-=over 4
-
-=item #Morpho
-
-=back
-
-=item B<Mate>
-
-=over 4
-
-=item #Dependency
-
-=item #Morpho
-
-=back
-
-=item B<OpenNLP>
-
-=over 4
-
-=item #Morpho
-
-=item #Sentences
-
-=back
-
-=item B<Sgbr>
-
-=over 4
-
-=item #Lemma
-
-=item #Morpho
-
-=back
-
-=item B<DRuKoLa>
-
-=over 4
-
-=item #Morpho
-
-=back
-
-=item B<TreeTagger>
-
-=over 4
-
-=item #Morpho
-
-=item #Sentences
-
-=back
-
-=item B<XIP>
-
-=over 4
-
-=item #Constituency
-
-=item #Morpho
-
-=item #Sentences
-
-=back
-
-=back
More importers are in preparation.
New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
diff --git a/t/script/archive.t b/t/script/archive.t
index d3853f4..17ab672 100644
--- a/t/script/archive.t
+++ b/t/script/archive.t
@@ -5,6 +5,7 @@
use File::Spec::Functions qw/catdir catfile/;
use File::Temp qw/tempdir/;
use Mojo::File;
+use Mojo::Util qw/quote/;
use Mojo::JSON qw/decode_json/;
use IO::Uncompress::Gunzip;
use Test::More;
@@ -47,7 +48,7 @@
' ',
'perl', $script,
'archive',
- '--input' => $input,
+ '--input' => '' . $input,
'--output' => $output,
'-t' => 'Base#tokens_aggr',
'-m' => 'Sgbr'
@@ -167,5 +168,26 @@
unlink($output);
+
+my $input_quotes = "'".catfile($f, '..', 'corpus', 'archives', 'wpd15*.zip') . "'";
+
+$call = join(
+  ' ',
+  'perl', $script,
+  'archive',
+  '--input' => $input_quotes,
+  '--output' => $output,
+  '-t' => 'Base#tokens_aggr'
+);
+
+# Test wildcard input: the quoted glob is expanded by the script itself
+stdout_like(
+  sub {
+    system($call);
+  },
+  qr!Input rewritten to .+?wpd15-single\.zip,.+?wpd15-single\.malt\.zip,.+?wpd15-single\.corenlp\.zip,.+?wpd15-single\.opennlp\.zip,.+?wpd15-single\.mdparser\.zip,.+?wpd15-single\.tree_tagger\.zip!is,
+  $call
+);
+
done_testing;
__END__