Add wildcard support for inputs
Change-Id: I10bd8c4e8590cbdf61d4c5c77ae547adcaab93ec
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 66af16f..af1da02 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -20,6 +20,7 @@
use v5.10;
use Sys::Info;
use Sys::Info::Constants qw( :device_cpu );
+use File::Glob ':bsd_glob';
# use KorAP::XML::ForkPool;
# TODO: use Parallel::Loops
@@ -93,9 +94,12 @@
# 2017/02/08
# - added support for pagebreak annotations
#
+# 2017/04/06
+# - added support for wildcards in input
+#
# ----------------------------------------------------------
-our $LAST_CHANGE = '2017/02/08';
+our $LAST_CHANGE = '2017/04/06';
our $LOCAL = $FindBin::Bin;
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
@@ -111,7 +115,6 @@
my (@skip, @sigle, @anno, @input);
my $text;
-
# Parse options from the command line
GetOptions(
'input|i=s' => \@input,
@@ -151,7 +154,7 @@
}
);
-$base_sentences = lc $base_sentences;
+$base_sentences = lc $base_sentences;
$base_paragraphs = lc $base_paragraphs;
$base_pagebreaks = lc $base_pagebreaks;
@@ -186,6 +189,7 @@
$log->info("Run using $jobs jobs");
};
+
my %skip;
$skip{lc($_)} = 1 foreach @skip;
@@ -336,6 +340,22 @@
};
+# Glob files
+if (@input) {
+
+  # Expand BSD glob wildcards in every input; a pattern without any
+  # match is kept verbatim instead of being silently dropped
+  my @new_input = map { my @hits = bsd_glob($_); @hits ? @hits : $_ } @input;
+
+  # Rewrite whenever the expansion differs - a wildcard matching
+  # exactly one file leaves the count unchanged but not the path
+  if (join("\0", @new_input) ne join("\0", @input)) {
+    @input = sort { length($a) <=> length($b) } @new_input;
+    print 'Input rewritten to ' . join(',', @input) . "\n";
+  };
+};
+
+
# Process a single file
unless ($cmd) {
my $input = $input[0];
@@ -721,6 +741,13 @@
-i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
+Input may also be defined using BSD glob wildcards.
+
+ -i 'file/news*.zip'
+
+The expanded input list is sorted by path length (shortest first), so the
+shortest path must point to the input containing all primary data and metadata files.
+
(The directory structure follows the base directory format,
that may include a C<.> root folder.
In this case further archives lacking a C<.> root folder
@@ -873,123 +900,61 @@
The base foundry with paragraphs, sentences, and the text element are mandatory for
L<Krill|https://github.com/KorAP/Krill>.
-=over 2
+ Base
+ #Paragraphs
+ #Sentences
-=item B<Base>
+ Connexor
+ #Morpho
+ #Phrase
+ #Sentences
+ #Syntax
-=over 4
+ CoreNLP
+ #Constituency
+ #Morpho
+ #NamedEntities
+ #Sentences
-=item #Paragraphs
+ DeReKo
+ #Structure
-=item #Sentences
+ DRuKoLa
+ #Morpho
-=back
+ Glemm
+ #Morpho
-=item B<Connexor>
+ Malt
+ #Dependency
-=over 4
+ MarMoT
+ #Morpho
-=item #Morpho
+ Mate
+ #Dependency
+ #Morpho
-=item #Phrase
+ MDParser
+ #Dependency
-=item #Sentences
+ OpenNLP
+ #Morpho
+ #Sentences
-=item #Syntax
+ Sgbr
+ #Lemma
+ #Morpho
-=back
+ TreeTagger
+ #Morpho
+ #Sentences
-=item B<CoreNLP>
+ XIP
+ #Constituency
+ #Morpho
+ #Sentences
-=over 4
-
-=item #Constituency
-
-=item #Morpho
-
-=item #NamedEntities
-
-=item #Sentences
-
-=back
-
-=item B<DeReKo>
-
-=over 4
-
-=item #Structure
-
-=back
-
-=item B<Glemm>
-
-=over 4
-
-=item #Morpho
-
-=back
-
-=item B<Mate>
-
-=over 4
-
-=item #Dependency
-
-=item #Morpho
-
-=back
-
-=item B<OpenNLP>
-
-=over 4
-
-=item #Morpho
-
-=item #Sentences
-
-=back
-
-=item B<Sgbr>
-
-=over 4
-
-=item #Lemma
-
-=item #Morpho
-
-=back
-
-=item B<DRuKoLa>
-
-=over 4
-
-=item #Morpho
-
-=back
-
-=item B<TreeTagger>
-
-=over 4
-
-=item #Morpho
-
-=item #Sentences
-
-=back
-
-=item B<XIP>
-
-=over 4
-
-=item #Constituency
-
-=item #Morpho
-
-=item #Sentences
-
-=back
-
-=back
More importers are in preparation.
New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.