Fix temporary-extract configuration parameter
Change-Id: Iba7cd0e07bc1a262a8d78ffc37d98ce5299fb2b6
diff --git a/Changes b/Changes
index c2ab339..0ab3365 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,7 @@
+0.50 2023-02-13
+ - Fix 'temporary-extract' handling from
+ configuration file.
+
0.49 2023-02-12
- Support for UDPipe POS, lemma and dependency
annotations (kupietz).
diff --git a/Readme.pod b/Readme.pod
index 2b4d1c6..bae4e4a 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -207,7 +207,7 @@
for archive processing.
Defaults to C<0> (everything runs in a single process).
-If C<sequential-extraction> is not set to false, this will
+If C<sequential-extraction> is not set to true, this will
also apply to extraction.
Pass -1, and the value will be set automatically to 5
@@ -299,7 +299,8 @@
=item B<--temporary-extract|-te>
-Only valid for the C<archive> command.
+Only valid for the C<archive> and C<serial>
+commands.
This will first extract all files into a
directory and then will archive.
diff --git a/lib/KorAP/XML/ForkPool.pm b/lib/KorAP/XML/ForkPool.pm
index f9b11ae..b3acef8 100644
--- a/lib/KorAP/XML/ForkPool.pm
+++ b/lib/KorAP/XML/ForkPool.pm
@@ -18,7 +18,9 @@
}, $class;
};
-sub new_pool {
+
+# Create new fork pool
+sub _new_pool {
my $self = shift;
# Zero means: everything runs in the parent process
@@ -39,11 +41,13 @@
return $pool;
};
+
+# Iterate over a directory and process all documents
sub process_directory {
my $self = shift;
my $input = shift;
- my $pool = $self->new_pool;
+ my $pool = $self->_new_pool;
print "Reading data ...\n";
@@ -91,6 +95,7 @@
};
+# Take an archive, uncompress and iterate over all texts
sub process_archive {
my $self = shift;
my $archive = shift;
@@ -110,7 +115,7 @@
$self->{count} = scalar @dirs;
# Create new pool
- my $pool = $self->new_pool;
+ my $pool = $self->_new_pool;
ARCHIVE_LOOP:
for (my $i = 0; $i < $count; $i++) {
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index c5f68a1..95218e0 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -16,7 +16,7 @@
our @EXPORT_OK = qw(get_file_name get_file_name_from_glob);
-our $VERSION = '0.49';
+our $VERSION = '0.50';
has 'path';
has [qw/text_sigle doc_sigle corpus_sigle/];
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 75dd21d..0856c57 100755
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -167,9 +167,13 @@
#
# 2023/02/05
# - Support for UD
+#
+# 2023/02/13
+# - Fix temporary-extract handling from configuration file.
+#
# ----------------------------------------------------------
-our $LAST_CHANGE = '2023/02/05';
+our $LAST_CHANGE = '2023/02/13';
our $LOCAL = $FindBin::Bin;
our $KORAL_VERSION = 0.03;
our $VERSION_MSG = <<"VERSION";
@@ -201,7 +205,7 @@
'base-paragraphs|bp=s' => \($cfg{base_paragraphs}),
'base-pagebreaks|bpb=s' => \($cfg{base_pagebreaks}),
'gzip|z' => \($cfg{gzip}),
- 'temporary-extract|te=s' => \($cfg{extract_dir}),
+ 'temporary-extract|te=s' => \($cfg{temporary_extract}),
'skip|s=s' => \@skip,
'sigle|sg=s' => \@sigle,
'cache|c=s' => \($cfg{cache_file}),
@@ -290,7 +294,7 @@
my $input_base = $cfg{input_base};
my $gzip = $cfg{gzip};
my $to_tar = $cfg{to_tar};
-my $extract_dir = $cfg{extract_dir};
+my $extract_dir = $cfg{temporary_extract};
my $token_base = $cfg{token} // 'OpenNLP#tokens';
my $cache_file = $cfg{cache} // 'korapxml2krill.cache';
my $jobs = $cfg{jobs} // 0;
@@ -392,6 +396,7 @@
# Create archive command
my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $_, '-o', $new_out);
print "Start serial processing of $_ to $new_out\n";
+ print 'Command: ', join(' ', @archive_cmd), "\n";
# Start archiving
system @archive_cmd;
@@ -717,6 +722,7 @@
# Extract to temporary directory
if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
+ print "Extract sequentially to $extract_dir\n";
@input = ($extract_dir);
}
else {
@@ -1269,7 +1275,7 @@
for archive processing.
Defaults to C<0> (everything runs in a single process).
-If C<sequential-extraction> is not set to false, this will
+If C<sequential-extraction> is not set to true, this will
also apply to extraction.
Pass -1, and the value will be set automatically to 5
@@ -1361,7 +1367,8 @@
=item B<--temporary-extract|-te>
-Only valid for the C<archive> command.
+Only valid for the C<archive> and C<serial>
+commands.
This will first extract all files into a
directory and then will archive.
diff --git a/t/script/config.t b/t/script/config.t
index 5ec57cb..bd6e9fa 100644
--- a/t/script/config.t
+++ b/t/script/config.t
@@ -24,6 +24,9 @@
my $input_base = catdir($f, '..', 'corpus', 'archives');
+# Temporary extract
+my $temp_out = File::Temp->newdir(CLEANUP => 0);
+
print $fh <<"CFG";
overwrite 0
token OpenNLP#tokens
@@ -34,6 +37,8 @@
meta I5
gzip 1
log DEBUG
+temporary-extract $temp_out
+sequential-extraction 1
input-base $input_base
CFG
@@ -69,6 +74,8 @@
# Processed using gzip
like($stdout, qr!Processed .+?WPD15-A00-00081\.json\.gz!, 'Gzip');
+like($stdout, qr!Extract sequentially to!);
+
# Check log level
like($stdout, qr!Unable to parse KorAP::XML::Annotation::Glemm::Morpho!, 'Check log level');