Fixed tempdir issue in script
Change-Id: I8421bd0c83629350ef4d6efce8fbb9fce22fab4e
diff --git a/Changes b/Changes
index 8a28f26..e0477b1 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,7 @@
+0.09 2016-02-15
+ - Fixed temporary directory handling in scripts.
+ - Improved skipping for archive handling in scripts.
+
0.08 2016-02-14
- Added support for archive streaming.
- Improved scripts.
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index decae35..c11588c 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -17,7 +17,7 @@
# Due to the kind of processing, processed metadata may be stored in
# a multiprocess cache instead.
-our $VERSION = '0.08';
+our $VERSION = '0.09';
our @ATTR = qw/text_sigle
doc_sigle
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 9e2d1e8..6443c8a 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -40,9 +40,9 @@
--output|-o <filename> Document name for output (optional),
Writes to <STDOUT> by default
--overwrite|-w Overwrite files that already exist
- --token|-t <foundry>[#<layer>] Define the default tokenization by specifying
+ --token|-t <foundry>[#<file>] Define the default tokenization by specifying
the name of the foundry and optionally the name
- of the layer. Defaults to OpenNLP#tokens.
+ of the layer-file. Defaults to OpenNLP#tokens.
--skip|-s <foundry>[#<layer>] Skip specific foundries by specifying the name
or specific layers by defining the name
with a # in front of the foundry,
@@ -61,7 +61,7 @@
--help|-h Print this document (optional)
--version|-v Print version information
-diewald@ids-mannheim.de, 2016/02/14
+diewald@ids-mannheim.de, 2016/02/15
EOHELP
exit(defined $_[0] ? $_[0] : 0);
diff --git a/script/korapxml2krill_dir b/script/korapxml2krill_dir
index 5b09566..6293a7d 100644
--- a/script/korapxml2krill_dir
+++ b/script/korapxml2krill_dir
@@ -24,6 +24,11 @@
#
# 2016/02/14
# - Added version information
+# - Added support for archive files
+#
+# 2016/02/15
+# - Fixed temporary directory bug
+# - Improved skipping before unzipping
sub printversion {
print "Version " . $KorAP::XML::Krill::VERSION . "\n\n";
@@ -63,7 +68,7 @@
--help|-h Print this document (optional)
--version|-v Print version information
-diewald@ids-mannheim.de, 2016/02/14
+diewald@ids-mannheim.de, 2016/02/15
EOHELP
@@ -90,14 +95,18 @@
printhelp(1) if !$input || !$output;
+sub get_file_name { # Flatten a path into a file name: drop $input, turn '/' into '-'
+ my $file = shift; # Path to flatten; shift yields a copy, the caller's value is untouched
+ $file =~ s/\/?\Q$input\E//; # \Q..\E: $input is a path, match it literally and not as a regex
+ $file =~ tr/\//-/; # Remaining directory separators become dashes
+ $file =~ s{^-+}{}; # Drop dashes produced by leading slashes
+ return $file;
+};
# write file
sub write_file {
my $anno = shift;
- my $file = $anno;
- $file =~ s/^?\/?$input//;
- $file =~ tr/\//-/;
- $file =~ s{^-+}{};
+ my $file = get_file_name($anno);
my $call = 'perl ' . $local . '/korapxml2krill -i ' . $anno . ' -o ' . $output . '/' . $file . '.json';
$call .= '.gz -z' if $gzip;
@@ -154,14 +163,26 @@
# Split path information
my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
+ unless ($overwrite) {
+
+ my $filename = catfile(
+ $output,
+ get_file_name(catdir($doc, $text)) . '.json' . ($gzip ? '.gz' : '')
+ );
+ if (-e $filename) {
+ print "Skip $filename\n";
+ next;
+ };
+ };
+
# Create temporary file
- my $temp = tempdir(CLEANUP => 1);
+ my $temp = File::Temp->newdir;
# Extract from archive
if ($archive->extract($dirs[$i], $temp)) {
# Create corpus directory
- $input = catdir($temp, $corpus);
+ $input = catdir("$temp", $corpus);
# Temporary directory
my $dir = catdir($input, $doc, $text);
@@ -173,7 +194,7 @@
print "Unable to extract " . $dirs[$i] . "\n";
};
- $temp = 0;
+ $temp = undef;
};
}