Fix sigle-pattern option to take effect on the whole sigle
Resolves #4
Change-Id: I3514dead562acfccd6ee21c7d1f9e36af5e2fbba
diff --git a/Changes b/Changes
index 7116bf3..77350ea 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,5 @@
- korapxml2conllu:
+ - the sigle-pattern option now affects the entire sigle
- handle docid attributes correctly if they are in a different line than their parent element <layer>
0.5.0 2022-09-29
diff --git a/script/korapxml2conllu b/script/korapxml2conllu
index 9cb25f8..b771474 100755
--- a/script/korapxml2conllu
+++ b/script/korapxml2conllu
@@ -126,16 +126,16 @@
$zip_content_pattern = "[sdh][tae]*";
}
if (!$baseOnly) {
- $morphoOrTokenCommand = "$UNZIP -c $morpho_zip '*/${sigle_pattern}*/*/*/morpho.xml' $zipsiglepattern |";
+ $morphoOrTokenCommand = "$UNZIP -c $morpho_zip '*${sigle_pattern}*/morpho.xml' $zipsiglepattern |";
if ($extract_attributes_regex || !$s_bounds_from_morpho) {
- $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*/${sigle_pattern}*/*/$zip_content_pattern.xml' $zipsiglepattern |";
+ $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*${sigle_pattern}*/$zip_content_pattern.xml' $zipsiglepattern |";
} else {
$log->debug("Not reading structure information.");
- $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*/${sigle_pattern}*/*/data.xml' $zipsiglepattern |";
+ $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*${sigle_pattern}*/data.xml' $zipsiglepattern |";
}
} else {
$foundry = "base";
- $morphoOrTokenCommand = "$UNZIP -l $morpho_zip '*/${sigle_pattern}*/*/*/morpho.xml' $zipsiglepattern";
+ $morphoOrTokenCommand = "$UNZIP -l $morpho_zip '*${sigle_pattern}*/morpho.xml' $zipsiglepattern";
if (`$morphoOrTokenCommand` !~ /morpho\.xml/) {
$morphoOrTokenCommand =~ s/morpho\.xml/tokens.xml/;
} else {
@@ -143,9 +143,12 @@
}
$morphoOrTokenCommand =~ s/-l/-c/;
$morphoOrTokenCommand .= ' |';
- $plaintextAndStructureCommand = "$UNZIP -c $data_zip " . "'*/${sigle_pattern}*/*/$zip_content_pattern.xml' " . "$zipsiglepattern |";
+ $plaintextAndStructureCommand = "$UNZIP -c $data_zip " . "'*${sigle_pattern}*/$zip_content_pattern.xml' " . "$zipsiglepattern |";
}
+ $log->debug("command to extract annotation and/or tokenization: $morphoOrTokenCommand");
+ $log->debug("command to extract plain text and structure: $plaintextAndStructureCommand");
+
open (MORPHO_OR_TOKENPIPE, $morphoOrTokenCommand) or die "cannot unzip $morpho_zip";
open (PLAINTEXTPIPE, $plaintextAndStructureCommand) or die "cannot unzip $data_zip";
print "$COMMENT_START foundry = $foundry$COMMENT_END\n" if ($comments);
diff --git a/t/test.t b/t/test.t
index ac3550c..112846d 100644
--- a/t/test.t
+++ b/t/test.t
@@ -1,6 +1,6 @@
use strict;
use warnings;
-use Test::More tests => 53;
+use Test::More tests => 59;
use Test::Script;
use Test::TempDir::Tiny;
use File::Copy;
@@ -203,4 +203,13 @@
script_runs([ 'script/korapxml2conllu', "t/data/nkjp.zip" ], "Runs korapxml2conllu on nkjp test data");
script_stderr_unlike("Use of uninitialized value", "Handles lonely docid parameters (line separated from layer elements)");
script_stdout_like("\n9\twesołości\twesołość\tsubst\tsubst\tsg:gen:f", "Correctly converts nkjp annotations");
+
+script_runs([ 'script/korapxml2conllu', "--sigle-pattern", "KOT", "t/data/nkjp.zip" ], "Runs korapxml2conllu with --sigle-pattern option on combined base/morpho files");
+script_stdout_like("NKJP/NKJP/KOT/nkjp/morpho.xml", "--sigle-pattern to specify a doc sigle pattern extracts the right texts");
+script_stdout_unlike("NKJP/NKJP/KolakowskiOco/nkjp/morpho.xml", "--sigle-pattern to specify a doc sigle pattern does not extract the wrong texts");
+
+script_runs([ 'script/korapxml2conllu', "--sigle-pattern", "13072", "t/data/wdf19.tree_tagger.zip" ], "Runs korapxml2conllu with --sigle-pattern option on separate base/morpho files");
+script_stdout_like("WDF19/A0000/13072/tree_tagger/morpho.xml", "--sigle-pattern to specify a text sigle pattern extracts the right texts");
+script_stdout_unlike("WDF19/A0000/14247/tree_tagger/morpho.xml", "--sigle-pattern to specify a text sigle pattern does not extract the wrong texts");
+
done_testing;