Fix sigle-pattern option so that it applies to the whole sigle
Resolves #4
Change-Id: I3514dead562acfccd6ee21c7d1f9e36af5e2fbba
diff --git a/script/korapxml2conllu b/script/korapxml2conllu
index 9cb25f8..b771474 100755
--- a/script/korapxml2conllu
+++ b/script/korapxml2conllu
@@ -126,16 +126,16 @@
$zip_content_pattern = "[sdh][tae]*";
}
if (!$baseOnly) {
- $morphoOrTokenCommand = "$UNZIP -c $morpho_zip '*/${sigle_pattern}*/*/*/morpho.xml' $zipsiglepattern |";
+ $morphoOrTokenCommand = "$UNZIP -c $morpho_zip '*${sigle_pattern}*/morpho.xml' $zipsiglepattern |";
if ($extract_attributes_regex || !$s_bounds_from_morpho) {
- $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*/${sigle_pattern}*/*/$zip_content_pattern.xml' $zipsiglepattern |";
+ $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*${sigle_pattern}*/$zip_content_pattern.xml' $zipsiglepattern |";
} else {
$log->debug("Not reading structure information.");
- $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*/${sigle_pattern}*/*/data.xml' $zipsiglepattern |";
+ $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*${sigle_pattern}*/data.xml' $zipsiglepattern |";
}
} else {
$foundry = "base";
- $morphoOrTokenCommand = "$UNZIP -l $morpho_zip '*/${sigle_pattern}*/*/*/morpho.xml' $zipsiglepattern";
+ $morphoOrTokenCommand = "$UNZIP -l $morpho_zip '*${sigle_pattern}*/morpho.xml' $zipsiglepattern";
if (`$morphoOrTokenCommand` !~ /morpho\.xml/) {
$morphoOrTokenCommand =~ s/morpho\.xml/tokens.xml/;
} else {
@@ -143,9 +143,12 @@
}
$morphoOrTokenCommand =~ s/-l/-c/;
$morphoOrTokenCommand .= ' |';
- $plaintextAndStructureCommand = "$UNZIP -c $data_zip " . "'*/${sigle_pattern}*/*/$zip_content_pattern.xml' " . "$zipsiglepattern |";
+ $plaintextAndStructureCommand = "$UNZIP -c $data_zip " . "'*${sigle_pattern}*/$zip_content_pattern.xml' " . "$zipsiglepattern |";
}
+ $log->debug("command to extract annotation and/or tokenization: $morphoOrTokenCommand");
+ $log->debug("command to extract plain text and structure: $plaintextAndStructureCommand");
+
open (MORPHO_OR_TOKENPIPE, $morphoOrTokenCommand) or die "cannot unzip $morpho_zip";
open (PLAINTEXTPIPE, $plaintextAndStructureCommand) or die "cannot unzip $data_zip";
print "$COMMENT_START foundry = $foundry$COMMENT_END\n" if ($comments);