Fix sigle-pattern option to take effect on the whole sigle

Resolves #4

Change-Id: I3514dead562acfccd6ee21c7d1f9e36af5e2fbba
diff --git a/Changes b/Changes
index 7116bf3..77350ea 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,5 @@
         - korapxml2conllu:
+            - the sigle-pattern option now affects the entire sigle
             - handle docid attributes correctly if they are in a different line than their parent element <layer>
 
 0.5.0 2022-09-29
diff --git a/script/korapxml2conllu b/script/korapxml2conllu
index 9cb25f8..b771474 100755
--- a/script/korapxml2conllu
+++ b/script/korapxml2conllu
@@ -126,16 +126,16 @@
     $zip_content_pattern = "[sdh][tae]*";
   }
   if (!$baseOnly) {
-    $morphoOrTokenCommand = "$UNZIP -c $morpho_zip '*/${sigle_pattern}*/*/*/morpho.xml' $zipsiglepattern |";
+    $morphoOrTokenCommand = "$UNZIP -c $morpho_zip '*${sigle_pattern}*/morpho.xml' $zipsiglepattern |";
     if ($extract_attributes_regex || !$s_bounds_from_morpho) {
-      $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*/${sigle_pattern}*/*/$zip_content_pattern.xml' $zipsiglepattern |";
+      $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*${sigle_pattern}*/$zip_content_pattern.xml' $zipsiglepattern |";
     } else {
       $log->debug("Not reading structure information.");
-      $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*/${sigle_pattern}*/*/data.xml' $zipsiglepattern |";
+      $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*${sigle_pattern}*/data.xml' $zipsiglepattern |";
     }
   } else {
     $foundry = "base";
-    $morphoOrTokenCommand = "$UNZIP -l $morpho_zip '*/${sigle_pattern}*/*/*/morpho.xml' $zipsiglepattern";
+    $morphoOrTokenCommand = "$UNZIP -l $morpho_zip '*${sigle_pattern}*/morpho.xml' $zipsiglepattern";
     if (`$morphoOrTokenCommand` !~ /morpho\.xml/) {
       $morphoOrTokenCommand =~ s/morpho\.xml/tokens.xml/;
     } else {
@@ -143,9 +143,12 @@
     }
     $morphoOrTokenCommand =~ s/-l/-c/;
     $morphoOrTokenCommand .= ' |';
-    $plaintextAndStructureCommand = "$UNZIP -c $data_zip " . "'*/${sigle_pattern}*/*/$zip_content_pattern.xml' " . "$zipsiglepattern |";
+    $plaintextAndStructureCommand = "$UNZIP -c $data_zip " . "'*${sigle_pattern}*/$zip_content_pattern.xml' " . "$zipsiglepattern |";
   }
 
+  $log->debug("command to extract annotation and/or tokenization: $morphoOrTokenCommand");
+  $log->debug("command to extract plain text and structure: $plaintextAndStructureCommand");
+
   open (MORPHO_OR_TOKENPIPE, $morphoOrTokenCommand) or die "cannot unzip $morpho_zip";
   open (PLAINTEXTPIPE, $plaintextAndStructureCommand) or die "cannot unzip $data_zip";
   print "$COMMENT_START foundry = $foundry$COMMENT_END\n" if ($comments);
diff --git a/t/test.t b/t/test.t
index ac3550c..112846d 100644
--- a/t/test.t
+++ b/t/test.t
@@ -1,6 +1,6 @@
 use strict;
 use warnings;
-use Test::More tests => 53;
+use Test::More tests => 59;
 use Test::Script;
 use Test::TempDir::Tiny;
 use File::Copy;
@@ -203,4 +203,13 @@
 script_runs([ 'script/korapxml2conllu', "t/data/nkjp.zip" ], "Runs korapxml2conllu on nkjp test data");
 script_stderr_unlike("Use of uninitialized value", "Handles lonely docid parameters (line separated from layer elements)");
 script_stdout_like("\n9\twesołości\twesołość\tsubst\tsubst\tsg:gen:f", "Correctly converts nkjp annotations");
+
+script_runs([ 'script/korapxml2conllu', "--sigle-pattern", "KOT", "t/data/nkjp.zip" ], "Runs korapxml2conllu with --sigle-pattern option on combined base/morpho files");
+script_stdout_like("NKJP/NKJP/KOT/nkjp/morpho.xml", "--sigle-pattern to specify a doc sigle pattern extracts the right texts");
+script_stdout_unlike("NKJP/NKJP/KolakowskiOco/nkjp/morpho.xml", "--sigle-pattern to specify a doc sigle pattern does not extract the wrong texts");
+
+script_runs([ 'script/korapxml2conllu', "--sigle-pattern", "13072", "t/data/wdf19.tree_tagger.zip" ], "Runs korapxml2conllu with --sigle-pattern option on separate base/morpho files");
+script_stdout_like("WDF19/A0000/13072/tree_tagger/morpho.xml", "--sigle-pattern to specify a text sigle pattern extracts the right texts");
+script_stdout_unlike("WDF19/A0000/14247/tree_tagger/morpho.xml", "--sigle-pattern to specify a text sigle pattern does not extract the wrong texts");
+
 done_testing;