Check unzip -c metadata output more rigorously

Anchor the "inflating:" and XML tag patterns at the start of the line.

Change-Id: I110f9631a3e7c53ba0bb28cd995b6a3023cb6cf5
diff --git a/script/korapxml2conllu b/script/korapxml2conllu
index a8bcf00..865224e 100755
--- a/script/korapxml2conllu
+++ b/script/korapxml2conllu
@@ -71,7 +71,7 @@
   open (PLAINTEXTPIPE, "unzip -c $data_zip '*/${pattern}*/*/data.xml' $zipsiglepattern |") or die "cannot unzip $data_zip";
   print "$COMMENT_START foundry = $foundry\n";
   while (<MORPHOPIPE>) {
-    if (/\s+inflating:\s+(.*)/) {
+    if (/^  inflating: (.*)/) {
       $filename=$1;
       while($processedFilenames{$filename} && !eof(MORPHOPIPE)) {
         print STDERR "WARNING: $filename already processed\n";
@@ -80,7 +80,7 @@
         }
         $filename=$1 if(!eof(MORPHOPIPE) && /\s+inflating:\s+(.*)/);
       }
-    } elsif(m@<layer\s+.*docid="([^"]+)"@) {
+    } elsif(m@^\s*<layer\s+.*docid="([^"]+)"@) {
       last if($test && $text_no++ > 3);
       if(!$first) {
         closeDoc(0);
@@ -100,7 +100,7 @@
       }
       print STDOUT "$COMMENT_START filename = $filename\n$COMMENT_START text_id = $docid\n";
       print STDERR "Analyzing $docid\n" if ($debug);
-    } elsif (m@<f\s+.*name="([^"]+)">([^<]+)</f>@) {
+    } elsif (m@^\s*<f\s+.*name="([^"]+)">([^<]+)</f>@) {
       if ($1 eq "lemma") {
         $conll[$LEMMA_idx] = $2;
         $conll[$LEMMA_idx] =~ s/[\t\n\r]//g; # make sure that lemmas never contain tabs or newlines
@@ -131,7 +131,7 @@
       $token=~s/[\t\n\r]//g; # make sure that tokens never contain tabs or newlines
       @conll = ("_") x 10;
       $conll[$FORM_idx] = encode("utf-8", $token);
-    } elsif (m@</fs>@) {
+    } elsif (m@^\s*</fs>@) {
       my @vals = ($current_from, $current_to);
       print STDERR "joining : ", join(" ", @vals), "\n" if($debug);
       push @current_lines, \@vals;