Add korapxml2conllu option -e <regex> to extract element/attributes

./script/korapxml2conllu -e '(posting/id|div/id)' -p "A0000" t/data/wdf19.zip  | head -12

 # foundry = base
 # filename = WDF19/A0000/10894/base/tokens.xml
 # text_id = WDF19_A0000.10894
 # start_offsets = 0 0 5 14 23 32 40 48 51 54 60
 # end_offsets = 61 4 12 22 31 39 47 50 53 59 61
 1	Arts	_	_	_	_	_	_	_	_
 2	visuels	_	_	_	_	_	_	_	_
 # div/id = i.10894_1
 # posting/id = i.10894_1_1
 3	Pourquoi	_	_	_	_	_	_	_	_
 4	toujours	_	_	_	_	_	_	_	_
 5	vouloir	_	_	_	_	_	_	_	_

Change-Id: I2cedc6580699fab0db6794d0f3225ea4da72b30f
diff --git a/Changes b/Changes
index 2b993fb..3d045db 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,5 @@
 0.3.900 unreleased
-
+        - korapxml2conllu option -e <regex> added to extract element/attributes to comments
 
 0.3 2021-02-15
         - Provide conllu2korapxml to convert from ConLL-U to KorAP-XML zip
diff --git a/script/korapxml2conllu b/script/korapxml2conllu
index 707db18..494b490 100755
--- a/script/korapxml2conllu
+++ b/script/korapxml2conllu
@@ -28,6 +28,7 @@
 
 GetOptions(
   'sigle-pattern|p=s'            => \(my $sigle_pattern = ''),
+  'extract-attributes-regex|e=s' => \(my $extract_attributes_regex = ''),
   'log|l=s'                      => \(my $log_level = 'warn'),
 
   'help|h'                       => sub {
@@ -47,14 +48,6 @@
   }
 );
 
- ZIPSIGLEPATTERN='-x "*15/FEB*" "*15/MAR*"' $0 /vol/corpora/DeReKo/current/KorAP/zip/zca15.tree_tagger.zip
-
- Results will be written to stdout
-EOF
-
-getopts('dhp:', \%opts);
-die $usage if($opts{h} || @ARGV == 0);
-my $debug=($opts{d}? 1 : 0);
 # Establish logger
 binmode(STDERR, ':encoding(UTF-8)');
 Log::Any::Adapter->set('Stderr', log_level => $log_level);
@@ -68,6 +61,7 @@
 my %processedFilenames;
 my $zipsiglepattern = (defined($ENV{ZIPSIGLEPATTERN})? $ENV{ZIPSIGLEPATTERN} : "");
 my $baseOnly;
+my %extras;
 
 my ($ID_idx, $FORM_idx, $LEMMA_idx, $UPOS_idx, $XPOS_idx, $FEATS_idx, $HEAD_idx, $DEPREC_idx, $DEPS_idx, $MISC_idx) = (0..9);
 
@@ -96,19 +90,22 @@
   die "cannot open data file $data_zip corresponding to $morpho_zip" if(! -r $data_zip);
   
   my $first=1;
-  my $pattern = (defined($opts{p})? $opts{p} : '');
   my @conll = ("_") x 10;
   my $filename;
 
   $baseOnly = $morpho_zip eq $data_zip;
   my ($morphoOrTokenCommand, $plaintextAndStructureCommand);
-  if(!$baseOnly) {
-    $morphoOrTokenCommand = "$UNZIP -c $morpho_zip '*/${pattern}*/*/*/morpho.xml' $zipsiglepattern |";
-    $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*/${pattern}*/*/data.xml' $zipsiglepattern |";
+  if (!$baseOnly) {
+    $morphoOrTokenCommand = "$UNZIP -c $morpho_zip '*/${sigle_pattern}*/*/*/morpho.xml' $zipsiglepattern |";
+    if ($extract_attributes_regex) {
+      $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*/${sigle_pattern}*/*/[sd][ta]*.xml' $zipsiglepattern |";
+    } else {
+      $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*/${sigle_pattern}*/*/data.xml' $zipsiglepattern |";
+    }
   } else {
     $foundry = "base";
-    $morphoOrTokenCommand = "$UNZIP -c $morpho_zip '*/${pattern}*/*/*/tokens.xml' $zipsiglepattern |";
-    $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*/${pattern}*/*/[sd][ta]*.xml' $zipsiglepattern |";
+    $morphoOrTokenCommand = "$UNZIP -c $morpho_zip '*/${sigle_pattern}*/*/*/tokens.xml' $zipsiglepattern |";
+    $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*/${sigle_pattern}*/*/[sd][ta]*.xml' $zipsiglepattern |";
   }
 
   open (MORPHO_OR_TOKENPIPE, $morphoOrTokenCommand) or die "cannot unzip $morpho_zip";
@@ -163,9 +160,18 @@
         $conll[$MISC_idx] = $2;
       }
     } elsif (/<span /) {
+      my $last_from = $current_from // -1;
       ($current_id) = /id="[^0-9]*([^\"]*)"/;
       ($current_from) = /from="([^\"]*)"/;
       ($current_to) = /to="([^\"]*)"/;
+      if($extract_attributes_regex) {
+        for (my $i = $last_from + 1; $i <= $current_from; $i++) {
+          if ($extras{$docid}{$i}) {
+            $current .= $extras{$docid}{$i};
+            undef $extras{$docid}{$i};
+          }
+        }
+      }
       $log->debug("found span: $current_id $current_from $current_to");
       $token = substr($plain_texts{$docid}, $current_from, $current_to - $current_from);
       if (!defined $token) {
@@ -251,6 +257,7 @@
   my ($target_id) = @_;
   my $docid;
   my $text_started=0;
+  my $text_count = 0;
   my ($current_id, $current_from, $current_to);
 
   if($plain_texts{$target_id} && (!$baseOnly || $sentence_ends{$target_id}{-1})) {
@@ -271,6 +278,19 @@
     } elsif(m@<f\s[^>]*>s</f>@) {
       $log->debug("Found sentence end for $docid \@$current_to");
       $sentence_ends{$docid}{$current_to}=1;
+    } elsif($extract_attributes_regex && m@<f\sname="name"[^>]*>([^<]+)</f>@) {
+      my $current_element = $1;
+      while(<PLAINTEXTPIPE>) {
+        last if(m@</fs>@);
+        if(m@<f\sname="([^"]+)"[^>]*>([^<]+)</f>@) {
+          my $current_node = "$current_element/$1";
+          my $value = $2;
+          if ($current_node =~ /$extract_attributes_regex/) {
+#            print "# $docid $current_from-$current_to :: $current_node = $value\n";
+            $extras{$docid}{$current_from} .= "# $current_node = $value\n";
+          }
+        }
+      }
     } elsif (m@<text>(.*)</text>@) {
       $_= decode("utf-8", $1, Encode::FB_DEFAULT);
       s/&lt;/</go;
@@ -278,7 +298,7 @@
       s/&amp;/&/go;
       tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣-/...""""""'''''''-/;
       $plain_texts{$docid} = $_;
-      last if($docid eq $target_id);
+      last if($text_count++ > 1 && $plain_texts{$target_id});
     } elsif (m@<text>(.*)@) {
       $_= decode("utf-8", $1, Encode::FB_DEFAULT);
       s/&lt;/</go;
@@ -295,7 +315,7 @@
       tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣-/...""""""'''''''-/;
       $plain_texts{$docid} .= $_;
       $text_started=0;
-      last if($docid eq $target_id);
+      last if($text_count++ > 1 && $plain_texts{$target_id});
     } elsif ($text_started) {
       chomp;
       $_ = decode("utf-8", $_, Encode::FB_DEFAULT) . ' ';
@@ -346,6 +366,11 @@
 =item B<--sigle-pattern|-p>
 
 Convert only texts from the KorAP XML zip files with folder names (i.e. sigles) matching the glob pattern.
+
+=item B<--extract-attribute-pattern|-e>
+
+Extract element/attribute regular expressions to comments.
+
 =item B<--help|-h>
 
 Print help information.
@@ -362,6 +387,9 @@
 =back
 
 =head1 EXAMPLES
+
+ korapxml2conllu -e '(posting/id|div/id)' t/data/wdf19.zip
+
 =head1 COPYRIGHT AND LICENSE
 
 Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
diff --git a/t/data/wdf19.zip b/t/data/wdf19.zip
new file mode 100644
index 0000000..61a8bdf
--- /dev/null
+++ b/t/data/wdf19.zip
Binary files differ
diff --git a/t/test.t b/t/test.t
index 22d261a..48bd2b1 100644
--- a/t/test.t
+++ b/t/test.t
@@ -1,6 +1,6 @@
 use strict;
 use warnings;
-use Test::More tests => 10;
+use Test::More tests => 19;
 use Test::Script;
 use Test::TempDir::Tiny;
 use File::Copy;
@@ -75,4 +75,15 @@
 script_runs([ 'script/korapxml2conllu', "$test_tempdir/goe.tree_tagger.zip" ],
     "Converts $test_tempdir/goe.tree_tagger.zip to CoNLL-U");
 script_stdout_is $expected, "Full round trip: Converts goe.morpho.conllu to KorAP-XML and back to CoNLL-U correctly";
+
+script_runs([ 'script/korapxml2conllu', '-e',  'div/type', "t/data/goe.tree_tagger.zip" ], "Runs korapxml2conllu with morpho input and attribute extraction");
+script_stdout_like "\n# div/type = Autobiographie\n", "Extracts attributes from morpho zips";
+script_stdout_like "\n# div/type = section\n", "Extracts attributes from morpho zips";
+
+script_runs([ 'script/korapxml2conllu', '-e',  '(posting/id|div/id)', "t/data/wdf19.zip" ], "Runs korapxml2conllu with base input and regex attribute extraction");
+script_stdout_like "\n# posting/id = i.13075_11_45", "Extracts multiple attributes from base zips (1)";
+script_stdout_like "\n# div/id = i.13075_14", "Extracts multiple attributes from base zips (2)";
+script_stdout_like "\n# posting/id = i.14548_9_1\n3\tbonjour", "Extracts attributes in the right place";
+script_stdout_like "\n# posting/id = i.12610_4_4", "Extracts directly adjacent postings from base zips (1)";
+script_stdout_like "\n# posting/id = i.12610_4_5", "Extracts directly adjacent postings from base zips (2)";
 done_testing;