Enable conversion of (some) standard TEI P5

Provide an option --xmlid-to-textsigle
to convert P5 xml:id attributes to three-part I5 sigles

Resolves #4
Resolves #5

Change-Id: I8fd23b7021c25cf4d80234a0570d06dea78e7813
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 5c74b91..deee2cc 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -58,6 +58,7 @@
   'data-file=s'           => \(my $data_file   = 'data'),
   'header-file=s'         => \(my $header_file = 'header'),
   'tokens-file=s'         => \(my $tokens_file = 'tokens'),
+  'xmlid-to-textsigle|x=s'=> \(my $xmlid_to_textsigle = ''),
   'log|l=s'               => \(my $log_level   = 'notice'),
   'required-version|rv=s' => \(my $required_version),
   ''                      => \(my $stdio),
@@ -94,13 +95,16 @@
 };
 
 
+my ($what, $with);
+if ($xmlid_to_textsigle ne '') {
+  ($what, $with) = split('@', $xmlid_to_textsigle);
+  $what = qr!$what!;
+};
+
 # tag (without attributes), which contains the primary text
 my $_TEXT_BODY = 'text';
 # optional
 
-# TODO: IDS-specific (and redundant)
-my $_HEADER_TAG = 'idsHeader';
-
 # Remember to skip certain inline tags
 my %skip_inline_tags = ();
 if ($skip_inline_tags_str) {
@@ -372,9 +376,41 @@
     };
   }
 
-  # Start of header section
-  elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {
+  elsif (m#^(.*)\<TEI\s+[^>]*?xml:id=(["'])(.+?)\2#) {
+    my $leadin = $1;
+    my $id = $3;
+    my $sigle = $3;
 
+    if ($what) {
+      $_ = $id;
+      eval "s|$what|$with|";  # s@ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2@;
+      $sigle = $_;
+      $log->debug("Converted text id `$id' to sigle `$sigle'");
+    };
+    $sigle =~ s/\./-/g;
+
+    my @parts = split(/[\/_]/, $sigle);
+    if (@parts != 3) {
+      die $log->fatal(
+          "input line number $.: " .
+              "ids must have exactly three parts split by '/', but `$id` only has " . scalar(@parts) . " ".
+              "=> Aborting (line=$_)");
+    };
+
+    $dir = join("/", @parts);
+    $text_id_esc = "$parts[0]/$parts[1].$parts[2]";
+    $log->notice("$0: text_id=$text_id_esc");
+
+    if ($leadin !~ /^\s*$/) {
+      die $log->fatal(
+          "input line number $.: " .
+              'line with opening header tag is not in expected format ... ' .
+              "=> Aborting (line=$_)");
+    };
+  }
+
+  # Start of header section
+  elsif (m#^(.*)(\<(?:ids|tei)Header.*)$#) {
     my $content = "$2\n";
 
     if ($1 !~ /^\s*$/) {
@@ -385,7 +421,7 @@
     };
 
     # Parse header
-    my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);
+    my $header = KorAP::XML::TEI::Header->new($content, $input_enc, $text_id_esc)->parse($input_fh);
 
     # Header was parseable
     if ($header) {
@@ -583,6 +619,21 @@
 Expects a comma-separated list of tags to be ignored when the structure
 is parsed. Content of these tags however will be processed.
 
+=item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>
+
+Expects a regular expression and its replacement, separated by a B<@>.
+The substitution is applied to the C<xml:id> attribute of each C<TEI> element
+to convert the text id to a text sigle with three parts (separated by B</>).
+
+Example:
+
+  tei2korapxml  \
+    --xmlid-to-textsigle 'ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2' \
+    -tk - < t/data/icc_german_sample.p5.xml
+
+Converts text id `ICC.German.DeReKo.WPD17.G11.00238' to
+sigle `ICCGER/DeReKo.WPD17/G11.00238'.
+
 =item B<--inline-tokens> <foundry>#[<file>]
 
 Define the foundry and file (without extension)