Enable conversion of (some) standard TEI P5
Provides an option --xmlid-to-textsigle
to convert P5 xml:id attributes to three-part I5 sigles
Resolves #4
Resolves #5
Change-Id: I8fd23b7021c25cf4d80234a0570d06dea78e7813
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 5c74b91..deee2cc 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -58,6 +58,7 @@
'data-file=s' => \(my $data_file = 'data'),
'header-file=s' => \(my $header_file = 'header'),
'tokens-file=s' => \(my $tokens_file = 'tokens'),
+ 'xmlid-to-textsigle|x=s'=> \(my $xmlid_to_textsigle = ''),
'log|l=s' => \(my $log_level = 'notice'),
'required-version|rv=s' => \(my $required_version),
'' => \(my $stdio),
@@ -94,13 +95,16 @@
};
+my ($what, $with);
+if ($xmlid_to_textsigle ne '') {
+ ($what, $with) = split('@', $xmlid_to_textsigle);
+ $what = qr!$what!;
+};
+
# tag (without attributes), which contains the primary text
my $_TEXT_BODY = 'text';
# optional
-# TODO: IDS-specific (and redundant)
-my $_HEADER_TAG = 'idsHeader';
-
# Remember to skip certain inline tags
my %skip_inline_tags = ();
if ($skip_inline_tags_str) {
@@ -372,9 +376,41 @@
};
}
- # Start of header section
- elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {
+ elsif (m#^(.*)\<TEI\s+[^>]*?xml:id=(["'])(.+?)\2#) {
+ my $leadin = $1;
+ my $id = $3;
+ my $sigle = $3;
+ if ($what) {
+ $_ = $id;
+ eval "s|$what|$with|"; # s@ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2@;
+ $sigle = $_;
+ $log->debug("Converted text id `$id' to sigle `$sigle'");
+ };
+ $sigle =~ s/\./-/g;
+
+ my @parts = split(/[\/_]/, $sigle);
+ if (@parts != 3) {
+ die $log->fatal(
+ "input line number $.: " .
+ "ids must have exactly three parts split by '/', but `$id` only has " . scalar(@parts) . " ".
+ "=> Aborting (line=$_)");
+ };
+
+ $dir = join("/", @parts);
+ $text_id_esc = "$parts[0]/$parts[1].$parts[2]";
+ $log->notice("$0: text_id=$text_id_esc");
+
+ if ($leadin !~ /^\s*$/) {
+ die $log->fatal(
+ "input line number $.: " .
+ 'line with opening header tag is not in expected format ... ' .
+ "=> Aborting (line=$_)");
+ };
+ }
+
+ # Start of header section
+ elsif (m#^(.*)(\<(?:ids|tei)Header.*)$#) {
my $content = "$2\n";
if ($1 !~ /^\s*$/) {
@@ -385,7 +421,7 @@
};
# Parse header
- my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);
+ my $header = KorAP::XML::TEI::Header->new($content, $input_enc, $text_id_esc)->parse($input_fh);
# Header was parseable
if ($header) {
@@ -583,6 +619,21 @@
Expects a comma-separated list of tags to be ignored when the structure
is parsed. Content of these tags however will be processed.
+=item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>
+
+Expects a regular expression and a replacement string (separated by a B<@>)
+that are used to convert the xml:id attribute of a TEI element into a text
+sigle with three parts (separated by B</>).
+
+Example:
+
+ tei2korapxml \
+ --xmlid-to-textsigle 'ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2' \
+ -tk - < t/data/icc_german_sample.p5.xml
+
+Converts text id `ICC.German.DeReKo.WPD17.G11.00238' to sigle
+`ICCGER/DeReKo.WPD17/G11.00238'; any remaining dots are then replaced by dashes.
+
=item B<--inline-tokens> <foundry>#[<file>]
Define the foundry and file (without extension)