Add --auto-textsigle <start-sigle> option
Also allows for processing plain TEI P5 files without any IDs.
Change-Id: Ic16b089c916d2e50458aa1aa6cb80ce4d37d97ba
diff --git a/Changes b/Changes
index 191d067..a487f61 100644
--- a/Changes
+++ b/Changes
@@ -1,6 +1,7 @@
2.6.0 2024-09-19
- Add -o parameter.
- Add support for inline dependency relations.
+ - Add support for --auto-textsigle.
2.5.0 2024-01-24
- Upgrade minimal Perl version to 5.36 to improve
diff --git a/Readme.pod b/Readme.pod
index 1c95540..e890733 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -165,6 +165,17 @@
Expects a comma-separated list of tags to be ignored when the structure
is parsed. Content of these tags however will be processed.
+=item B<--auto-textsigle> <textsigle>
+
+Expects a text sigle that serves as fallback if no text sigles
+are given in the input data.
+The auto text sigle will be incremented for each text processed.
+
+Example:
+
+ tei2korapxml --auto-textsigle 'ICC/GER.00001' -s -tk - \
+ < data.i5.xml > korapxml.zip
+
=item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>
Expects a regular replacement expression (separated by B<@> between the
diff --git a/lib/KorAP/XML/TEI.pm b/lib/KorAP/XML/TEI.pm
index 1111c8b..f7768e7 100644
--- a/lib/KorAP/XML/TEI.pm
+++ b/lib/KorAP/XML/TEI.pm
@@ -4,7 +4,7 @@
use warnings;
use Exporter 'import';
-our @EXPORT_OK = qw(remove_xml_comments escape_xml escape_xml_minimal replace_entities);
+our @EXPORT_OK = qw(remove_xml_comments escape_xml escape_xml_minimal replace_entities increase_auto_textsigle);
# convert '&', '<' and '>' into their corresponding sgml-entities
my %ent_without_quot = (
@@ -180,4 +180,16 @@
return($_);
};
+sub increase_auto_textsigle {
+ my $sigle = shift;
+
+ if ($sigle =~ /(\d+)$/) {
+ my $number = $1;
+ my $length = length($number);
+ $number++;
+ my $new_number = sprintf("%0${length}d", $number);
+ $sigle =~ s/\d+$/$new_number/;
+ }
+ return $sigle;
+}
1;
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 418408e..86f7527 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -6,6 +6,7 @@
use Log::Any::Adapter;
use Pod::Usage;
use Getopt::Long qw(GetOptions :config no_auto_abbrev);
+use KorAP::XML::TEI qw(increase_auto_textsigle);
use File::Basename qw(dirname);
@@ -45,6 +46,7 @@
# Parse options from the command line
GetOptions(
+ 'auto-textsigle|A=s' => \(my $auto_textsigle = ''),
'root|r=s' => \(my $root_dir = '.'),
'input|i=s' => \(my $input_fname = ''),
'output|o=s' => \(my $output_fname = ''),
@@ -460,8 +462,11 @@
};
# Parse header
- my $header = KorAP::XML::TEI::Header->new($content, $input_enc, $text_id_esc)->parse($input_fh);
-
+ my $header = KorAP::XML::TEI::Header->new($content, $input_enc, $text_id_esc // $auto_textsigle)->parse($input_fh);
+ if ($auto_textsigle) {
+ $auto_textsigle = increase_auto_textsigle($auto_textsigle);
+ $log->debug("Auto-incremented text sigle to $auto_textsigle");
+ };
# Header was parseable
if ($header) {
@@ -666,6 +671,17 @@
Expects a comma-separated list of tags to be ignored when the structure
is parsed. Content of these tags however will be processed.
+=item B<--auto-textsigle> <textsigle>
+
+Expects a text sigle that serves as fallback if no text sigles
+are given in the input data.
+The auto text sigle will be incremented for each text processed.
+
+Example:
+
+ tei2korapxml --auto-textsigle 'ICC/GER.00001' -s -tk - \
+ < data.i5.xml > korapxml.zip
+
=item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>
Expects a regular replacement expression (separated by B<@> between the