Add support for inline dependency structures (fixes #7)
Change-Id: I25781e1a285a6bd6345ceb5e5487b410e9bd5353
diff --git a/script/tei2korapxml b/script/tei2korapxml
index c150c04..418408e 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -40,6 +40,9 @@
# Inline tokens won't be stored in the structure file
my $inline_tokens_exclusive = 0;
+# Inline dependencies won't be stored in the tokens file
+my $inline_deps_exclusive = 0;
+
# Parse options from the command line
GetOptions(
'root|r=s' => \(my $root_dir = '.'),
@@ -52,8 +55,9 @@
'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
+ 'inline-dependencies=s' => \(my $inline_dependencies),
'skip-inline-tokens' => \(my $skip_inline_tokens = 0),
- 'skip-inline-token-annotations' => \(
+ 'skip-inline-token-annotations!' => \(
my $skip_inline_token_annotations = ($ENV{KORAPXMLTEI_INLINE} ? 0 : 1)),
'skip-inline-tags=s' => \(my $skip_inline_tags_str = ''),
'base-foundry=s' => \(my $base_dir = 'base'),
@@ -144,7 +148,7 @@
exit(1);
};
-if ($use_tokenizer_sentence_splits) {
+if (!$no_tokenizer && $use_tokenizer_sentence_splits) {
$skip_inline_tags{s} = 1;
};
@@ -166,6 +170,19 @@
$inline_tokens_exclusive = 1;
};
+
+my ($_dep_dir, $_dep_file);
+if ($inline_dependencies) {
+ ($_dep_dir, $_dep_file) = split '#', $inline_dependencies . '#dependency';
+ $inline_dependencies = 1;
+
+ if ($_dep_dir && index($_dep_dir, '!') == 0) {
+ $_dep_dir = substr($_dep_dir, 1);
+ $inline_deps_exclusive = 1;
+ };
+};
+
+
# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir, $output_fname);
@@ -216,7 +233,8 @@
my $inline = KorAP::XML::TEI::Inline->new(
$skip_inline_tokens,
\%skip_inline_tags,
- $inline_tokens_exclusive
+ $inline_tokens_exclusive,
+ $inline_dependencies
);
@@ -320,7 +338,7 @@
};
# ~ write structures ~
- if (!$inline->structures->empty) {
+ unless ($inline->structures->empty) {
$inline->structures->to_zip(
$zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
$text_id_esc,
@@ -333,11 +351,23 @@
$inline->tokens->to_zip(
$zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
$text_id_esc,
- # Either 0 = tokens without inline or 1 = tokens with inline
- !$skip_inline_token_annotations
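+ # Serialization mode: 0 = tokens without inline annotations,
+ # 1 = tokens with inline annotations, 4 = presumably inline annotations
+ # without dependencies (exclusive dependencies file) - TODO confirm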
+ ($skip_inline_token_annotations ? 0 : ($inline_deps_exclusive ? 4 : 1))
);
};
+ # ~ write dependencies ~
+ unless ($inline->dependencies->empty) {
+ $inline->dependencies->to_zip(
+ $zipper->new_stream("$dir/$_dep_dir/${_dep_file}.xml"),
+ $text_id_esc,
+ 3 # = dependency serialization
+ );
+ };
+
+
# reinit.
$dir = '';
@@ -628,7 +658,8 @@
Boolean flag indicating that inline token annotations should not
be processed. Defaults to true (meaning inline token annotations
-won't be processed).
+won't be processed). Can be negated with
+C<--no-skip-inline-token-annotations>.
=item B<--skip-inline-tags> <tags>
@@ -667,7 +698,30 @@
Example:
- tei2korapxml --inline-tokens '!gingko#morpho' < data.i5.xml > korapxml.zip
+ tei2korapxml --no-tokenizer --inline-tokens \
+ '!gingko#morpho' < data.i5.xml > korapxml.zip
+
+=item B<--inline-dependencies> <foundry>#[<file>]
+
+Define the foundry and file (without extension)
+to store inline dependency information in.
+The file name defaults to C<dependency>. If the option
+is not set, dependencies are not stored separately
+(which means dependency attributes will be stored in
+the inline tokens file, if not skipped).
+
+The dependency data will also be stored in the
+inline tokens file (see I<--inline-tokens>),
+unless the inline dependencies foundry is prefixed
+with an exclamation mark (B<!>), indicating that inline
+dependency data is stored exclusively in the inline
+dependencies file.
+
+Example:
+
+ tei2korapxml --no-tokenizer --inline-dependencies \
+ 'gingko#dependency' < data.i5.xml > korapxml.zip
+
=item B<--inline-structures> <foundry>#[<file>]
@@ -727,7 +781,7 @@
=head1 COPYRIGHT AND LICENSE
-Copyright (C) 2021-2023, L<IDS Mannheim|https://www.ids-mannheim.de/>
+Copyright (C) 2021-2024, L<IDS Mannheim|https://www.ids-mannheim.de/>
Author: Peter Harders