Introduce exclusivity for inline token handling
Change-Id: Ia34ec87b2c55aabd94c65ec1e0d63d0cadb27d75
diff --git a/Changes b/Changes
index 992c8f8..eaa432b 100644
--- a/Changes
+++ b/Changes
@@ -12,6 +12,7 @@
- Improve script handling of broken data
- Improve handling of unknown header types
- Check for valid sigles to avoid broken directories
+ - Introduce exclusivity for inline token handling
1.00 2021-02-18 Release
- -s option added that uses sentence boundaries
diff --git a/Readme.pod b/Readme.pod
index 5c976c4..79180c4 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -141,6 +141,17 @@
this will contain annotations as well.
Defaults to C<tokens> and C<morpho>.
+The inline token data will also be stored in the
+inline structures file (see I<--inline-structures>),
+unless the inline token foundry is prefixed
+with an exclamation mark (B<!>), indicating that inline
+tokens are stored exclusively in the inline tokens
+file.
+
+Example:
+
+ tei2korapxml --inline-tokens '!gingko#morpho' < data.i5.xml > korapxml.zip
+
=item B<--inline-structures> <foundry>#[<file>]
Define the foundry and file (without extension)
diff --git a/lib/KorAP/XML/TEI/Inline.pm b/lib/KorAP/XML/TEI/Inline.pm
index 5446deb..fd83062 100644
--- a/lib/KorAP/XML/TEI/Inline.pm
+++ b/lib/KorAP/XML/TEI/Inline.pm
@@ -37,13 +37,14 @@
TOKENS => 5,
STRUCTURES => 6,
SKIP_INLINE_TAGS => 7,
- SKIP_INLINE_TOKENS => 8
+ SKIP_INLINE_TOKENS => 8,
+ INLINE_TOKENS_EXCLUSIVE => 9
};
# Constructor
sub new {
- my ($class, $skip_inline_tokens, $skip_inline_tags) = @_;
+ my ($class, $skip_inline_tokens, $skip_inline_tags, $inline_tokens_exclusive) = @_;
my @self = ();
@@ -67,9 +68,10 @@
$self[TOKENS] = KorAP::XML::TEI::Annotations::Collector->new;
# Initialize structure collector
- $self[STRUCTURES] = KorAP::XML::TEI::Annotations::Collector->new;
- $self[SKIP_INLINE_TOKENS] = $skip_inline_tokens // undef;
- $self[SKIP_INLINE_TAGS] = $skip_inline_tags // {};
+ $self[STRUCTURES] = KorAP::XML::TEI::Annotations::Collector->new;
+ $self[SKIP_INLINE_TOKENS] = $skip_inline_tokens // undef;
+ $self[INLINE_TOKENS_EXCLUSIVE] = $inline_tokens_exclusive // 0;
+ $self[SKIP_INLINE_TAGS] = $skip_inline_tags // {};
bless \@self, $class;
};
@@ -137,12 +139,26 @@
next;
};
- my $anno = $self->[STRUCTURES]->add_new_annotation($node_info);
+ my $anno = KorAP::XML::TEI::Annotations::Annotation->new($node_info);
- # Add element also to token list
- if (!$self->[SKIP_INLINE_TOKENS] && $node_info eq $_TOKENS_TAG) {
- $self->[TOKENS]->add_annotation($anno);
- };
+ # Is token tag
+ if ($node_info eq $_TOKENS_TAG) {
+
+ # Add tokens to the structure file, unless they are exclusive to the token file
+ unless ($self->[INLINE_TOKENS_EXCLUSIVE]) {
+ $self->[STRUCTURES]->add_annotation($anno);
+ }
+
+ # Add tokens to the token list
+ if (!$self->[SKIP_INLINE_TOKENS]) {
+ $self->[TOKENS]->add_annotation($anno);
+ };
+ }
+
+ # Not token tag
+ else {
+ $self->[STRUCTURES]->add_annotation($anno);
+ }
# Handle attributes (if attributes exist)
if (defined $e->[3]) {
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 10a2787..0d196a5 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -42,6 +42,9 @@
warn 'KORAPXMLTEI_INLINE is deprecated in favor of --skip-inline-token-annotations';
};
+# If set, inline tokens won't be stored in the structure file
+my $inline_tokens_exclusive = 0;
+
# Parse options from the command line
GetOptions(
'root|r=s' => \(my $root_dir = '.'),
@@ -136,6 +139,11 @@
# i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set
my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
+if (index($_tokens_dir, '!') == 0) {
+ $_tokens_dir = substr($_tokens_dir, 1);
+ $inline_tokens_exclusive = 1;
+};
+
# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir);
@@ -168,7 +176,8 @@
# Create inline parser object
my $inline = KorAP::XML::TEI::Inline->new(
$skip_inline_tokens,
- \%skip_inline_tags
+ \%skip_inline_tags,
+ $inline_tokens_exclusive
);
@@ -532,6 +541,17 @@
this will contain annotations as well.
Defaults to C<tokens> and C<morpho>.
+The inline token data will also be stored in the
+inline structures file (see I<--inline-structures>),
+unless the inline token foundry is prefixed
+with an exclamation mark (B<!>), indicating that inline
+tokens are stored exclusively in the inline tokens
+file.
+
+Example:
+
+ tei2korapxml --inline-tokens '!gingko#morpho' < data.i5.xml > korapxml.zip
+
=item B<--inline-structures> <foundry>#[<file>]
Define the foundry and file (without extension)
diff --git a/t/inline.t b/t/inline.t
index d5a1db2..76ff74d 100644
--- a/t/inline.t
+++ b/t/inline.t
@@ -250,4 +250,27 @@
;
};
+
+subtest 'Treatment of tokens' => sub {
+ my $inline = KorAP::XML::TEI::Inline->new(0, {b => 1}, 1);
+
+ ok($inline->parse('aaa', \'<a>Der</a> <b>alte</b> <w pos="NN">Baum</w>'), 'Parsed');
+ is($inline->data->data, 'Der alte Baum');
+
+ # Only contains '<a>'
+ Test::XML::Loy->new($inline->structures->to_string('aaa', 1))
+ ->attr_is('#s1', 'to', 3)
+ ->element_exists_not('#s2')
+ ;
+
+ # Only contains 'w'
+ Test::XML::Loy->new($inline->tokens->to_string('aaa', 1))
+ ->attr_is('#s0', 'from', 9)
+ ->attr_is('#s0', 'to', 13)
+ ->attr_is('#s0 > fs > f > fs > f', 'name', 'pos')
+ ->text_is('#s0 > fs > f > fs > f[name=pos]', 'NN')
+ ->element_exists_not('#s1')
+ ;
+};
+
done_testing;