Test and benchmark conversion of inline annotations
Change-Id: I2eaabb35373b2a4c87c329a4a5254a5f347e989c
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 66f36b3..d1bb176 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -109,7 +109,7 @@
## TODO: optional
# handling inline annotations (inside $_TOKENS_TAG)
-my $_INLINE_ANNOT = 0; # on/off: set to 1 if inline annotations are present and should be processed (default: 0)
+my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE}?1:0; # on/off: 1 if the environment variable KORAPXMLTEI_INLINE is set to a true value, i.e. inline annotations are present and should be processed (default: 0)
my $_INLINE_LEM_RD = "lemma"; # from which attribute to read LEMMA information
my $_INLINE_ATT_RD = "ana"; # from which attribute to read POS information (and evtl. additional MSD - Morphosyntactic Descriptions)
# TODO: The format for the POS and MSD information has to suffice the regular expression ([^ ]+)( (.+))?
diff --git a/t/script.t b/t/script.t
index 31ff24b..6af3095 100644
--- a/t/script.t
+++ b/t/script.t
@@ -390,4 +390,74 @@
);
};
+
+subtest 'Check Inline annotations' => sub {
+ # Runs the converter with KORAPXMLTEI_INLINE=1 on a tagged sample and inspects tokens/morpho.xml in the resulting zip
+ # Load example file containing inline annotations (tagged sample)
+ my $file = catfile($f, 'data', 'goe_sample_tagged.i5.xml');
+
+ my ($fh, $outzip) = korap_tempfile('script_tagged');
+
+ # Generate zip file (unportable! relies on a shell providing cat, pipes and VAR=value command prefixes)
+ stderr_like(
+ sub { `cat '$file' | KORAPXMLTEI_INLINE=1 perl '$script' > '$outzip'` },
+ qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
+ 'Processing'
+ );
+
+ ok(-e $outzip, "File $outzip exists");
+
+ my $zip = IO::Uncompress::Unzip->new(
+ $outzip,
+ Name => 'GOE/AGA/00000/tokens/morpho.xml'
+ );
+ ok($zip, 'Inline annotations');
+ # Read the complete morpho.xml member line by line into $tokens
+ my $tokens;
+ $tokens .= $zip->getline while !$zip->eof;
+ ok($zip->close, 'Closed');
+
+ my $t = Test::XML::Loy->new($tokens);
+
+ $t->attr_is('layer', 'docid', 'GOE_AGA.00000')
+ ->attr_is('spanList span:nth-child(1)', 'id', 's0')
+ ->attr_is('spanList span:nth-child(1)', 'from', '75')
+ ->attr_is('spanList span:nth-child(1)', 'to', '81')
+ ->attr_is('spanList span:nth-child(1)', 'l', '7')
+
+ ->attr_is('span#s0 > fs', 'type', 'lex')
+ ->attr_is('span#s0 > fs', 'xmlns', 'http://www.tei-c.org/ns/1.0')
+ ->attr_is('span#s0 > fs > f > fs > f:nth-child(1)', 'name', 'pos')
+ ->text_is('span#s0 > fs > f > fs > f:nth-child(1)', 'A')
+ ->attr_is('span#s0 > fs > f > fs > f:nth-child(2)', 'name', 'msd')
+ ->text_is('span#s0 > fs > f > fs > f:nth-child(2)', '@NH')
+
+ ->attr_is('span#s25', 'from', '259')
+ ->attr_is('span#s25', 'to', '263')
+ ->attr_is('span#s25', 'l', '7')
+ ->attr_is('span#s25 > fs > f > fs > f:nth-child(1)', 'name', 'pos')
+ ->text_is('span#s25 > fs > f > fs > f:nth-child(1)', 'PRON')
+ ->attr_is('span#s25 > fs > f > fs > f:nth-child(2)', 'name', 'msd')
+ ->text_is('span#s25 > fs > f > fs > f:nth-child(2)', '@NH')
+
+ ->attr_is('span#s58', 'from', '495')
+ ->attr_is('span#s58', 'to', '500')
+ ->attr_is('span#s58', 'l', '7')
+ ->attr_is('span#s58 > fs > f > fs > f:nth-child(1)', 'name', 'pos')
+ ->text_is('span#s58 > fs > f > fs > f:nth-child(1)', 'N')
+ ->attr_is('span#s58 > fs > f > fs > f:nth-child(2)', 'name', 'msd')
+ ->text_is('span#s58 > fs > f > fs > f:nth-child(2)', '@NH')
+
+ ->attr_is('span#s119', 'from', '914')
+ ->attr_is('span#s119', 'to', '925')
+ ->attr_is('span#s119', 'l', '7')
+ ->attr_is('span#s119 > fs > f > fs > f:nth-child(1)', 'name', 'pos')
+ ->text_is('span#s119 > fs > f > fs > f:nth-child(1)', 'A')
+ ->attr_is('span#s119 > fs > f > fs > f:nth-child(2)', 'name', 'msd')
+ ->text_is('span#s119 > fs > f > fs > f:nth-child(2)', '@NH')
+ ->element_exists_not('span#s120')
+ ;
+};
+
+
done_testing;
diff --git a/xt/benchmark.pl b/xt/benchmark.pl
index 163b85b..71dbe54 100644
--- a/xt/benchmark.pl
+++ b/xt/benchmark.pl
@@ -36,8 +36,9 @@
my $f = dirname(__FILE__);
my $script = rel2abs(catfile($f, '..', 'script', $SCRIPT_NAME));
-# Load example file
+# Load example files
my $file = rel2abs(catfile($f, '..', 't', 'data', 'goe_sample.i5.xml'));
+my $goe_tagged = rel2abs(catfile($f, '..', 't', 'data', 'goe_sample_tagged.i5.xml'));
# Create a new benchmark object
my $bench = Dumbbench->new(
@@ -86,6 +87,12 @@
}
),
Dumbbench::Instance::PerlSub->new(
+ name => 'Conversion-with-inline-annotations',
+ code => sub {
+ `cat '$goe_tagged' | KORAPXMLTEI_INLINE=1 perl '$script' > /dev/null 2>&1`
+ }
+ ),
+ Dumbbench::Instance::PerlSub->new(
name => 'delHTMLcom',
code => sub {
for (1..100_000) {