Restructure test suite to prepare CPAN release Change-Id: If3033774f50d33b2e5b3344e3927fd534cef4dfb

commit: 414ec9567c67cf1648e447bc9c47c351ae5ae34a [log] [tgz]
author: Akron <nils@diewald-online.de> Mon Aug 03 15:48:43 2020 +0200
committer: Akron <nils@diewald-online.de> Tue Aug 04 08:55:32 2020 +0200
tree: ef4311389b9abcf8811d24664d36988d3144000f
parent: fab17d3e0c697969e4e228f5d0bfccbbc7130703 [diff] [blame]
diff --git a/t/real/transform.t b/t/real/transform.t
new file mode 100644
index 0000000..cf7e636
--- /dev/null
+++ b/t/real/transform.t

@@ -0,0 +1,205 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use utf8;
+use Test::More;
+use JSON::XS;
+use Benchmark ':hireswallclock';
+use lib 'lib', '../lib';
+
+use File::Basename 'dirname';
+use File::Spec::Functions 'catdir';
+
+if ($ENV{SKIP_REAL}) {  # Skip this whole file when SKIP_REAL is set to a true value
+  plan skip_all => 'Skip real tests';
+};
+
+use_ok('KorAP::XML::Krill');  # Check that the module under test can be loaded
+
+# Turn a serialized token "[(from-to)term|term|...]" into a hash ref keyed
+# by its terms, so is_deeply() can compare tokens regardless of term order.
+sub _t2h {
+  my ($serialized) = @_;
+  # Drop the enclosing brackets together with the leading "(from-to)" part
+  $serialized =~ s/^\[\(\d+?-\d+?\)(.+?)\]$/$1/;
+  my %term_set = map { $_ => 1 } split(qr!\|!, $serialized);
+  return \%term_set;
+};
+
+my @layers;  # Each entry holds the argument list for one $tokens->add(...) call
+# push(@layers, ['Base', 'Sentences']);  # Base/Sentences is added on its own before the loop
+push(@layers, ['Base', 'Paragraphs']);
+
+# OpenNLP
+push(@layers, ['OpenNLP', 'Morpho']);
+push(@layers, ['OpenNLP', 'Sentences']);
+
+# CoreNLP
+push(@layers, ['CoreNLP', 'NamedEntities', 'ne_dewac_175m_600']);
+push(@layers, ['CoreNLP', 'NamedEntities', 'ne_hgc_175m_600']);
+push(@layers, ['CoreNLP', 'Sentences']);
+
+# push(@layers, ['DeReKo', 'Structure']);
+
+# push(@layers, ['Glemm', 'Morpho']);
+
+# push(@layers, ['Mate', 'Morpho']);  # Mate/Morpho is added on its own before the loop
+push(@layers, ['Mate', 'Dependency']);
+
+# push(@layers, ['Malt', 'Dependency']);
+
+# Connexor
+push(@layers, ['Connexor', 'Morpho']);
+push(@layers, ['Connexor', 'Syntax']);
+push(@layers, ['Connexor', 'Phrase']);
+push(@layers, ['Connexor', 'Sentences']);
+
+
+# TODO: OpenNLP
+
+# TreeTagger
+push(@layers, ['TreeTagger', 'Morpho']);
+push(@layers, ['TreeTagger', 'Sentences']);
+
+# Mate (Mate/Dependency is already pushed above)
+#push(@layers, ['Mate', 'Morpho']);
+#push(@layers, ['Mate', 'Dependency']);
+
+# XIP
+push(@layers, ['XIP', 'Morpho']);
+push(@layers, ['XIP', 'Constituency']);
+# push(@layers, ['XIP', 'Dependency']); # Will be skipped
+push(@layers, ['XIP', 'Sentences']);
+
+
+my $path = catdir(dirname(__FILE__), qw!corpus WPD 00001!);  # Fixture directory located relative to this test file
+ok(my $doc = KorAP::XML::Krill->new( path => $path . '/' ), 'Load Korap::Document');
+like($doc->path, qr!\Q$path\E/$!, 'Path');
+
+ok($doc = KorAP::XML::Krill->new( path => $path ), 'Load Korap::Document');  # Same path, no trailing slash
+like($doc->path, qr!\Q$path\E/$!, 'Path');  # The reported path is still expected to end in "/"
+
+ok($doc->parse, 'Parse document');
+
+# Metadata
+my $meta = $doc->meta;
+is($meta->{T_title}, 'A', 'title');
+ok(!$meta->{T_sub_title}, 'subTitle');
+
+is($doc->text_sigle, 'WPD/AAA/00001', 'ID');
+is($doc->corpus_sigle, 'WPD', 'corpusID');
+
+is($meta->{D_pub_date}, '20050328', 'pubDate');
+is($meta->{S_pub_place}, 'URL:http://de.wikipedia.org', 'pubPlace');
+is($meta->{K_text_class}->[0], 'freizeit-unterhaltung', 'TextClass');
+is($meta->{K_text_class}->[1], 'reisen', 'TextClass');
+is($meta->{K_text_class}->[2], 'wissenschaft', 'TextClass');
+is($meta->{K_text_class}->[3], 'populaerwissenschaft', 'TextClass');
+ok(!$meta->{K_text_class}->[4], 'TextClass');
+is($meta->{T_author}, 'Ruru; Jens.Ol; Aglarech; u.a.', 'author');
+
+# Load the tokenizer module
+use_ok('KorAP::XML::Tokenizer');
+# Build a tokenizer for the OpenNLP/Tokens layer of the parsed document
+ok(my $tokens = KorAP::XML::Tokenizer->new(
+  path => $doc->path,
+  doc => $doc,
+  foundry => 'OpenNLP',
+  layer => 'Tokens',
+  name => 'tokens'
+), 'New Tokenizer');
+ok($tokens->parse, 'Parse');
+
+like($tokens->path, qr!\Q$path\E/$!, 'Path');
+is($tokens->foundry, 'OpenNLP', 'Foundry');
+is($tokens->doc->text_sigle, 'WPD/AAA/00001', 'Doc id');
+is($tokens->should, 1068, 'Should');
+is($tokens->have, 923, 'Have');
+is($tokens->name, 'tokens', 'Name');
+is($tokens->layer, 'Tokens', 'Layer');
+
+
+is_deeply(_t2h($tokens->stream->pos(118)->to_string),
+   _t2h('[(763-768)s:Linie|i:linie|_118$<i>763<i>768]'),
+   'Token is correct');
+
+# Add Mate morphology; the token at position 118 is expected to gain mate/* terms
+ok($tokens->add('Mate', 'Morpho'), 'Add Mate');
+
+
+is_deeply(
+  _t2h($tokens->stream->pos(118)->to_string),
+  _t2h('[(763-768)s:Linie|i:linie|_118$<i>763<i>768|mate/l:linie|mate/p:NN|mate/m:case:acc|mate/m:number:sg|mate/m:gender:fem]'),
+  'with Mate');
+
+# Add Base sentences; the first token is expected to gain base/s terms
+ok($tokens->add('Base', 'Sentences'), 'Add Sentences');
+
+is_deeply(
+  _t2h($tokens->stream->pos(0)->to_string),
+  _t2h('[(0-1)s:A|i:a|_0$<i>0<i>1|-:tokens$<i>923|mate/p:XY|<>:base/s:s$<b>64<i>0<i>74<i>13<b>2|<>:base/s:t$<b>64<i>0<i>6083<i>923<b>0|-:base/sentences$<i>96]'),
+  'Startinfo'
+);
+
+foreach (@layers) {  # Add every layer listed in @layers, one test per add() call
+  ok($tokens->add(@$_), 'Add '. join(', ', @$_));
+};
+
+my $s =  # Expected terms of the first token after all layers have been added
+  '[(0-1)s:A|i:a|_0$<i>0<i>1|'.
+  '-:tokens$<i>923|'.
+  'mate/p:XY|'.
+  '<>:base/s:s$<b>64<i>0<i>74<i>13<b>2|'.
+  '<>:base/s:t$<b>64<i>0<i>6083<i>923<b>0|'.
+  '-:base/sentences$<i>96|'.
+  '<>:base/s:p$<b>64<i>0<i>224<i>34<b>1|'.
+  '-:base/paragraphs$<i>76|'.
+  'opennlp/p:NE|' .
+  '<>:opennlp/s:s$<b>64<i>0<i>74<i>13<b>0|'.
+  '-:opennlp/sentences$<i>50|'.
+  '<>:corenlp/s:s$<b>64<i>0<i>6<i>2<b>0|'.
+  '-:corenlp/sentences$<i>67|'.
+  'cnx/l:A|'.
+  'cnx/p:N|'.
+  'cnx/syn:@NH|'.
+  '<>:cnx/c:np$<b>64<i>0<i>1<i>1<b>0|'.
+  '<>:cnx/s:s$<b>64<i>0<i>74<i>13<b>0|'.
+  '-:cnx/sentences$<i>63|'.
+#  'tt/l:A|'.
+  'tt/p:NN$<b>129<b>199|'.
+  'tt/l:A$<b>129<b>253|'.
+  'tt/p:FM$<b>129<b>54|'.
+  '<>:tt/s:s$<b>64<i>0<i>6083<i>923<b>0|'.
+  '-:tt/sentences$<i>1|'.
+  '>:mate/d:PNC$<b>32<i>2|' . #<s>0<s>0|'.
+#  'mate/d:&&&$<b>128<s>1|'.
+  'xip/p:SYMBOL|'.
+  'xip/l:A|'.
+  '<>:xip/c:TOP$<b>64<i>0<i>74<i>13<b>0|'.
+  '<>:xip/c:MC$<b>64<i>0<i>73<i>13<b>1|'.
+  '<>:xip/c:NP$<b>64<i>0<i>1<i>1<b>2|'.
+  '<>:xip/c:NPA$<b>64<i>0<i>1<i>1<b>3|'.
+  '<>:xip/c:NOUN$<b>64<i>0<i>1<i>1<b>4|'.
+  '<>:xip/c:SYMBOL$<b>64<i>0<i>1<i>1<b>5|'.
+#  '>:xip/d:SUBJ$<i>3|'.
+#  '<:xip/d:COORD$<i>1|'.
+  '<>:xip/s:s$<b>64<i>0<i>74<i>13<b>0|'.
+  '-:xip/sentences$<i>65]';
+
+{
+  local $SIG{__WARN__} = sub {};  # Silence warnings for the duration of this block only
+  is_deeply(
+    _t2h($tokens->stream->pos(0)->to_string),
+    _t2h($s),
+    'Startinfo');
+};
+
+
+is($tokens->layer_info,
+   'base/s=spans cnx/c=spans cnx/l=tokens cnx/m=tokens cnx/p=tokens cnx/s=spans cnx/syn=tokens corenlp/ne=tokens corenlp/s=spans mate/d=rels mate/l=tokens mate/m=tokens mate/p=tokens opennlp/p=tokens opennlp/s=spans tt/l=tokens tt/p=tokens tt/s=spans xip/c=spans xip/l=tokens xip/p=tokens xip/s=spans', 'Layer info');
+# xip/d=rels is not part of the expected value (XIP/Dependency is commented out in @layers)
+
+is($tokens->support, 'base base/paragraphs base/sentences connexor connexor/morpho connexor/phrase connexor/sentences connexor/syntax corenlp corenlp/namedentities corenlp/namedentities corenlp/namedentities/ne_dewac_175m_600 corenlp/namedentities/ne_hgc_175m_600 corenlp/sentences mate mate/dependency mate/morpho opennlp opennlp/morpho opennlp/sentences treetagger treetagger/morpho treetagger/sentences xip xip/constituency xip/morpho xip/sentences', 'Support');
+# xip/dependency is likewise not part of the expected value
+done_testing;
+__END__
commit	414ec9567c67cf1648e447bc9c47c351ae5ae34a	[log] [tgz]
author	Akron <nils@diewald-online.de>	Mon Aug 03 15:48:43 2020 +0200
committer	Akron <nils@diewald-online.de>	Tue Aug 04 08:55:32 2020 +0200
tree	ef4311389b9abcf8811d24664d36988d3144000f
parent	fab17d3e0c697969e4e228f5d0bfccbbc7130703 [diff] [blame]