Add pti to base foundry spans (paragraphs, sentences, text)
Change-Id: Ibdc100f05bdd13862cd3d6232978133d9f498245
diff --git a/lib/KorAP/Field/MultiTerm.pm b/lib/KorAP/Field/MultiTerm.pm
index 009e67e..ddd6880 100644
--- a/lib/KorAP/Field/MultiTerm.pm
+++ b/lib/KorAP/Field/MultiTerm.pm
@@ -104,36 +104,35 @@
sub to_string {
my $string = _escape_term($_[0]->[5]);
- $string .= '$';
+ my $pre;
# PTI
- $string .= '<b>' .
- ($_[0]->[10] ? $_[0]->[10] : '???');
+ $pre .= '<b>' . $_[0]->[10] if $_[0]->[10];
# Offsets
if (defined $_[0]->[3]) {
- $string .= '<i>' .$_[0]->[3] .
+ $pre .= '<i>' .$_[0]->[3] .
'<i>' . $_[0]->[4];
};
- my $pl = $_[0]->[1] ?
- $_[0]->[1] - 1 : $_[0]->[0];
+# my $pl = $_[0]->[1] ?
+# $_[0]->[1] - 1 : $_[0]->[0];
if ($_[0]->[2] || $_[0]->[0]) {
if ($_[0]->[2]) {
- $string .= '<i>' . $_[0]->[2];
+ $pre .= '<i>' . $_[0]->[2];
};
if ($_[0]->[0]) {
if (index($_[0]->[0], '<') == 0) {
- $string .= $_[0]->[0];
+ $pre .= $_[0]->[0];
}
else {
- $string .= '<?>' . $_[0]->[0];
+ $pre .= '<?>' . $_[0]->[0];
};
};
};
- $string;
+ $string . ($pre ? '$' . $pre : '');
};
diff --git a/lib/KorAP/Index/Base/Paragraphs.pm b/lib/KorAP/Index/Base/Paragraphs.pm
index 5046c4c..1cb2e99 100644
--- a/lib/KorAP/Index/Base/Paragraphs.pm
+++ b/lib/KorAP/Index/Base/Paragraphs.pm
@@ -4,30 +4,35 @@
sub parse {
my $self = shift;
my $i = 0;
+
$$self->add_spandata(
foundry => 'base',
- layer => 'struct', # formerly paragraph
+ layer => 'paragraph',
cb => sub {
my ($stream, $span) = @_;
my $mtt = $stream->pos($span->p_start);
+
$mtt->add(
term => '<>:base/s:p',
o_start => $span->o_start,
o_end => $span->o_end,
p_end => $span->p_end,
- payload => '<b>1'
+ payload => '<b>1',
+ pti => 64
);
$i++;
}
) or return;
+ # Add meta information about the number of paragraphs
$$self->stream->add_meta('base/paragraphs', '<i>' . $i);
return 1;
};
+
sub layer_info {
- ['base/s=spans'];
+ ['base/s=spans'];
};
diff --git a/lib/KorAP/Index/Base/Sentences.pm b/lib/KorAP/Index/Base/Sentences.pm
index edeef43..449f331 100644
--- a/lib/KorAP/Index/Base/Sentences.pm
+++ b/lib/KorAP/Index/Base/Sentences.pm
@@ -9,7 +9,7 @@
$$self->add_spandata(
foundry => 'base',
- layer => 'struct', # formerly sentence
+ layer => 'sentences',
cb => sub {
my ($stream, $span) = @_;
my $mtt = $stream->pos($span->p_start);
@@ -19,7 +19,8 @@
o_start => $span->o_start,
o_end => $span->o_end,
p_end => $span->p_end,
- payload => '<b>2'
+ payload => '<b>2',
+ pti => 64
);
$last_p = $span->p_end;
$last_o = $span->o_end;
@@ -33,7 +34,8 @@
o_start => $first->[1],
p_end => $last_p,
o_end => $last_o,
- payload => '<b>0'
+ payload => '<b>0',
+ pti => 64
);
$$self->stream->add_meta('base/sentences', '<i>' . $i);
@@ -42,7 +44,7 @@
};
sub layer_info {
- ['base/s=spans'];
+ ['base/s=spans'];
};
1;
diff --git a/lib/KorAP/Tokenizer.pm b/lib/KorAP/Tokenizer.pm
index b609223..7e26f32 100644
--- a/lib/KorAP/Tokenizer.pm
+++ b/lib/KorAP/Tokenizer.pm
@@ -377,6 +377,9 @@
# Get layerinfo
$self->layer_info($mod->layer_info);
return $retval;
+ }
+ else {
+ $self->log->error('Unable to parse '.$mod);
};
}
else {
@@ -514,7 +517,7 @@
$tokens{layerInfos} = $self->layer_info;
$data{data} = \%tokens;
- $data{version} = '0.02';
+ $data{version} = '0.03';
};
\%data;
diff --git a/t/index/base_paragraphs.t b/t/index/base_paragraphs.t
index 2abc198..f2dcfa9 100644
--- a/t/index/base_paragraphs.t
+++ b/t/index/base_paragraphs.t
@@ -4,6 +4,7 @@
use utf8;
use Test::More;
use Scalar::Util qw/weaken/;
+use Data::Dumper;
use_ok('KorAP::Document');
@@ -36,29 +37,14 @@
ok($tokens->add('Base', 'Paragraphs'), 'Add Structure');
+my $data = $tokens->to_data->{data};
+
+like($data->{foundries}, qr!base/paragraphs!, 'data');
+is($data->{stream}->[0]->[0], '-:base/paragraphs$<i>1', 'Number of paragraphs');
+is($data->{stream}->[0]->[1], '-:tokens$<i>18', 'Number of tokens');
+is($data->{stream}->[0]->[2], '<>:base/s:p$<b>64<i>0<i>129<i>17<b>1', 'Paragraph');
+is($data->{stream}->[0]->[3], '_0$<i>0<i>3', 'Position');
done_testing;
__END__
-
-
-
-
-
-done_testing;
-__END__
-
-
-sub new_tokenizer {
- my $x = $doc;
- weaken $x;
- return KorAP::Tokenizer->new(
- path => $x->path,
- doc => $x,
- foundry => 'DeReKo',
- layer => 'Structure',
- name => 'spans'
- )
-};
-
-__END__
diff --git a/t/index/base_sentences.t b/t/index/base_sentences.t
new file mode 100644
index 0000000..45c7ab3
--- /dev/null
+++ b/t/index/base_sentences.t
@@ -0,0 +1,51 @@
+#!/usr/bin/env perl
+# Checks the serialized token stream after adding the base/sentences layer.
+use strict;
+use warnings;
+use utf8;
+use Test::More;
+use Scalar::Util qw/weaken/;
+use Data::Dumper;
+
+use_ok('KorAP::Document');
+
+use File::Basename 'dirname';
+use File::Spec::Functions 'catdir';
+
+# Fixture document located relative to this test file
+my $path = catdir(dirname(__FILE__), 'corpus', 'doc', 'text');
+
+ok(my $doc = KorAP::Document->new(path => $path . '/'), 'Load KorAP::Document');
+
+# Quote the path so that regex metacharacters in it are matched literally
+like($doc->path, qr!\Q$path\E/$!, 'Path');
+ok($doc->parse, 'Parse document');
+
+ok($doc->primary->data, 'Primary data in existence');
+is($doc->primary->data_length, 129, 'Data length');
+
+use_ok('KorAP::Tokenizer');
+
+ok(my $tokens = KorAP::Tokenizer->new(
+  path => $doc->path,
+  doc => $doc,
+  foundry => 'OpenNLP',
+  layer => 'Tokens',
+  name => 'tokens'
+), 'New Tokenizer');
+
+ok($tokens->parse, 'Parse');
+
+ok($tokens->add('Base', 'Sentences'), 'Add Structure');
+
+my $data = $tokens->to_data->{data};
+
+like($data->{foundries}, qr!base/sentences!, 'Foundries include base/sentences');
+is($data->{stream}->[0]->[0], '-:base/sentences$<i>1', 'Number of sentences');
+is($data->{stream}->[0]->[1], '-:tokens$<i>18', 'Number of tokens');
+is($data->{stream}->[0]->[2], '<>:base/s:t$<b>64<i>0<i>129<i>17<b>0', 'Text');
+is($data->{stream}->[0]->[3], '<>:base/s:s$<b>64<i>0<i>129<i>17<b>2', 'Sentence');
+is($data->{stream}->[0]->[4], '_0$<i>0<i>3', 'Position');
+
+done_testing;
+__END__