Fixed sorting
Change-Id: Ifdd820f8788271bb94047367117bf573fad62136
diff --git a/MANIFEST b/MANIFEST
index b37db9c..227d839 100755
--- a/MANIFEST
+++ b/MANIFEST
@@ -31,6 +31,8 @@
lib/KorAP/XML/Index/OpenNLP/Morpho.pm
lib/KorAP/XML/Index/OpenNLP/Sentences.pm
lib/KorAP/XML/Index/OpenNLP/Morpho.pm
+lib/KorAP/XML/Index/Sgbr/Morpho.pm
+lib/KorAP/XML/Index/Sgbr/Lemma.pm
lib/KorAP/XML/Index/TreeTagger/Morpho.pm
lib/KorAP/XML/Index/TreeTagger/Sentences.pm
lib/KorAP/XML/Index/XIP/Constituency.pm
@@ -636,7 +638,9 @@
t/corpus/WDD/G27/38989/mate/pipeline/one_token_per_line.txt
t/corpus/WDD/G27/38989/mate/pipeline/parsed.txt
t/corpus/WDD/G27/38989/mate/tokenSpans/number_tokenSpans.xml
+script/korapxml2krill
+script/korapxml2krill_dir
Changes
-MANIFEST
LICENSE
-Makefile.PL
\ No newline at end of file
+Makefile.PL
+MANIFEST
\ No newline at end of file
diff --git a/lib/KorAP/XML/Field/MultiTermToken.pm b/lib/KorAP/XML/Field/MultiTermToken.pm
index 5019e2f..52c3d20 100644
--- a/lib/KorAP/XML/Field/MultiTermToken.pm
+++ b/lib/KorAP/XML/Field/MultiTermToken.pm
@@ -121,10 +121,7 @@
# Order attributes by reference id
if (index($a->[5], '@:') == 0 && index($b->[5], '@:') == 0) {
-
-# use Data::Dumper;
-# die Dumper $a;
-
+ # Check TUI
my ($a_id) = ($a->[0] =~ m/^<s>(\d+)/);
my ($b_id) = ($b->[0] =~ m/^<s>(\d+)/);
if ($a_id > $b_id) {
@@ -198,8 +195,8 @@
# Check depth
else {
- my ($a_depth) = ($a->[0] ? $a->[0] =~ m/<b>(\d+)$/ : 0);
- my ($b_depth) = ($b->[0] ? $b->[0] =~ m/<b>(\d+)$/ : 0);
+ my ($a_depth) = ($a->[0] ? $a->[0] =~ m/<b>(\d+)(?:<s>\d+)?$/ : 0);
+ my ($b_depth) = ($b->[0] ? $b->[0] =~ m/<b>(\d+)(?:<s>\d+)?$/ : 0);
$a_depth //= 0;
$b_depth //= 0;
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 4453274..b2af20c 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -456,40 +456,6 @@
$self->publisher($publisher->all_text) if $publisher->all_text;
};
-# my $mono = $dom->at('monogr');
-# if ($mono) {
-#
-# # Get title, subtitle, author, editor
-# my $title = $mono->at('h\.title[type=main]');
-# my $sub_title = $mono->at('h\.title[type=sub]');
-# my $author = $mono->at('h\.author');
-# my $editor = $mono->at('editor');
-#
-# $title = $title ? $title->all_text : undef;
-# $sub_title = $sub_title ? $sub_title->all_text : undef;
-# $author = $author ? $author->all_text : undef;
-# $editor = $editor ? $editor->all_text : undef;
-#
-# if ($type eq 'text') {
-# $self->title($title) if $title && !$self->title;
-# $self->sub_title($sub_title) if $sub_title && !$self->sub_title;
-# $self->editor($editor) if $editor && !$self->editor;
-# $self->author($author) if $author && !$self->author;
-# }
-# elsif ($type eq 'doc') {
-# $self->doc_title($title) if $title && !$self->doc_title;
-# $self->doc_sub_title($sub_title) if $sub_title && !$self->doc_sub_title;
-# $self->doc_author($author) if $author && !$self->doc_author;
-# $self->doc_editor($editor) if $editor && !$self->doc_editor;
-# }
-# elsif ($type eq 'corpus') {
-# $self->corpus_title($title) if $title && !$self->corpus_title;
-# $self->corpus_sub_title($sub_title) if $sub_title && !$self->corpus_sub_title;
-# $self->corpus_author($author) if $author && !$self->corpus_author;
-# $self->corpus_editor($editor) if $editor && !$self->corpus_editor;
-# };
-# };
-
# Get text type
my $text_desc = $dom->at('textDesc');
@@ -640,19 +606,8 @@
};
};
-# if ($self->author) {
-# foreach (@{$self->author}) {
-# $_ =~ s/\n/ /g;
-# $_ =~ s/\s\s+/ /g;
-# $string .= 'author = ' . $_ . "\n";
-# };
-# };
-
- if ($self->text_class) {
- foreach (@{$self->text_class}) {
- $string .= 'text_class = ' . $_ . "\n";
- };
- };
+ $string .= 'text_class = ' . $self->text_class_string . "\n";
+ $string .= 'keywords = ' . $self->keywords_string . "\n";
return $string;
};
@@ -697,6 +652,17 @@
$hash->{version} = '0.04';
};
+sub to_json {
+ my $self = shift;
+ unless ($self->{tokenizer}) {
+ $self->log->warn('No tokenizer defined');
+ return;
+ };
+
+ return $self->{tokenizer}->to_json;
+};
+
+
1;
@@ -704,52 +670,34 @@
=pod
+=encoding utf8
+
=head1 NAME
-KorAP::XML::Krill
+KorAP::XML::Krill - Preprocess KorAP XML documents for Krill
=head1 SYNOPSIS
+ # Create Converter Object
my $doc = KorAP::XML::Krill->new(
path => 'mydoc-1/'
);
- $doc->parse;
-
- print $doc->title;
+  # Convert to Krill JSON
+ print $doc->parse->tokenize->annotate('Mate', 'Morpho')->to_json;
=head1 DESCRIPTION
-Parse the primary and meta data of a document.
+Parse the primary and meta data of a KorAP-XML document.
-=head2 ATTRIBUTES
+=head1 ATTRIBUTES
-=head2 text_sigle
+=head2 log
- $doc->text_sigle(75476);
- print $doc->text_sigle;
-
-The unique identifier of the text.
-
-
-=head2 doc_sigle
-
- $doc->doc_sigle(75476);
- print $doc->doc_sigle;
-
-The unique identifier of the document.
-
-
-=head2 corpus_sigle
-
- $doc->corpus_sigle(4);
- print $doc->corpus_sigle;
-
-The unique identifier of the corpus.
-
+L<Log::Log4perl> object for logging.
=head2 path
@@ -759,39 +707,6 @@
The path of the document.
-=head2 title
-
- $doc->title("Der Name der Rose");
- print $doc->title;
-
-The title of the document.
-
-
-=head2 sub_title
-
- $doc->sub_title("Natürlich eine Handschrift");
- print $doc->sub_title;
-
-The title of the document.
-
-
-=head2 pub_place
-
- $doc->pub_place("Rom");
- print $doc->pub_place;
-
-The publication place of the document.
-
-
-=head2 pub_date
-
- $doc->pub_place("19800404");
- print $doc->pub_place;
-
-The publication date of the document,
-in the format "YYYYMMDD".
-
-
=head2 primary
print $doc->primary->data(0,20);
@@ -799,80 +714,50 @@
The L<KorAP::XML::Document::Primary> object containing the primary data.
-=head2 author
-
- $doc->author('Binks, Jar Jar; Luke Skywalker');
- print $doc->author->[0];
-
-Set the author value as semikolon separated list of names or
-get an array reference of author names.
-
-=head2 text_class
-
- $doc->text_class(qw/news sports/);
- print $doc->text_class->[0];
-
-Set the text class as an array or get an array
-reference of text classes.
-
-
=head1 METHODS
+=head2 annotate
+
+  $doc->annotate('Mate', 'Morpho');
+
+Add an annotation layer (given as foundry and layer) to the conversion process.
+
+
=head2 parse
- $doc->parse;
+ $doc = $doc->parse;
-Run the parsing process of the document
+Run the meta parsing process of the document.
+=head2 tokenize
+
+ $doc = $doc->tokenize('OpenNLP', 'Tokens');
+
+Accept the tokenization based on a given foundry and a given layer.
+
+
+=head1 AVAILABILITY
+
+ https://github.com/KorAP/KorAP-XML-Krill
+
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
+Author: L<Nils Diewald|http://nils-diewald.de/>
+
+KorAP::XML::Krill is developed as part of the
+L<KorAP|http://korap.ids-mannheim.de/>
+Corpus Analysis Platform at the
+L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
+member of the
+L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>
+and supported by the L<KobRA|http://www.kobra.tu-dortmund.de> project,
+funded by the
+L<Federal Ministry of Education and Research (BMBF)|http://www.bmbf.de/en/>.
+
+KorAP::XML::Krill is free software published under the
+L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
+
=cut
-
-
-Deal with:
- <attribute name="info">
- <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">kind of
- information expressed by the given layer of annotation (there may, and often will, be
- more than one)</documentation>
- <list>
- <oneOrMore>
- <choice>
- <value type="NCName">pos</value>
- <value type="NCName">lemma</value>
- <value type="NCName">msd</value>
- <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'msd' is
- the traditional abbreviation for "morphosyntactic description", listing info on
- e.g. tense, person, case, etc.</documentation>
- <value type="NCName">dep</value>
- <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'dep' is
- information about types of relations, used in dependency-style annotations; it is
- an indication for the visualiser that word-to-word relationships should be
- displayed</documentation>
- <value type="NCName">lbl</value>
- <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'lbl'
- indicates the presence of labels over dependency relations</documentation>
- <value type="NCName">const</value>
- <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'const'
- stands for 'constituency' or hierarchical, tree-based annotations; it is an
- indication for the visualiser that it should display syntactic
- trees</documentation>
- <value type="NCName">cat</value>
- <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'cat' is
- used for syntactic categories, as separate from pos; note that these sets need not
- be disjoint (at the lexical level, they usually overlap), but the frontend prefers
- to keep them separate. 'cat' will be found in the context of chunking or
- hierarchical parsing and will characterise nodes; it may also be found in
- dependency annotations, to indicate labels on nodes, as opposed to labels on arcs
- (the latter are signalled by 'lbl')</documentation>
- <value type="NCName">struct</value>
- <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">all
- non-linguistic information (headers, highlights, etc.)</documentation>
- <value type="NCName">frag</value>
- <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0"
- >non-exhaustive coverage (when spanList/@fragmented="true")</documentation>
- <value type="NCName">ne</value>
- <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">named
- entities</documentation>
- </choice>
- </oneOrMore>
- </list>
- </attribute>
diff --git a/t/index/dereko_struct.t b/t/index/dereko_struct.t
index 1c9ae07..f6f6f5e 100644
--- a/t/index/dereko_struct.t
+++ b/t/index/dereko_struct.t
@@ -43,7 +43,6 @@
'@:dereko/s:pattern:text$<b>17<s>2',
'Attribute of idsHeader');
-
is($data->{stream}->[4]->[1],
'<>:dereko/s:s$<b>64<i>32<i>42<i>6<b>6<s>1',
'Sentence span');
diff --git a/t/sort_tokens.t b/t/sort_tokens.t
index 1cbfd26..fb9ef7d 100644
--- a/t/sort_tokens.t
+++ b/t/sort_tokens.t
@@ -46,9 +46,9 @@
is($mtt->to_string,
'[(0-5)<>:b=N$<i>0<i>5<i>5|'.
- '<>:f=N$<i>0<i>5<i>6<b>5<b>122|'.
- '<>:e=ADJ$<i>0<i>5<i>6<b>6|'.
- '<>:d=N$<i>0<i>5<i>6<b>7|'.
+ '<>:e=ADJ$<i>0<i>5<i>6<b>6|'.
+ '<>:d=N$<i>0<i>5<i>6<b>7|'.
+ '<>:f=N$<i>0<i>5<i>6<b>5<b>122|'.
'@:i=N$<s>3|'.
'@:h=N$<s>5|'.
'@:j=N$<s>8|'.
diff --git a/t/transform.t b/t/transform.t
index b2620f3..379595e 100644
--- a/t/transform.t
+++ b/t/transform.t
@@ -25,7 +25,7 @@
};
my @layers;
-push(@layers, ['Base', 'Sentences']);
+# push(@layers, ['Base', 'Sentences']);
push(@layers, ['Base', 'Paragraphs']);
# OpenNLP
@@ -35,17 +35,16 @@
# CoreNLP
push(@layers, ['CoreNLP', 'NamedEntities', 'ne_dewac_175m_600']);
push(@layers, ['CoreNLP', 'NamedEntities', 'ne_hgc_175m_600']);
-push(@layers, ['CoreNLP', 'NamedEntities']);
push(@layers, ['CoreNLP', 'Sentences']);
-push(@layers, ['DeReKo', 'Structure']);
+# push(@layers, ['DeReKo', 'Structure']);
-push(@layers, ['Glemm', 'Morpho']);
+# push(@layers, ['Glemm', 'Morpho']);
-push(@layers, ['Mate', 'Morpho']);
+# push(@layers, ['Mate', 'Morpho']);
push(@layers, ['Mate', 'Dependency']);
-push(@layers, ['Malt', 'Dependency']);
+# push(@layers, ['Malt', 'Dependency']);
# Connexor
push(@layers, ['Connexor', 'Morpho']);
@@ -53,13 +52,16 @@
push(@layers, ['Connexor', 'Phrase']);
push(@layers, ['Connexor', 'Sentences']);
+
+# TODO: OpenNLP
+
# TreeTagger
push(@layers, ['TreeTagger', 'Morpho']);
push(@layers, ['TreeTagger', 'Sentences']);
# Mate
-push(@layers, ['Mate', 'Morpho']);
-push(@layers, ['Mate', 'Dependency']);
+#push(@layers, ['Mate', 'Morpho']);
+#push(@layers, ['Mate', 'Dependency']);
# XIP
push(@layers, ['XIP', 'Morpho']);