Fixed sorting

Change-Id: Ifdd820f8788271bb94047367117bf573fad62136

commit: 2ea505a48601d578b2fefa6160928cac49f5a839 [log] [tgz]
author: Akron <nils@diewald-online.de> Fri Feb 05 20:49:03 2016 +0100
committer: Akron <nils@diewald-online.de> Fri Feb 05 20:49:03 2016 +0100
tree: 185300b3a4adb34ee829ffa252bf19a49f4194fe
parent: 9a04c714a9a068e8734ffe1c408ab5b7529049fa [diff]
diff --git a/MANIFEST b/MANIFEST
index b37db9c..227d839 100755
--- a/MANIFEST
+++ b/MANIFEST

@@ -31,6 +31,8 @@
 lib/KorAP/XML/Index/OpenNLP/Morpho.pm
 lib/KorAP/XML/Index/OpenNLP/Sentences.pm
 lib/KorAP/XML/Index/OpenNLP/Morpho.pm
+lib/KorAP/XML/Index/Sgbr/Morpho.pm
+lib/KorAP/XML/Index/Sgbr/Lemma.pm
 lib/KorAP/XML/Index/TreeTagger/Morpho.pm
 lib/KorAP/XML/Index/TreeTagger/Sentences.pm
 lib/KorAP/XML/Index/XIP/Constituency.pm
@@ -636,7 +638,9 @@
 t/corpus/WDD/G27/38989/mate/pipeline/one_token_per_line.txt
 t/corpus/WDD/G27/38989/mate/pipeline/parsed.txt
 t/corpus/WDD/G27/38989/mate/tokenSpans/number_tokenSpans.xml
+script/korapxml2krill
+script/korapxml2krill_dir
 Changes
-MANIFEST
 LICENSE
-Makefile.PL
\ No newline at end of file
+Makefile.PL
+MANIFEST
\ No newline at end of file

diff --git a/lib/KorAP/XML/Field/MultiTermToken.pm b/lib/KorAP/XML/Field/MultiTermToken.pm
index 5019e2f..52c3d20 100644
--- a/lib/KorAP/XML/Field/MultiTermToken.pm
+++ b/lib/KorAP/XML/Field/MultiTermToken.pm

@@ -121,10 +121,7 @@
     # Order attributes by reference id
     if (index($a->[5], '@:') == 0 && index($b->[5], '@:') == 0) {
 
-
-#      use Data::Dumper;
-#      die Dumper $a;
-
+      # Check TUI
       my ($a_id) = ($a->[0] =~ m/^<s>(\d+)/);
       my ($b_id) = ($b->[0] =~ m/^<s>(\d+)/);
       if ($a_id > $b_id) {
@@ -198,8 +195,8 @@
 
     # Check depth
     else {
-      my ($a_depth) = ($a->[0] ? $a->[0] =~ m/<b>(\d+)$/ : 0);
-      my ($b_depth) = ($b->[0] ? $b->[0] =~ m/<b>(\d+)$/ : 0);
+      my ($a_depth) = ($a->[0] ? $a->[0] =~ m/<b>(\d+)(?:<s>\d+)?$/ : 0);
+      my ($b_depth) = ($b->[0] ? $b->[0] =~ m/<b>(\d+)(?:<s>\d+)?$/ : 0);
 
       $a_depth //= 0;
       $b_depth //= 0;

diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 4453274..b2af20c 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm

@@ -456,40 +456,6 @@
     $self->publisher($publisher->all_text) if $publisher->all_text;
   };
 
-#  my $mono = $dom->at('monogr');
-#  if ($mono) {
-#
-#    # Get title, subtitle, author, editor
-#    my $title     = $mono->at('h\.title[type=main]');
-#    my $sub_title = $mono->at('h\.title[type=sub]');
-#    my $author    = $mono->at('h\.author');
-#    my $editor    = $mono->at('editor');
-#
-#    $title     = $title     ? $title->all_text     : undef;
-#    $sub_title = $sub_title ? $sub_title->all_text : undef;
-#    $author    = $author    ? $author->all_text    : undef;
-#    $editor    = $editor    ? $editor->all_text    : undef;
-#
-#    if ($type eq 'text') {
-#      $self->title($title)         if $title && !$self->title;
-#      $self->sub_title($sub_title) if $sub_title && !$self->sub_title;
-#      $self->editor($editor)       if $editor && !$self->editor;
-#      $self->author($author)       if $author && !$self->author;
-#    }
-#    elsif ($type eq 'doc') {
-#      $self->doc_title($title)         if $title && !$self->doc_title;
-#      $self->doc_sub_title($sub_title) if $sub_title && !$self->doc_sub_title;
-#      $self->doc_author($author)       if $author && !$self->doc_author;
-#      $self->doc_editor($editor)       if $editor && !$self->doc_editor;
-#    }
-#    elsif ($type eq 'corpus') {
-#      $self->corpus_title($title)         if $title && !$self->corpus_title;
-#      $self->corpus_sub_title($sub_title) if $sub_title && !$self->corpus_sub_title;
-#      $self->corpus_author($author)       if $author && !$self->corpus_author;
-#      $self->corpus_editor($editor)       if $editor && !$self->corpus_editor;
-#    };
-#  };
-
   # Get text type
   my $text_desc = $dom->at('textDesc');
 
@@ -640,19 +606,8 @@
     };
   };
 
-#  if ($self->author) {
-#    foreach (@{$self->author}) {
-#      $_ =~ s/\n/ /g;
-#      $_ =~ s/\s\s+/ /g;
-#      $string .= 'author = ' . $_ . "\n";
-#    };
-#  };
-
-  if ($self->text_class) {
-    foreach (@{$self->text_class}) {
-      $string .= 'text_class = ' . $_ . "\n";
-    };
-  };
+  $string .= 'text_class = ' . $self->text_class_string . "\n";
+  $string .= 'keywords = ' . $self->keywords_string . "\n";
 
   return $string;
 };
@@ -697,6 +652,17 @@
   $hash->{version} = '0.04';
 };
 
+sub to_json {
+  my $self = shift;
+  unless ($self->{tokenizer}) {
+    $self->log->warn('No tokenizer defined');
+    return;
+  };
+
+  return $self->{tokenizer}->to_json;
+};
+
+
 1;
 
 
@@ -704,52 +670,34 @@
 
 =pod
 
+=encoding utf8
+
 =head1 NAME
 
-KorAP::XML::Krill
+KorAP::XML::Krill - Preprocess KorAP XML documents for Krill
 
 
 =head1 SYNOPSIS
 
+  # Create Converter Object
   my $doc = KorAP::XML::Krill->new(
     path => 'mydoc-1/'
   );
 
-  $doc->parse;
-
-  print $doc->title;
+  # Convert to Krill JSON
+  print $doc->parse->tokenize->annotate('Mate', 'Morpho')->to_json;
 
 
 =head1 DESCRIPTION
 
-Parse the primary and meta data of a document.
+Parse the primary and meta data of a KorAP-XML document.
 
 
-=head2 ATTRIBUTES
+=head1 ATTRIBUTES
 
-=head2 text_sigle
+=head2 log
 
-  $doc->text_sigle(75476);
-  print $doc->text_sigle;
-
-The unique identifier of the text.
-
-
-=head2 doc_sigle
-
-  $doc->doc_sigle(75476);
-  print $doc->doc_sigle;
-
-The unique identifier of the document.
-
-
-=head2 corpus_sigle
-
-  $doc->corpus_sigle(4);
-  print $doc->corpus_sigle;
-
-The unique identifier of the corpus.
-
+L<Log::Log4perl> object for logging.
 
 =head2 path
 
@@ -759,39 +707,6 @@
 The path of the document.
 
 
-=head2 title
-
-  $doc->title("Der Name der Rose");
-  print $doc->title;
-
-The title of the document.
-
-
-=head2 sub_title
-
-  $doc->sub_title("Natürlich eine Handschrift");
-  print $doc->sub_title;
-
-The title of the document.
-
-
-=head2 pub_place
-
-  $doc->pub_place("Rom");
-  print $doc->pub_place;
-
-The publication place of the document.
-
-
-=head2 pub_date
-
-  $doc->pub_place("19800404");
-  print $doc->pub_place;
-
-The publication date of the document,
-in the format "YYYYMMDD".
-
-
 =head2 primary
 
   print $doc->primary->data(0,20);
@@ -799,80 +714,50 @@
 The L<KorAP::XML::Document::Primary> object containing the primary data.
 
 
-=head2 author
-
-  $doc->author('Binks, Jar Jar; Luke Skywalker');
-  print $doc->author->[0];
-
-Set the author value as semikolon separated list of names or
-get an array reference of author names.
-
-=head2 text_class
-
-  $doc->text_class(qw/news sports/);
-  print $doc->text_class->[0];
-
-Set the text class as an array or get an array
-reference of text classes.
-
-
 =head1 METHODS
 
+=head2 annotate
+
+  $doc->annotate('Mate', 'Morpho');
+
+Add an annotation layer to the conversion process.
+
+
 =head2 parse
 
-  $doc->parse;
+  $doc = $doc->parse;
 
-Run the parsing process of the document
+Run the meta parsing process of the document.
 
 
+=head2 tokenize
+
+  $doc = $doc->tokenize('OpenNLP', 'Tokens');
+
+Accept the tokenization based on a given foundry and a given layer.
+
+
+=head1 AVAILABILITY
+
+  https://github.com/KorAP/KorAP-XML-Krill
+
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
+Author: L<Nils Diewald|http://nils-diewald.de/>
+
+KorAP::XML::Krill is developed as part of the
+L<KorAP|http://korap.ids-mannheim.de/>
+Corpus Analysis Platform at the
+L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
+member of the
+L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>
+and supported by the L<KobRA|http://www.kobra.tu-dortmund.de> project,
+funded by the
+L<Federal Ministry of Education and Research (BMBF)|http://www.bmbf.de/en/>.
+
+KorAP::XML::Krill is free software published under the
+L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
+
 =cut
-
-
-Deal with:
-        <attribute name="info">
-          <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">kind of
-            information expressed by the given layer of annotation (there may, and often will, be
-            more than one)</documentation>
-          <list>
-            <oneOrMore>
-              <choice>
-                <value type="NCName">pos</value>
-                <value type="NCName">lemma</value>
-                <value type="NCName">msd</value>
-                <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'msd' is
-                  the traditional abbreviation for "morphosyntactic description", listing info on
-                  e.g. tense, person, case, etc.</documentation>
-                <value type="NCName">dep</value>
-                <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'dep' is
-                  information about types of relations, used in dependency-style annotations; it is
-                  an indication for the visualiser that word-to-word relationships should be
-                  displayed</documentation>
-                <value type="NCName">lbl</value>
-                <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'lbl'
-                  indicates the presence of labels over dependency relations</documentation>
-                <value type="NCName">const</value>
-                <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'const'
-                  stands for 'constituency' or hierarchical, tree-based annotations; it is an
-                  indication for the visualiser that it should display syntactic
-                  trees</documentation>
-                <value type="NCName">cat</value>
-                <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">'cat' is
-                  used for syntactic categories, as separate from pos; note that these sets need not
-                  be disjoint (at the lexical level, they usually overlap), but the frontend prefers
-                  to keep them separate. 'cat' will be found in the context of chunking or
-                  hierarchical parsing and will characterise nodes; it may also be found in
-                  dependency annotations, to indicate labels on nodes, as opposed to labels on arcs
-                  (the latter are signalled by 'lbl')</documentation>
-                <value type="NCName">struct</value>
-                <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">all
-                  non-linguistic information (headers, highlights, etc.)</documentation>
-                <value type="NCName">frag</value>
-                <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0"
-                  >non-exhaustive coverage (when spanList/@fragmented="true")</documentation>
-                <value type="NCName">ne</value>
-                <documentation xmlns="http://relaxng.org/ns/compatibility/annotations/1.0">named
-                  entities</documentation>
-              </choice>
-            </oneOrMore>
-          </list>
-        </attribute>

diff --git a/t/index/dereko_struct.t b/t/index/dereko_struct.t
index 1c9ae07..f6f6f5e 100644
--- a/t/index/dereko_struct.t
+++ b/t/index/dereko_struct.t

@@ -43,7 +43,6 @@
    '@:dereko/s:pattern:text$<b>17<s>2',
    'Attribute of idsHeader');
 
-
 is($data->{stream}->[4]->[1],
    '<>:dereko/s:s$<b>64<i>32<i>42<i>6<b>6<s>1',
    'Sentence span');

diff --git a/t/sort_tokens.t b/t/sort_tokens.t
index 1cbfd26..fb9ef7d 100644
--- a/t/sort_tokens.t
+++ b/t/sort_tokens.t

@@ -46,9 +46,9 @@
 
 is($mtt->to_string,
    '[(0-5)<>:b=N$<i>0<i>5<i>5|'.
-     '<>:f=N$<i>0<i>5<i>6<b>5<b>122|'.
-       '<>:e=ADJ$<i>0<i>5<i>6<b>6|'.
-	 '<>:d=N$<i>0<i>5<i>6<b>7|'.
+     '<>:e=ADJ$<i>0<i>5<i>6<b>6|'.
+       '<>:d=N$<i>0<i>5<i>6<b>7|'.
+	 '<>:f=N$<i>0<i>5<i>6<b>5<b>122|'.
 	   '@:i=N$<s>3|'.
 	     '@:h=N$<s>5|'.
 	       '@:j=N$<s>8|'.

diff --git a/t/transform.t b/t/transform.t
index b2620f3..379595e 100644
--- a/t/transform.t
+++ b/t/transform.t

@@ -25,7 +25,7 @@
 };
 
 my @layers;
-push(@layers, ['Base', 'Sentences']);
+# push(@layers, ['Base', 'Sentences']);
 push(@layers, ['Base', 'Paragraphs']);
 
 # OpenNLP
@@ -35,17 +35,16 @@
 # CoreNLP
 push(@layers, ['CoreNLP', 'NamedEntities', 'ne_dewac_175m_600']);
 push(@layers, ['CoreNLP', 'NamedEntities', 'ne_hgc_175m_600']);
-push(@layers, ['CoreNLP', 'NamedEntities']);
 push(@layers, ['CoreNLP', 'Sentences']);
 
-push(@layers, ['DeReKo', 'Structure']);
+# push(@layers, ['DeReKo', 'Structure']);
 
-push(@layers, ['Glemm', 'Morpho']);
+# push(@layers, ['Glemm', 'Morpho']);
 
-push(@layers, ['Mate', 'Morpho']);
+# push(@layers, ['Mate', 'Morpho']);
 push(@layers, ['Mate', 'Dependency']);
 
-push(@layers, ['Malt', 'Dependency']);
+# push(@layers, ['Malt', 'Dependency']);
 
 # Connexor
 push(@layers, ['Connexor', 'Morpho']);
@@ -53,13 +52,16 @@
 push(@layers, ['Connexor', 'Phrase']);
 push(@layers, ['Connexor', 'Sentences']);
 
+
+# TODO: OpenNLP
+
 # TreeTagger
 push(@layers, ['TreeTagger', 'Morpho']);
 push(@layers, ['TreeTagger', 'Sentences']);
 
 # Mate
-push(@layers, ['Mate', 'Morpho']);
-push(@layers, ['Mate', 'Dependency']);
+#push(@layers, ['Mate', 'Morpho']);
+#push(@layers, ['Mate', 'Dependency']);
 
 # XIP
 push(@layers, ['XIP', 'Morpho']);
commit	2ea505a48601d578b2fefa6160928cac49f5a839	[log] [tgz]
author	Akron <nils@diewald-online.de>	Fri Feb 05 20:49:03 2016 +0100
committer	Akron <nils@diewald-online.de>	Fri Feb 05 20:49:03 2016 +0100
tree	185300b3a4adb34ee829ffa252bf19a49f4194fe
parent	9a04c714a9a068e8734ffe1c408ab5b7529049fa [diff]