Support non-verbal annotations Change-Id: I6cc0e7c8279f523d3c4b66b14125866ec0be1695

commit: f1849aa25d77eb716e539e3b66c11fa282d40e30 [log] [tgz]
author: Akron <nils@diewald-online.de> Mon Dec 16 23:35:33 2019 +0100
committer: Akron <nils@diewald-online.de> Tue Dec 17 08:15:59 2019 +0100
tree: 1f92020f3ddf09ccf9a149d90bc579fd9fbae57c
parent: c29b8e1239bf241123db2b231681b428b62fc450 [diff] [blame]
diff --git a/script/korapxml2krill b/script/korapxml2krill
index e6754d9..56189aa 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill

@@ -143,12 +143,13 @@
 # 2019/08/08
 # - Support for Talismane.
 #
-# 2019/12/16
+# 2019/12/17
 # - Added support for DGD pseudo-sentences
 #   based on anchor milestones.
+# - Support for non-verbal annotations.
 # ----------------------------------------------------------
 
-our $LAST_CHANGE = '2019/12/16';
+our $LAST_CHANGE = '2019/12/17';
 our $LOCAL = $FindBin::Bin;
 our $KORAL_VERSION = 0.03;
 our $VERSION_MSG = <<"VERSION";
@@ -195,6 +196,7 @@
   'koral|k=f'    => \(my $koral),
   'to-tar'      => \(my $to_tar),
   'non-word-tokens|nwt' => \(my $non_word_tokens),
+  'non-verbal-tokens|nvt' => \(my $non_verbal_tokens),
   'sequential-extraction|se' => \(my $sequential_extraction),
   'cache-size|cs=s'  => \(my $cache_size),
   'cache-delete|cd!' => \(my $cache_delete),
@@ -258,11 +260,16 @@
     $token_base = $config{token};
   };
 
-  # temporary-extract
+  # Non-word tokenization
   if (!defined($non_word_tokens) && defined $config{'non-word-tokens'}) {
     $non_word_tokens = $config{'non-word-tokens'};
   };
 
+  # Non-verbal tokenization
+  if (!defined($non_verbal_tokens) && defined $config{'non-verbal-tokens'}) {
+    $non_verbal_tokens = $config{'non-verbal-tokens'};
+  };
+
   # Cache file
   if (!defined($cache_file) && defined $config{cache}) {
     $cache_file = $config{cache};
@@ -354,6 +361,7 @@
 $base_paragraphs     //= '';
 $base_pagebreaks     //= '';
 $non_word_tokens     //= 0;
+$non_verbal_tokens   //= 0;
 
 $base_sentences  = lc $base_sentences;
 $base_paragraphs = lc $base_paragraphs;
@@ -592,7 +600,8 @@
   primary   => $primary,
   pretty    => $pretty,
   anno      => \@filtered_anno,
-  non_word_tokens => $non_word_tokens
+  non_word_tokens => $non_word_tokens,
+  non_verbal_tokens => $non_verbal_tokens
 );
 
 # Get file name based on path information
@@ -1228,6 +1237,8 @@
 Define the default tokenization by specifying
 the name of the foundry and optionally the name
 of the layer-file. Defaults to C<OpenNLP#tokens>.
+This will directly take the file instead of running
+the layer implementation!
 
 
 =item B<--base-sentences|-bs> <foundry>#<layer>
@@ -1287,6 +1298,15 @@
 
  Defaults to unset.
 
+
+=item B<--non-verbal-tokens|-nvt>
+
+Tokenize non-verbal tokens that are marked in the primary data with
+the Unicode symbol 'Black Vertical Rectangle' aka \x{25ae}.
+
+ Defaults to unset.
+
+
 =item B<--jobs|-j>
 
 Define the number of concurrent jobs in separate forks
@@ -1537,7 +1557,7 @@
 Corpus Analysis Platform at the
 L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
 member of the
-L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
+L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
 
 This program is free software published under the
 L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
commit	f1849aa25d77eb716e539e3b66c11fa282d40e30	[log] [tgz]
author	Akron <nils@diewald-online.de>	Mon Dec 16 23:35:33 2019 +0100
committer	Akron <nils@diewald-online.de>	Tue Dec 17 08:15:59 2019 +0100
tree	1f92020f3ddf09ccf9a149d90bc579fd9fbae57c
parent	c29b8e1239bf241123db2b231681b428b62fc450 [diff] [blame]