Support non-word-tokens (fixes #5)

Change-Id: I6867745afd7c0fb865722bcd62a0724aaa9a6ccb
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 772e430..f1ba9d4 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -129,9 +129,12 @@
 #
 # 2018/07/19
 # - Preliminary support for HNC.
+#
+# 2019/01/22
+# - Support for non-word tokens.
 # ----------------------------------------------------------
 
-our $LAST_CHANGE = '2018/07/19';
+our $LAST_CHANGE = '2019/01/22';
 our $LOCAL = $FindBin::Bin;
 our $VERSION_MSG = <<"VERSION";
 Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
@@ -175,6 +178,7 @@
   'pretty|y'    => \(my $pretty),
   'jobs|j=i'    => \(my $jobs),
   'to-tar'      => \(my $to_tar),
+  'non-word-tokens|nwt' => \(my $non_word_tokens),
   'sequential-extraction|se' => \(my $sequential_extraction),
   'cache-size|cs=s'  => \(my $cache_size),
   'cache-delete|cd!' => \(my $cache_delete),
@@ -233,6 +237,11 @@
     $token_base = $config{token};
   };
 
+  # Non-word tokens
+  if (!defined($non_word_tokens) && defined $config{'non-word-tokens'}) {
+    $non_word_tokens = $config{'non-word-tokens'};
+  };
+
   # Cache file
   if (!defined($cache_file) && defined $config{cache}) {
     $cache_file = $config{cache};
@@ -322,6 +331,7 @@
 $base_sentences      //= '';
 $base_paragraphs     //= '';
 $base_pagebreaks     //= '';
+$non_word_tokens     //= 0;
 
 $base_sentences  = lc $base_sentences;
 $base_paragraphs = lc $base_paragraphs;
@@ -548,7 +558,8 @@
   log       => $log,
   primary   => $primary,
   pretty    => $pretty,
-  anno      => \@filtered_anno
+  anno      => \@filtered_anno,
+  non_word_tokens => $non_word_tokens
 );
 
 # Get file name based on path information
@@ -1221,6 +1232,13 @@
 This is I<deprecated>.
 
 
+=item B<--non-word-tokens|-nwt>
+
+Tokenize non-word tokens like word tokens (word tokens are defined as
+matching C</[\d\w]/>). Useful to treat punctuation as tokens.
+
+Defaults to unset.
+
 =item B<--jobs|-j>
 
 Define the number of concurrent jobs in seperated forks
@@ -1435,7 +1453,7 @@
 
 =head1 COPYRIGHT AND LICENSE
 
-Copyright (C) 2015-2018, L<IDS Mannheim|http://www.ids-mannheim.de/>
+Copyright (C) 2015-2019, L<IDS Mannheim|http://www.ids-mannheim.de/>
 
 Author: L<Nils Diewald|http://nils-diewald.de/>