Support non-word-tokens (fixes #5)
Change-Id: I6867745afd7c0fb865722bcd62a0724aaa9a6ccb
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 772e430..f1ba9d4 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -129,9 +129,12 @@
#
# 2018/07/19
# - Preliminary support for HNC.
+#
+# 2019/01/22
+# - Support for non-word tokens.
# ----------------------------------------------------------
-our $LAST_CHANGE = '2018/07/19';
+our $LAST_CHANGE = '2019/01/22';
our $LOCAL = $FindBin::Bin;
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
@@ -175,6 +178,7 @@
'pretty|y' => \(my $pretty),
'jobs|j=i' => \(my $jobs),
'to-tar' => \(my $to_tar),
+ 'non-word-tokens|nwt' => \(my $non_word_tokens),
'sequential-extraction|se' => \(my $sequential_extraction),
'cache-size|cs=s' => \(my $cache_size),
'cache-delete|cd!' => \(my $cache_delete),
@@ -233,6 +237,11 @@
$token_base = $config{token};
};
+ # Non-word tokens
+ if (!defined($non_word_tokens) && defined $config{'non-word-tokens'}) {
+ $non_word_tokens = $config{'non-word-tokens'};
+ };
+
# Cache file
if (!defined($cache_file) && defined $config{cache}) {
$cache_file = $config{cache};
@@ -322,6 +331,7 @@
$base_sentences //= '';
$base_paragraphs //= '';
$base_pagebreaks //= '';
+$non_word_tokens //= 0;
$base_sentences = lc $base_sentences;
$base_paragraphs = lc $base_paragraphs;
@@ -548,7 +558,8 @@
log => $log,
primary => $primary,
pretty => $pretty,
- anno => \@filtered_anno
+ anno => \@filtered_anno,
+ non_word_tokens => $non_word_tokens
);
# Get file name based on path information
@@ -1221,6 +1232,13 @@
This is I<deprecated>.
+=item B<--non-word-tokens|-nwt>
+
+Treat non-word tokens as tokens, too (word tokens are defined as
+matching C</[\d\w]/>). Useful to index punctuation marks as tokens.
+
+Defaults to unset.
+
=item B<--jobs|-j>
Define the number of concurrent jobs in seperated forks
@@ -1435,7 +1453,7 @@
=head1 COPYRIGHT AND LICENSE
-Copyright (C) 2015-2018, L<IDS Mannheim|http://www.ids-mannheim.de/>
+Copyright (C) 2015-2019, L<IDS Mannheim|http://www.ids-mannheim.de/>
Author: L<Nils Diewald|http://nils-diewald.de/>