Improve UTF-8 preprocessing for tokenizers
Change-Id: If18ab6d40b98d372f161a377a0d4efd17444fa69
diff --git a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
index 63a9ce0..5099eeb 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
@@ -8,28 +8,18 @@
# Tokenize string "aggressively" and return an array
# with character boundaries.
sub tokenize {
- my ($self, $txt_utf8) = @_;
+ my ($self, $txt) = @_;
- my $txt;
-
- # faster processing of UTF8-chars
- foreach my $char (split //, $txt_utf8) {
- if ($char =~ /\p{Punct}/) {
- $txt .= "p"
- } elsif ($char =~ /[^\p{Punct}\s]/) {
- $txt .= "P"
- } elsif ($char =~ /\s/) {
- $txt .= "s"
- } else {
- $txt .= "o" # other: should actually only happen for string end (0 byte)
- # check could be 'ord($char)==0'
- }
- };
+ # Map every character to a single-byte class marker: '.' (punctuation), '~' (whitespace), '_' (other)
+ $txt =~ s/\p{Punct}/./g;
+ $txt =~ s/\s/~/g;
+ $txt =~ s/[^\.\~]/_/g;
+ utf8::downgrade($txt);
# Iterate over the whole string
- while ($txt =~ /(P+)
- (?:(p)|s?)|
- (p)/gx){
+ while ($txt =~ /(_+)
+ (?:(\.)|\~?)|
+ (\.)/gx){
# Starts with a character sequence
if (defined $1){
diff --git a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
index c89c4fd..0efb648 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
@@ -8,29 +8,19 @@
# Tokenize string "conservatively" and return an array
# with character boundaries.
sub tokenize {
- my ($self, $txt_utf8) = @_;
+ my ($self, $txt) = @_;
- my $txt;
-
- # faster processing of UTF8-chars
- foreach my $char (split //, $txt_utf8) {
- if ($char =~ /\p{Punct}/) {
- $txt .= "p"
- } elsif ($char =~ /[^\p{Punct}\s]/) {
- $txt .= "P"
- } elsif ($char =~ /\s/) {
- $txt .= "s"
- } else {
- $txt .= "o" # other: should actually only happen for string end (0 byte)
- # check could be 'ord($char)==0'
- }
- };
+ # Map every character to a single-byte class marker: '.' (punctuation), '~' (whitespace), '_' (other)
+ $txt =~ s/\p{Punct}/./g;
+ $txt =~ s/\s/~/g;
+ $txt =~ s/[^\.\~]/_/g;
+ utf8::downgrade($txt);
# Iterate over the whole string
- while ($txt =~ /(p*)
- (P+(?:p+P+)*)?
- (p*)
- s?/gx) {
+ while ($txt =~ /(\.*)
+ (_+(?:\.+_+)*)?
+ (\.*)
+ \~?/gx) {
# Punctuation preceding a token
$self->_add_surroundings($txt, $-[1], $+[1], 1) if $1;
@@ -54,6 +44,7 @@
my $pr; # "print" (tokenize) punctuation character (if one of the below tests justified it)
if ($p2 == $p1+1) { # single punctuation character
+ my $char;
# Variant for preceding characters
if ($preceding) {
@@ -62,18 +53,24 @@
# Punctuation character doesn't start at first position
if ($p1 != 0) {
+
# Check char before punctuation char
- $pr = ( substr( $txt, $p1-1, 1 ) =~ /[ps]/ );
+ $char = substr( $txt, $p1-1, 1 );
+ $pr = ($char eq '.' || $char eq '~') ? 1 : 0;
}
}
else {
# Check char after punctuation char
- $pr = ( substr( $txt, $p2, 1 ) =~ /[ps]?/ ); # the last punctuation character should always be tokenized (signified by the ?)
+ $char = substr( $txt, $p2, 1 );
+
+ # The last punctuation character should always be tokenized
+ $pr = (!$char || $char eq '.' || $char eq '~') ? 1 : 0;
# Check char before punctuation char
unless ($pr) {
- $pr = ( substr ( $txt, $p1-1, 1 ) =~ /[ps]/ );
+ $char = substr ( $txt, $p1-1, 1);
+ $pr = ($char eq '.' || $char eq '~' ) ? 1 : 0;
};
};