Added a regular expression for emojis to the tokenizer; this change causes agd.t to fail
Change-Id: Iafbf8961e55151a7a5c9a11d186e591cde1c1359
diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index 482bcf7..171b1c1 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm
@@ -16,6 +16,7 @@
use List::MoreUtils 'uniq';
use JSON::XS;
use Log::Any qw($log);
+use utf8;
# TODO 1:
# Bei den Autoren im Index darauf achten,
@@ -146,8 +147,15 @@
# Ignore non-word, non-number, and non-verbal tokens per default
# '9646' equals the musical pause, used in speech corpora
if ($self->non_verbal_tokens && ord($token) == 9646) {
- # Non-verbal token
- } elsif (!$self->non_word_tokens && $token !~ /[\w\d]/) {
+ # Non-verbal token
+ }
+ # elsif ($token eq '😍') {
+ #;
+ #}
+ elsif ($token =~ m{\p{Emoji}}){
+ ;
+ }
+ elsif (!$self->non_word_tokens && $token !~ /[\w\d]/) {
# TODO: Recognize punctuations!
# if ($mtt) {
# my $term = [$token, $from, $to];
diff --git a/t/real/ndy.t b/t/real/ndy.t
index e6ce10a..c0f5dc7 100644
--- a/t/real/ndy.t
+++ b/t/real/ndy.t
@@ -66,5 +66,22 @@
ok(!$meta->{A_doc_editor}, 'Correct doc editor');
+# Tokenization
+use_ok('KorAP::XML::Tokenizer');
+
+my ($token_base_foundry, $token_base_layer) = (qw/Base Tokens/);
+
+# Get tokenization
+my $tokens = KorAP::XML::Tokenizer->new(
+ path => $doc->path,
+ doc => $doc,
+ foundry => $token_base_foundry,
+ layer => $token_base_layer,
+ name => 'tokens'
+);
+
+ok($tokens, 'Token Object is fine');
+ok($tokens->parse, 'Token parsing is fine');
+
done_testing;
__END__