Added a regular expression for emojis, which leads to agd.t failing

Change-Id: Iafbf8961e55151a7a5c9a11d186e591cde1c1359
diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index 482bcf7..171b1c1 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm
@@ -16,6 +16,7 @@
 use List::MoreUtils 'uniq';
 use JSON::XS;
 use Log::Any qw($log);
+use utf8;
 
 # TODO 1:
 # Bei den Autoren im Index darauf achten,
@@ -146,8 +147,15 @@
     # Ignore non-word, non-number, and non-verbal tokens per default
     # '9646' equals the musical pause, used in speech corpora
     if ($self->non_verbal_tokens && ord($token) == 9646) {
-      # Non-verbal token
-    } elsif (!$self->non_word_tokens && $token !~ /[\w\d]/) {
+	# Non-verbal token
+    }
+    # Emoji tokens bypass the non-word check below. \p{Emoji} on its own
+    # also matches the ASCII keycap bases (0-9, '#', '*'), so these are
+    # excluded: a bare '#' or '*' must still reach the non-word check.
+    elsif ($token =~ m{[^\P{Emoji}0-9#*]}) {
+	# Emoji token
+    }
+    elsif (!$self->non_word_tokens && $token !~ /[\w\d]/) {
       # TODO: Recognize punctuations!
       #  if ($mtt) {
       #    my $term = [$token, $from, $to];
diff --git a/t/real/ndy.t b/t/real/ndy.t
index e6ce10a..c0f5dc7 100644
--- a/t/real/ndy.t
+++ b/t/real/ndy.t
@@ -66,5 +66,22 @@
 ok(!$meta->{A_doc_editor}, 'Correct doc editor');
 
 
+# Tokenization
+use_ok('KorAP::XML::Tokenizer');
+
+my ($token_base_foundry, $token_base_layer) = (qw/Base Tokens/);
+
+# Get tokenization
+my $tokens = KorAP::XML::Tokenizer->new(
+  path => $doc->path,
+  doc => $doc,
+  foundry => $token_base_foundry,
+  layer => $token_base_layer,
+  name => 'tokens'
+);
+
+ok($tokens, 'Token Object is fine');
+ok($tokens->parse, 'Token parsing is fine');
+
 done_testing;
 __END__