Revert "Added regular expression for emojis that leads to agd.t failing"
This reverts commit 0c99f3b4eb5eafd4fb6f464b0fc74a30336856e6.
Change-Id: I3fe04039624d3ab6d038c1952e85f80cb98da84c
diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index 171b1c1..482bcf7 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm
@@ -16,7 +16,6 @@
use List::MoreUtils 'uniq';
use JSON::XS;
use Log::Any qw($log);
-use utf8;
# TODO 1:
# Bei den Autoren im Index darauf achten,
@@ -147,15 +146,8 @@
# Ignore non-word, non-number, and non-verbal tokens per default
# '9646' equals the musical pause, used in speech corpora
if ($self->non_verbal_tokens && ord($token) == 9646) {
- # Non-verbal token
- }
- # elsif ($token eq '😍') {
- #;
- #}
- elsif ($token =~ m{\p{Emoji}}){
- ;
- }
- elsif (!$self->non_word_tokens && $token !~ /[\w\d]/) {
+ # Non-verbal token
+ } elsif (!$self->non_word_tokens && $token !~ /[\w\d]/) {
# TODO: Recognize punctuations!
# if ($mtt) {
# my $term = [$token, $from, $to];
diff --git a/t/real/ndy.t b/t/real/ndy.t
index c0f5dc7..e6ce10a 100644
--- a/t/real/ndy.t
+++ b/t/real/ndy.t
@@ -66,22 +66,5 @@
ok(!$meta->{A_doc_editor}, 'Correct doc editor');
-# Tokenization
-use_ok('KorAP::XML::Tokenizer');
-
-my ($token_base_foundry, $token_base_layer) = (qw/Base Tokens/);
-
-# Get tokenization
-my $tokens = KorAP::XML::Tokenizer->new(
- path => $doc->path,
- doc => $doc,
- foundry => $token_base_foundry,
- layer => $token_base_layer,
- name => 'tokens'
-);
-
-ok($tokens, 'Token Object is fine');
-ok($tokens->parse, 'Token parsing is fine');
-
done_testing;
__END__