Improve emoji detection and support
Change-Id: Ie245920a51d64e934522a26eb378d350d69f5a98
diff --git a/t/tokens.t b/t/tokens.t
index 319a02e..780b6ca 100644
--- a/t/tokens.t
+++ b/t/tokens.t
@@ -4,6 +4,8 @@
use Test::More;
use Benchmark ':hireswallclock';
use lib 'lib', '../lib';
+use File::Basename 'dirname';
+use File::Spec::Functions 'catdir';
use_ok('KorAP::XML::Index::MultiTerm');
@@ -90,6 +92,29 @@
q/!"#$'()*+,-.0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` abcdefghijklmnoprstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿−ÀÁ ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ/),
q/!"#$'()*+,-.0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` abcdefghijklmnoprstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿−AAA AAAÆCEEEEIIIIDNOOOOO×OUUUUYÞßaaaaaaæceeeeiiiiðnooooo÷ouuuuyþy/);
+
+# Locate the emoji fixture relative to this test file
+my $emoji_file = catdir(dirname(__FILE__), 'real', 'all_emojis.txt');
+
+my ($ok, $fail) = (0, 0);  # fixture lines accepted / rejected by is_emoji
+
+# Test all emojis line by line. A lexical handle avoids a global bareword,
+# and the die message carries the path and $! so a missing file is obvious.
+open(my $in_fh, '<:encoding(utf8)', $emoji_file)
+  or die("Could not open emoji file '$emoji_file': $!");
+while (my $line = <$in_fh>) {
+  chomp $line;
+  if (KorAP::XML::Tokenizer::is_emoji($line)) {
+    $ok++;
+  } else {
+    $fail++;
+  }
+}
+close($in_fh);
+
+# Check emojis for regressions; cmp_ok prints the actual count on failure
+cmp_ok($ok, '>=', 2036, 'Enough emojis detected');
+cmp_ok($fail, '<=', 1746, 'Not too many emojis rejected');
+
no utf8;
done_testing;