Akron | 1622dd9 | 2015-12-09 22:34:26 +0100 | [diff] [blame] | 1 | use strict; |
| 2 | use warnings; |
| 3 | use utf8; |
| 4 | use Test::More; |
| 5 | use Benchmark ':hireswallclock'; |
| 6 | use lib 'lib', '../lib'; |
Akron | 2daf8fe | 2023-02-27 12:55:04 +0100 | [diff] [blame] | 7 | use File::Basename 'dirname'; |
| 8 | use File::Spec::Functions 'catdir'; |
Akron | 1622dd9 | 2015-12-09 22:34:26 +0100 | [diff] [blame] | 9 | |
Akron | afb81ad | 2016-08-01 20:28:31 +0200 | [diff] [blame] | 10 | use_ok('KorAP::XML::Index::MultiTerm'); |
Akron | 1622dd9 | 2015-12-09 22:34:26 +0100 | [diff] [blame] | 11 | |
Akron | 129e441 | 2020-08-05 15:30:12 +0200 | [diff] [blame] | 12 | ok(my $term = KorAP::XML::Index::MultiTerm->new('Baum'), 'Create new object'); |
| 13 | $term->set_p_start(0); |
| 14 | $term->set_p_end(56); |
| 15 | $term->set_payload('<i>56'); |
| 16 | $term->set_o_start(34); |
| 17 | $term->set_o_end(120); |
Akron | 1622dd9 | 2015-12-09 22:34:26 +0100 | [diff] [blame] | 18 | |
Akron | 72e671f | 2020-08-04 11:35:40 +0200 | [diff] [blame] | 19 | is($term->get_term, 'Baum'); |
| 20 | is($term->get_p_start, 0); |
| 21 | is($term->get_p_end, 56); |
| 22 | is($term->get_o_start, 34); |
| 23 | is($term->get_o_end, 120); |
| 24 | is($term->get_payload, '<i>56'); |
Akron | 9c0488f | 2016-01-28 14:17:15 +0100 | [diff] [blame] | 25 | is($term->to_string, 'Baum$<i>34<i>120<i>56<i>56'); |
Akron | 1622dd9 | 2015-12-09 22:34:26 +0100 | [diff] [blame] | 26 | |
Akron | 129e441 | 2020-08-05 15:30:12 +0200 | [diff] [blame] | 27 | ok($term = KorAP::XML::Index::MultiTerm->new('Baum'), 'Create new object'); |
Akron | 1622dd9 | 2015-12-09 22:34:26 +0100 | [diff] [blame] | 28 | |
Akron | 72e671f | 2020-08-04 11:35:40 +0200 | [diff] [blame] | 29 | is($term->get_term, 'Baum'); |
| 30 | is($term->get_p_start, 0); |
| 31 | is($term->get_p_end, 0); |
| 32 | is($term->get_o_start, 0); |
| 33 | is($term->get_o_end, 0); |
| 34 | is($term->get_payload, undef); |
Akron | 1622dd9 | 2015-12-09 22:34:26 +0100 | [diff] [blame] | 35 | is($term->to_string, 'Baum'); |
| 36 | |
Akron | 129e441 | 2020-08-05 15:30:12 +0200 | [diff] [blame] | 37 | ok($term = KorAP::XML::Index::MultiTerm->new('Ba#um'), 'Create new object'); |
Akron | 1622dd9 | 2015-12-09 22:34:26 +0100 | [diff] [blame] | 38 | |
Akron | 72e671f | 2020-08-04 11:35:40 +0200 | [diff] [blame] | 39 | is($term->get_term, 'Ba#um'); |
| 40 | is($term->get_p_start, 0); |
| 41 | is($term->get_p_end, 0); |
| 42 | is($term->get_o_start, 0); |
| 43 | is($term->get_o_end, 0); |
| 44 | is($term->get_payload, undef); |
Akron | 1622dd9 | 2015-12-09 22:34:26 +0100 | [diff] [blame] | 45 | is($term->to_string, 'Ba\#um'); |
| 46 | |
Akron | 129e441 | 2020-08-05 15:30:12 +0200 | [diff] [blame] | 47 | ok($term = KorAP::XML::Index::MultiTerm->new('Ba#u$m'), 'Create new object'); |
| 48 | $term->set_payload('<i>45'); |
Akron | 1622dd9 | 2015-12-09 22:34:26 +0100 | [diff] [blame] | 49 | |
Akron | 72e671f | 2020-08-04 11:35:40 +0200 | [diff] [blame] | 50 | is($term->get_term, 'Ba#u$m'); |
| 51 | is($term->get_p_start, 0); |
| 52 | is($term->get_p_end, 0); |
| 53 | is($term->get_o_start, 0); |
| 54 | is($term->get_o_end, 0); |
| 55 | is($term->get_payload, '<i>45'); |
Akron | 1622dd9 | 2015-12-09 22:34:26 +0100 | [diff] [blame] | 56 | is($term->to_string, 'Ba\#u\$m$<i>45'); |
| 57 | |
Akron | 55778f0 | 2017-03-14 20:47:26 +0100 | [diff] [blame] | 58 | use_ok('KorAP::XML::Tokenizer'); |
| 59 | |
| 60 | use utf8; |
| 61 | sub remove_diacritics { KorAP::XML::Tokenizer::remove_diacritics(@_) }; |
| 62 | |
| 63 | is(remove_diacritics('äöü'), 'aou', 'Remove diacritics'); |
| 64 | |
| 65 | is(remove_diacritics('Česká'), 'Ceska', 'Removed diacritics'); |
| 66 | is(remove_diacritics('Äößa'), 'Aoßa', 'Removed diacritics'); |
| 67 | |
| 68 | # From comment in http://archives.miloush.net/michkap/archive/2007/05/14/2629747.html |
| 69 | is(remove_diacritics('ÅåÄäÖö'), 'AaAaOo', 'Check swedish'); |
| 70 | # Krawfish::Util::String::_list_props('Łł'); |
| 71 | is(remove_diacritics('ĄąĆćĘꣳŃńÓ󌜏źŻż'), 'AaCcEeLlNnOoSsZzZz', 'Check polish'); |
| 72 | is(remove_diacritics('ľščťžýáíéúäôň*ȍŽÝÁÍÉÚÄÔŇĎ'), 'lsctzyaieuaondLSCTZYAIEUAOND', 'Check slowakish'); |
| 73 | is(remove_diacritics('ëőüűŐÜŰ'), 'eouuOUU', 'Check hungarian'); |
| 74 | is(remove_diacritics('Ññ¿'), 'Nn¿', 'Check spanish'); |
| 75 | is(remove_diacritics('àèòçï'), 'aeoci', 'Check CA?'); |
| 76 | is(remove_diacritics('ı'), 'i', 'Check turkish'); |
| 77 | |
| 78 | # From http://stackoverflow.com/questions/249087/how-do-i-remove-diacritics-accents-from-a-string-in-net#249126 |
| 79 | is(remove_diacritics('äáčďěéíľľňôóřŕšťúůýž'), 'aacdeeillnoorrstuuyz'); |
| 80 | is(remove_diacritics('ÄÁČĎĚÉÍĽĽŇÔÓŘŔŠŤÚŮÝŽ'), 'AACDEEILLNOORRSTUUYZ'); |
| 81 | is(remove_diacritics('ÖÜË'), 'OUE'); |
| 82 | is(remove_diacritics('łŁđĐ'), 'lLdD'); |
| 83 | is(remove_diacritics('ţŢşŞçÇ'), 'tTsScC'); |
| 84 | is(remove_diacritics('øı'), 'oi'); |
| 85 | |
| 86 | is(remove_diacritics( |
| 87 | q{Bonjour ça va? C'est l'été! Ich möchte ä Ä á à â ê é è ë Ë É ï Ï î í ì ó ò ô ö Ö Ü ü ù ú û Û ý Ý ç Ç ñ Ñ}), |
| 88 | q{Bonjour ca va? C'est l'ete! Ich mochte a A a a a e e e e E E i I i i i o o o o O U u u u u U y Y c C n N}); |
| 89 | |
| 90 | # https://docs.seneca.nl/Smartsite-Docs/Features-Modules/Add-On_Modules/Faceted_Search/FS_Reference/FTS_and_iFTS_technical_background_information/Diacritics_and_Unicode.html |
| 91 | is(remove_diacritics( |
| 92 | q/!"#$'()*+,-.0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` abcdefghijklmnoprstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿−ÀÁ ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ/), |
| 93 | q/!"#$'()*+,-.0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` abcdefghijklmnoprstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿−AAA AAAÆCEEEEIIIIDNOOOOO×OUUUUYÞßaaaaaaæceeeeiiiiðnooooo÷ouuuuyþy/); |
| 94 | |
Akron | 2daf8fe | 2023-02-27 12:55:04 +0100 | [diff] [blame] | 95 | |
| 96 | # Create emoji path relative to test file |
| 97 | my $emoji_file = catdir(dirname(__FILE__), 'real','all_emojis.txt'); |
| 98 | |
| 99 | # Init test values |
| 100 | my ($ok, $fail) = (0, 0); |
| 101 | |
| 102 | # Test all emojis line by line |
| 103 | open(in_file,"<:encoding(utf8)",$emoji_file) or die("Could not open emoji file."); |
| 104 | while(<in_file>){ |
| 105 | chomp $_; |
| 106 | if (KorAP::XML::Tokenizer::is_emoji($_)) { |
| 107 | $ok++; |
| 108 | } else { |
| 109 | $fail++; |
| 110 | } |
| 111 | }; |
| 112 | close(in_file); |
| 113 | |
| 114 | # Check emojis for regressions |
| 115 | ok($ok >= 2036, "Emojis fine"); |
| 116 | ok($fail <= 1746, "Emojis fine"); |
| 117 | |
Akron | 55778f0 | 2017-03-14 20:47:26 +0100 | [diff] [blame] | 118 | no utf8; |
| 119 | |
Akron | 1622dd9 | 2015-12-09 22:34:26 +0100 | [diff] [blame] | 120 | done_testing; |
| 121 | __END__ |