blob: 780b6caad923737f2c8cc63a5122a0191a8ddaf2 [file] [log] [blame]
use strict;
use warnings;
use utf8;
use Test::More;
use Benchmark ':hireswallclock';
use lib 'lib', '../lib';
use File::Basename 'dirname';
use File::Spec::Functions qw(catdir catfile);

# Tests for KorAP::XML::Index::MultiTerm: every case constructs a term,
# optionally applies setters, then verifies all getters and the
# serialized form returned by to_string.
use_ok('KorAP::XML::Index::MultiTerm');

# Each case lists:
#   label  - suffix used in the test descriptions, so a failure can be
#            attributed to a specific case
#   term   - argument passed to the constructor
#   set    - ordered field/value pairs applied through set_<field>
#   expect - value expected from each get_<field> accessor
#   string - expected result of to_string
my @multiterm_cases = (
  {
    label  => 'all fields set',
    term   => 'Baum',
    set    => [
      p_start => 0,
      p_end   => 56,
      payload => '<i>56',
      o_start => 34,
      o_end   => 120,
    ],
    expect => {
      term    => 'Baum',
      p_start => 0,
      p_end   => 56,
      o_start => 34,
      o_end   => 120,
      payload => '<i>56',
    },
    string => 'Baum$<i>34<i>120<i>56<i>56',
  },
  {
    label  => 'defaults only',
    term   => 'Baum',
    set    => [],
    expect => {
      term    => 'Baum',
      p_start => 0,
      p_end   => 0,
      o_start => 0,
      o_end   => 0,
      payload => undef,
    },
    string => 'Baum',
  },
  {
    label  => 'term containing #',
    term   => 'Ba#um',
    set    => [],
    expect => {
      term    => 'Ba#um',
      p_start => 0,
      p_end   => 0,
      o_start => 0,
      o_end   => 0,
      payload => undef,
    },
    string => 'Ba\#um',
  },
  {
    label  => 'term containing # and $ with payload',
    term   => 'Ba#u$m',
    set    => [ payload => '<i>45' ],
    expect => {
      term    => 'Ba#u$m',
      p_start => 0,
      p_end   => 0,
      o_start => 0,
      o_end   => 0,
      payload => '<i>45',
    },
    string => 'Ba\#u\$m$<i>45',
  },
);

for my $case (@multiterm_cases) {
  my $label = $case->{label};

  ok(
    my $term = KorAP::XML::Index::MultiTerm->new($case->{term}),
    "Create new object ($label)"
  );

  # Apply the setters in the order given by the case; work on a copy so
  # the case table itself is not consumed.
  my @pending = @{ $case->{set} };
  while (my ($field, $value) = splice(@pending, 0, 2)) {
    my $setter = "set_$field";
    $term->$setter($value);
  }

  # Verify the accessors in a fixed order, then the serialization.
  for my $field (qw(term p_start p_end o_start o_end payload)) {
    my $getter = "get_$field";
    is($term->$getter, $case->{expect}{$field}, "$getter ($label)");
  }
  is($term->to_string, $case->{string}, "to_string ($label)");
}

# Tests for KorAP::XML::Tokenizer::remove_diacritics. The string literals
# below contain non-ASCII characters, so the source must be read as UTF-8.
use_ok('KorAP::XML::Tokenizer');

use utf8;

# Local shorthand forwarding all arguments to the tokenizer function.
sub remove_diacritics { KorAP::XML::Tokenizer::remove_diacritics(@_) }

is(remove_diacritics('äöü'), 'aou', 'Remove diacritics');

is(remove_diacritics('Česká'), 'Ceska', 'Removed diacritics');
is(remove_diacritics('Äößa'), 'Aoßa', 'Removed diacritics');

# From comment in http://archives.miloush.net/michkap/archive/2007/05/14/2629747.html
is(remove_diacritics('ÅåÄäÖö'), 'AaAaOo', 'Check swedish');
# Krawfish::Util::String::_list_props('Łł');
# The input must consist of the 18 letters matching the expected value
# one to one.
is(remove_diacritics('ĄąĆćĘęŁłŃńÓóŚśŹźŻż'), 'AaCcEeLlNnOoSsZzZz', 'Check polish');
# 14 lowercase letters followed by their 14 uppercase counterparts.
is(remove_diacritics('ľščťžýáíéúäôňďĽŠČŤŽÝÁÍÉÚÄÔŇĎ'), 'lsctzyaieuaondLSCTZYAIEUAOND', 'Check slowakish');
is(remove_diacritics('ëőüűŐÜŰ'), 'eouuOUU', 'Check hungarian');
is(remove_diacritics('Ññ¿'), 'Nn¿', 'Check spanish');
is(remove_diacritics('àèòçï'), 'aeoci', 'Check CA?');
is(remove_diacritics('ı'), 'i', 'Check turkish');

# From http://stackoverflow.com/questions/249087/how-do-i-remove-diacritics-accents-from-a-string-in-net#249126
is(remove_diacritics('äáčďěéíľľňôóřŕšťúůýž'), 'aacdeeillnoorrstuuyz', 'Lowercase letters with diacritics');
is(remove_diacritics('ÄÁČĎĚÉÍĽĽŇÔÓŘŔŠŤÚŮÝŽ'), 'AACDEEILLNOORRSTUUYZ', 'Uppercase letters with diacritics');
is(remove_diacritics('ÖÜË'), 'OUE', 'Uppercase letters with diaeresis');
is(remove_diacritics('łŁđĐ'), 'lLdD', 'Letters with stroke');
is(remove_diacritics('ţŢşŞçÇ'), 'tTsScC', 'Letters with cedilla');
is(remove_diacritics('øı'), 'oi', 'Slashed o and dotless i');

is(remove_diacritics(
  q{Bonjour ça va? C'est l'été! Ich möchte ä Ä á à â ê é è ë Ë É ï Ï î í ì ó ò ô ö Ö Ü ü ù ú û Û ý Ý ç Ç ñ Ñ}),
  q{Bonjour ca va? C'est l'ete! Ich mochte a A a a a e e e e E E i I i i i o o o o O U u u u u U y Y c C n N},
  'Mixed sentence with accented letters');

# https://docs.seneca.nl/Smartsite-Docs/Features-Modules/Add-On_Modules/Faceted_Search/FS_Reference/FTS_and_iFTS_technical_background_information/Diacritics_and_Unicode.html
# Punctuation, symbols and unaccented letters are expected to pass through
# unchanged; only the accented Latin letters are expected to be reduced.
is(remove_diacritics(
  q/!"#$'()*+,-.0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` abcdefghijklmnoprstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿−ÀÁÂ ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ/),
  q/!"#$'()*+,-.0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` abcdefghijklmnoprstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿−AAA AAAÆCEEEEIIIIDNOOOOO×OUUUUYÞßaaaaaaæceeeeiiiiðnooooo÷ouuuuyþy/,
  'Character table with punctuation and accented letters');


# Create emoji path relative to test file.
# catfile is used because the last component is a file name, not a directory.
my $emoji_file = catfile(dirname(__FILE__), 'real', 'all_emojis.txt');

# Counters for lines accepted ($ok) and rejected ($fail) by is_emoji
my ($ok, $fail) = (0, 0);

# Test all emojis line by line. A lexical handle is closed automatically
# when it goes out of scope and cannot clash with another package-level
# bareword handle; the error message carries the path and the OS error.
open(my $emoji_fh, '<:encoding(utf8)', $emoji_file)
  or die("Could not open emoji file '$emoji_file': $!");
while (my $line = <$emoji_fh>) {
  chomp $line;
  if (KorAP::XML::Tokenizer::is_emoji($line)) {
    $ok++;
  }
  else {
    $fail++;
  }
}
close($emoji_fh);

# Check emojis for regressions. cmp_ok reports the actual counts when a
# threshold is violated.
cmp_ok($ok, '>=', 2036, 'Emojis fine (recognized lines)');
cmp_ok($fail, '<=', 1746, 'Emojis fine (unrecognized lines)');

no utf8;

done_testing;
__END__
121__END__