blob: 319a02e35d82fabef4e846b4cbcac6fcc6dd1e12 [file] [log] [blame]
# Pragmas and test dependencies. 'use utf8' is required because the
# test data in this file contains non-ASCII string literals.
use strict;
use warnings;
use utf8;
use Test::More;
use Benchmark ':hireswallclock';
use lib 'lib', '../lib';

# The class under test has to load before any term object is created.
use_ok('KorAP::XML::Index::MultiTerm');

# Case 1: a term with every field set explicitly.
# $term is declared here at file scope and reused by the following cases.
ok(my $term = KorAP::XML::Index::MultiTerm->new('Baum'), 'Create new object');
$term->set_p_start(0);
$term->set_p_end(56);
$term->set_payload('<i>56');
$term->set_o_start(34);
$term->set_o_end(120);

# Every setter must be readable through the matching getter.
is($term->get_term, 'Baum', 'Term is stored');
is($term->get_p_start, 0, 'Position start is stored');
is($term->get_p_end, 56, 'Position end is stored');
is($term->get_o_start, 34, 'Offset start is stored');
is($term->get_o_end, 120, 'Offset end is stored');
is($term->get_payload, '<i>56', 'Payload is stored');

# Expected serialisation: the term, a '$' separator, then '<i>'-prefixed
# numbers (34 and 120 are the offsets set above; the trailing 56s
# presumably stem from p_end and the payload - confirm against to_string).
is($term->to_string, 'Baum$<i>34<i>120<i>56<i>56', 'Serialisation with all fields');

# Case 2: a freshly created term without any setter calls.
ok($term = KorAP::XML::Index::MultiTerm->new('Baum'), 'Create new object');

# Positions and offsets are expected to default to 0, the payload to undef.
is($term->get_term, 'Baum', 'Term is stored');
is($term->get_p_start, 0, 'Position start defaults to 0');
is($term->get_p_end, 0, 'Position end defaults to 0');
is($term->get_o_start, 0, 'Offset start defaults to 0');
is($term->get_o_end, 0, 'Offset end defaults to 0');
is($term->get_payload, undef, 'Payload defaults to undef');

# With nothing but the term set, only the bare term is serialised.
is($term->to_string, 'Baum', 'Serialisation of a bare term');

# Case 3: a term containing '#'.
ok($term = KorAP::XML::Index::MultiTerm->new('Ba#um'), 'Create new object');

# The getter returns the term exactly as passed in ...
is($term->get_term, 'Ba#um', 'Term is stored unescaped');
is($term->get_p_start, 0, 'Position start defaults to 0');
is($term->get_p_end, 0, 'Position end defaults to 0');
is($term->get_o_start, 0, 'Offset start defaults to 0');
is($term->get_o_end, 0, 'Offset end defaults to 0');
is($term->get_payload, undef, 'Payload defaults to undef');

# ... while the serialised form is expected to backslash-escape the '#'.
is($term->to_string, 'Ba\#um', 'Hash sign is escaped in serialisation');

# Case 4: a term containing both '#' and '$', plus a payload.
ok($term = KorAP::XML::Index::MultiTerm->new('Ba#u$m'), 'Create new object');
$term->set_payload('<i>45');

is($term->get_term, 'Ba#u$m', 'Term is stored unescaped');
is($term->get_p_start, 0, 'Position start defaults to 0');
is($term->get_p_end, 0, 'Position end defaults to 0');
is($term->get_o_start, 0, 'Offset start defaults to 0');
is($term->get_o_end, 0, 'Offset end defaults to 0');
is($term->get_payload, '<i>45', 'Payload is stored');

# '#' and '$' inside the term are expected to be escaped, so that the
# unescaped '$' that introduces the payload stays unambiguous.
is($term->to_string, 'Ba\#u\$m$<i>45', 'Special characters are escaped before the payload');

# Second part of the script: diacritics removal provided by the tokenizer.
use_ok('KorAP::XML::Tokenizer');

# The literals below are non-ASCII, so the source must be read as UTF-8.
use utf8;
# Local shorthand for KorAP::XML::Tokenizer::remove_diacritics.
# Forwards all arguments unchanged and returns whatever the tokenizer
# function returns, so the assertions below stay short.
sub remove_diacritics {
  return KorAP::XML::Tokenizer::remove_diacritics(@_);
}

# Basic cases: accents are stripped, while 'ß' is expected to stay untouched.
is(remove_diacritics('äöü'), 'aou', 'Remove diacritics');

is(remove_diacritics('Česká'), 'Ceska', 'Removed diacritics');
is(remove_diacritics('Äößa'), 'Aoßa', 'Removed diacritics');

# From comment in http://archives.miloush.net/michkap/archive/2007/05/14/2629747.html
is(remove_diacritics('ÅåÄäÖö'), 'AaAaOo', 'Check swedish');
# Krawfish::Util::String::_list_props('Łł');
# The Polish and Slovak inputs carry one letter per character of the
# expected result (18 and 28 letters respectively).
is(remove_diacritics('ĄąĆćĘęŁłŃńÓóŚśŹźŻż'), 'AaCcEeLlNnOoSsZzZz', 'Check polish');
is(remove_diacritics('ľščťžýáíéúäôňďĽŠČŤŽÝÁÍÉÚÄÔŇĎ'), 'lsctzyaieuaondLSCTZYAIEUAOND', 'Check slowakish');
is(remove_diacritics('ëőüűŐÜŰ'), 'eouuOUU', 'Check hungarian');
is(remove_diacritics('Ññ¿'), 'Nn¿', 'Check spanish');
is(remove_diacritics('àèòçï'), 'aeoci', 'Check CA?');
is(remove_diacritics('ı'), 'i', 'Check turkish');

# From http://stackoverflow.com/questions/249087/how-do-i-remove-diacritics-accents-from-a-string-in-net#249126
is(remove_diacritics('äáčďěéíľľňôóřŕšťúůýž'), 'aacdeeillnoorrstuuyz', 'Check lowercase Czech/Slovak letters');
is(remove_diacritics('ÄÁČĎĚÉÍĽĽŇÔÓŘŔŠŤÚŮÝŽ'), 'AACDEEILLNOORRSTUUYZ', 'Check uppercase Czech/Slovak letters');
is(remove_diacritics('ÖÜË'), 'OUE', 'Check uppercase diaeresis');
is(remove_diacritics('łŁđĐ'), 'lLdD', 'Check stroked letters');
is(remove_diacritics('ţŢşŞçÇ'), 'tTsScC', 'Check cedilla');
is(remove_diacritics('øı'), 'oi', 'Check slashed o and dotless i');

# Running text: everything that is not an accented letter must survive as is.
is(remove_diacritics(
  q{Bonjour ça va? C'est l'été! Ich möchte ä Ä á à â ê é è ë Ë É ï Ï î í ì ó ò ô ö Ö Ü ü ù ú û Û ý Ý ç Ç ñ Ñ}),
  q{Bonjour ca va? C'est l'ete! Ich mochte a A a a a e e e e E E i I i i i o o o o O U u u u u U y Y c C n N},
  'Check mixed French and German text');

# https://docs.seneca.nl/Smartsite-Docs/Features-Modules/Add-On_Modules/Faceted_Search/FS_Reference/FTS_and_iFTS_technical_background_information/Diacritics_and_Unicode.html
# Latin-1 table: punctuation, symbols and the letters Æ × Þ ß æ ð ÷ þ are
# expected to pass through unchanged. The input needs 'ÀÁÂ' to line up
# with the three leading 'A's of the expected string.
is(remove_diacritics(
  q/!"#$'()*+,-.0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` abcdefghijklmnoprstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿−ÀÁÂ ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ/),
  q/!"#$'()*+,-.0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` abcdefghijklmnoprstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿−AAA AAAÆCEEEEIIIIDNOOOOO×OUUUUYÞßaaaaaaæceeeeiiiiðnooooo÷ouuuuyþy/,
  'Check Latin-1 character table');

# No further non-ASCII literals follow, so the utf8 pragma is switched off.
no utf8;

# The number of tests is not fixed up front; this closes the plan.
done_testing;
__END__