blob: 9ea5bcd63e0bd4b7dacede5fd0a5d3189ffa8410 [file] [log] [blame]
use strict;
use warnings;
use utf8;
use Test::More;
use Benchmark ':hireswallclock';
use lib 'lib', '../lib';

use_ok('KorAP::XML::Index::MultiTerm');

# Case 1: every constructor argument is supplied explicitly.
ok(my $term = KorAP::XML::Index::MultiTerm->new(
  term    => 'Baum',
  p_start => 0,
  p_end   => 56,
  payload => '<i>56',
  o_start => 34,
  o_end   => 120
), 'Create new object');

# Each getter must hand back exactly the value passed to new().
is($term->get_term,    'Baum',  'Full term: term');
is($term->get_p_start, 0,       'Full term: p_start');
is($term->get_p_end,   56,      'Full term: p_end');
is($term->get_o_start, 34,      'Full term: o_start');
is($term->get_o_end,   120,     'Full term: o_end');
is($term->get_payload, '<i>56', 'Full term: payload');
is($term->to_string, 'Baum$<i>34<i>120<i>56<i>56', 'Full term: serialization');
Akron1622dd92015-12-09 22:34:26 +010026
# Case 2: only the term is supplied; the remaining accessors are checked
# for the values they report when nothing else was passed.
ok($term = KorAP::XML::Index::MultiTerm->new(
  term => 'Baum'
), 'Create new object');

is($term->get_term,    'Baum', 'Term only: term');
is($term->get_p_start, 0,      'Term only: p_start');
is($term->get_p_end,   0,      'Term only: p_end');
is($term->get_o_start, 0,      'Term only: o_start');
is($term->get_o_end,   0,      'Term only: o_end');
is($term->get_payload, undef,  'Term only: payload');
is($term->to_string,   'Baum', 'Term only: serialization');
38
# Case 3: the term contains '#'. The getter returns it verbatim, while the
# serialized form is expected to carry a backslash in front of it.
ok($term = KorAP::XML::Index::MultiTerm->new(
  term => 'Ba#um'
), 'Create new object');

is($term->get_term,    'Ba#um',  'Hash in term: term');
is($term->get_p_start, 0,        'Hash in term: p_start');
is($term->get_p_end,   0,        'Hash in term: p_end');
is($term->get_o_start, 0,        'Hash in term: o_start');
is($term->get_o_end,   0,        'Hash in term: o_end');
is($term->get_payload, undef,    'Hash in term: payload');
is($term->to_string,   'Ba\#um', 'Hash in term: serialization');
50
# Case 4: the term contains both '#' and '$' and a payload is attached.
# In the expected serialization both characters inside the term are
# backslash-escaped, and an unescaped '$' precedes the payload.
ok($term = KorAP::XML::Index::MultiTerm->new(
  term    => 'Ba#u$m',
  payload => '<i>45'
), 'Create new object');

is($term->get_term,    'Ba#u$m', 'Hash and dollar: term');
is($term->get_p_start, 0,        'Hash and dollar: p_start');
is($term->get_p_end,   0,        'Hash and dollar: p_end');
is($term->get_o_start, 0,        'Hash and dollar: o_start');
is($term->get_o_end,   0,        'Hash and dollar: o_end');
is($term->get_payload, '<i>45',  'Hash and dollar: payload');
is($term->to_string, 'Ba\#u\$m$<i>45', 'Hash and dollar: serialization');
63
use_ok('KorAP::XML::Tokenizer');

# The string literals in the checks below are written as UTF-8 source text.
use utf8;

# Local shorthand for KorAP::XML::Tokenizer::remove_diacritics.
# All arguments are forwarded unchanged and the result is returned as is.
sub remove_diacritics {
  return KorAP::XML::Tokenizer::remove_diacritics(@_);
}

is(remove_diacritics('äöü'), 'aou', 'Remove diacritics');

is(remove_diacritics('Česká'), 'Ceska', 'Removed diacritics');
is(remove_diacritics('Äößa'), 'Aoßa', 'Removed diacritics');
73
# From comment in http://archives.miloush.net/michkap/archive/2007/05/14/2629747.html
is(remove_diacritics('ÅåÄäÖö'), 'AaAaOo', 'Check swedish');
# Krawfish::Util::String::_list_props('Łł');
# Input is the upper/lower pair for each letter in the expected string.
is(remove_diacritics('ĄąĆćĘęŁłŃńÓóŚśŹźŻż'), 'AaCcEeLlNnOoSsZzZz', 'Check polish');
# Lowercase run followed by the matching uppercase run, one input
# character per character of the expected string.
is(remove_diacritics('ľščťžýáíéúäôňďĽŠČŤŽÝÁÍÉÚÄÔŇĎ'), 'lsctzyaieuaondLSCTZYAIEUAOND', 'Check slowakish');
is(remove_diacritics('ëőüűŐÜŰ'), 'eouuOUU', 'Check hungarian');
# The inverted question mark is expected to pass through unchanged.
is(remove_diacritics('Ññ¿'), 'Nn¿', 'Check spanish');
is(remove_diacritics('àèòçï'), 'aeoci', 'Check CA?');
# Dotless i is expected to come back as a plain 'i'.
is(remove_diacritics('ı'), 'i', 'Check turkish');
83
# From http://stackoverflow.com/questions/249087/how-do-i-remove-diacritics-accents-from-a-string-in-net#249126
is(remove_diacritics('äáčďěéíľľňôóřŕšťúůýž'), 'aacdeeillnoorrstuuyz',
   'Lowercase accented letters');
is(remove_diacritics('ÄÁČĎĚÉÍĽĽŇÔÓŘŔŠŤÚŮÝŽ'), 'AACDEEILLNOORRSTUUYZ',
   'Uppercase accented letters');
is(remove_diacritics('ÖÜË'), 'OUE', 'Uppercase letters with diaeresis');
is(remove_diacritics('łŁđĐ'), 'lLdD', 'Letters with stroke');
is(remove_diacritics('ţŢşŞçÇ'), 'tTsScC', 'Letters with cedilla');
is(remove_diacritics('øı'), 'oi', 'Slashed o and dotless i');

# Running text: everything that is not an accented letter (spaces,
# punctuation, apostrophes) is expected to come back untouched.
is(remove_diacritics(
  q{Bonjour ça va? C'est l'été! Ich möchte ä Ä á à â ê é è ë Ë É ï Ï î í ì ó ò ô ö Ö Ü ü ù ú û Û ý Ý ç Ç ñ Ñ}),
   q{Bonjour ca va? C'est l'ete! Ich mochte a A a a a e e e e E E i I i i i o o o o O U u u u u U y Y c C n N},
   'Mixed sentence with accented letters');
95
# https://docs.seneca.nl/Smartsite-Docs/Features-Modules/Add-On_Modules/Faceted_Search/FS_Reference/FTS_and_iFTS_technical_background_information/Diacritics_and_Unicode.html
# Printable ASCII and Latin-1 characters in one string. ASCII, symbols and
# the letters Æ × Þ ß æ ð ÷ þ are expected to pass through unchanged; the
# remaining accented letters are expected to lose their marks.
# NOTE(review): the input carries 'Â' before the space so that it lines up
# with the three 'A's in the expected string — confirm against upstream.
is(remove_diacritics(
  q/!"#$'()*+,-.0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` abcdefghijklmnoprstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿−ÀÁÂ ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ/),
   q/!"#$'()*+,-.0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` abcdefghijklmnoprstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿−AAA AAAÆCEEEEIIIIDNOOOOO×OUUUUYÞßaaaaaaæceeeeiiiiðnooooo÷ouuuuyþy/,
   'ASCII and Latin-1 table');

no utf8;

done_testing;
__END__