Added preliminary support for diacritic insensitivity
Change-Id: I4852a38c68a54159af3553022d2cad3dd0ab0f82
diff --git a/Changes b/Changes
index 2e6ea97..808cc10 100644
--- a/Changes
+++ b/Changes
@@ -10,6 +10,7 @@
- Fixed handling of prefixes for text sigles.
- Support for MarMoT.
- Fix case insensitivity.
+ - Added preliminary support for diacritic insensitivity.
0.24 2016-12-21
- Added --base-sentences and --base-paragraphs options
diff --git a/Makefile.PL b/Makefile.PL
index 966bda5..4473073 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -29,6 +29,7 @@
'Directory::Iterator' => 0,
'Benchmark' => 0,
'Unicode::CaseFold' => 1.00,
+ 'Unicode::Normalize' => 0,
'Carp' => 0,
'strict' => 0,
'warnings' => 0,
diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index 4889417..a7b0fc8 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm
@@ -12,6 +12,7 @@
use KorAP::XML::Tokenizer::Tokens;
use KorAP::XML::Index::MultiTermTokenStream;
use Unicode::CaseFold;
+use Unicode::Normalize qw/getCombinClass normalize/;
use List::MoreUtils 'uniq';
use JSON::XS;
use Log::Log4perl;
@@ -577,6 +578,20 @@
};
+# Strip diacritics from a Unicode string; the result is returned in NFC
+sub remove_diacritics {
+  use utf8;
+  my ($string) = @_;
+
+  # Decompose canonically (NFD), then drop the marks in U+0300..U+036F
+  (my $stripped = normalize('D', $string)) =~ s/\p{InCombiningDiacriticalMarks}//g;
+
+  # Letters that do not decompose into base + mark are mapped by hand
+  $stripped =~ tr/ıŁłđĐÐØø/iLldDDOo/;
+  return normalize('C', $stripped);
+}
+
+
1;
@@ -760,4 +775,18 @@
An optional parameter C<skip> allows for skipping the process.
+
+=head1 FUNCTIONS
+
+=head2 remove_diacritics
+
+ # Returns 'aOu'
+ remove_diacritics('äÖü');
+
+Remove diacritics from a string.
+This uses a two-step approach: first it decomposes the string (Unicode
+Normalization Form D) and removes the combining diacritical marks, then it
+maps a fixed list of arguable diacritics that do not decompose to base letters.
+It returns the string in Unicode Normalization Form C.
+
=cut
diff --git a/t/tokens.t b/t/tokens.t
index cb7f623..ce083dd 100644
--- a/t/tokens.t
+++ b/t/tokens.t
@@ -61,5 +61,44 @@
is($term->payload, '<i>45');
is($term->to_string, 'Ba\#u\$m$<i>45');
+use_ok('KorAP::XML::Tokenizer');
+
+use utf8;
+sub remove_diacritics { KorAP::XML::Tokenizer::remove_diacritics(@_) };
+
+is(remove_diacritics('äöü'), 'aou', 'Remove diacritics');
+
+is(remove_diacritics('Česká'), 'Ceska', 'Removed diacritics');
+is(remove_diacritics('Äößa'), 'Aoßa', 'Removed diacritics');
+
+# From comment in http://archives.miloush.net/michkap/archive/2007/05/14/2629747.html
+is(remove_diacritics('ÅåÄäÖö'), 'AaAaOo', 'Check swedish');
+# Krawfish::Util::String::_list_props('Łł');
+is(remove_diacritics('ĄąĆćĘęŁłŃńÓóŚśŹźŻż'), 'AaCcEeLlNnOoSsZzZz', 'Check polish');
+is(remove_diacritics('ľščťžýáíéúäôňďĽŠČŤŽÝÁÍÉÚÄÔŇĎ'), 'lsctzyaieuaondLSCTZYAIEUAOND', 'Check slowakish');
+is(remove_diacritics('ëőüűŐÜŰ'), 'eouuOUU', 'Check hungarian');
+is(remove_diacritics('Ññ¿'), 'Nn¿', 'Check spanish');
+is(remove_diacritics('àèòçï'), 'aeoci', 'Check CA?');
+is(remove_diacritics('ı'), 'i', 'Check turkish');
+
+# From http://stackoverflow.com/questions/249087/how-do-i-remove-diacritics-accents-from-a-string-in-net#249126
+is(remove_diacritics('äáčďěéíľľňôóřŕšťúůýž'), 'aacdeeillnoorrstuuyz', 'Check lowercase letters');
+is(remove_diacritics('ÄÁČĎĚÉÍĽĽŇÔÓŘŔŠŤÚŮÝŽ'), 'AACDEEILLNOORRSTUUYZ', 'Check uppercase letters');
+is(remove_diacritics('ÖÜË'), 'OUE', 'Check diaeresis');
+is(remove_diacritics('łŁđĐ'), 'lLdD', 'Check stroke');
+is(remove_diacritics('ţŢşŞçÇ'), 'tTsScC', 'Check cedilla');
+is(remove_diacritics('øı'), 'oi', 'Check special cases');
+
+is(remove_diacritics(
+  q{Bonjour ça va? C'est l'été! Ich möchte ä Ä á à â ê é è ë Ë É ï Ï î í ì ó ò ô ö Ö Ü ü ù ú û Û ý Ý ç Ç ñ Ñ}),
+  q{Bonjour ca va? C'est l'ete! Ich mochte a A a a a e e e e E E i I i i i o o o o O U u u u u U y Y c C n N}, 'Check sentences');
+
+# https://docs.seneca.nl/Smartsite-Docs/Features-Modules/Add-On_Modules/Faceted_Search/FS_Reference/FTS_and_iFTS_technical_background_information/Diacritics_and_Unicode.html
+is(remove_diacritics(
+  q/!"#$'()*+,-.0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` abcdefghijklmnoprstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿−ÀÁÂ ÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ/),
+  q/!"#$'()*+,-.0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` abcdefghijklmnoprstuvwxyz{|}~¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿−AAA AAAÆCEEEEIIIIDNOOOOO×OUUUUYÞßaaaaaaæceeeeiiiiðnooooo÷ouuuuyþy/, 'Check Latin-1 range');
+
+no utf8;
+
done_testing;
__END__