Omission words cannot start with whitespace
Fixes a bug that interpreted a single quotation mark at the beginning
of a word not as a token, but as the beginning of an omission.
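
The negated character class before the apostrophe in the OMISSIONWORD
rule now also excludes {WHITESPACE}, so the rule can no longer match
across a preceding space. A leading quotation mark is then emitted as
its own token, as exercised by the added test case, roughly:

    new DerekoDfaTokenizer_de().tokenize("'Ddorf 'Kudamm")
    // yields: "'", "Ddorf", "'", "Kudamm"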
Change-Id: I97ca48755eeecdee1029e90da4df72121d05c688
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
index 1bc4599..c3abeb6 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
@@ -534,7 +534,7 @@
EMOTICON = ( [<>]?[BX;8:=][o\-\']?[DdPp()\/3>oO*]+|<\/?3+|ಠ_ಠ|\(-.-\)|\(T_T\)|\(♥_♥\)|\)\':|\)-:|\(-:|\)=|\)o:|\)x|:\'C|:\/|:<|:C|:[|=\(|=\)|=D|=P|>:|D\':|D:|\:|]:|x\(|\^\^|o.O|oO|\\{o}\/|\\m\/|:;\)\)|_\)\)|\*_\*|._.|:wink:|>_<|\*<:-\)|[:;]\)|[;;]" "\))
LC_CONSONANT = [bcdfgjklmnpqrstvwxs]
-OMISSIONWORD = ({p}resqu'île|{a}ujourd\'hui|{q}uelqu\'une?|[^\P{L}Qq]{LETTER}?[^dcjlmnstDCJLNMST][\'`]|{LETTER}+\*\*+{LETTER}*|{LETTER}+\*{LETTER}+|!(!({LETTER}+[\'`]{LC_CONSONANT})|{INIT_CLITIC})){LETTER}*
+OMISSIONWORD = ({p}resqu'île|{a}ujourd\'hui|{q}uelqu\'une?|[^\P{L}Qq]{LETTER}?[^dcjlmnstDCJLNMST{WHITESPACE}][\'`]|{LETTER}+\*\*+{LETTER}*|{LETTER}+\*{LETTER}+|!(!({LETTER}+[\'`]{LC_CONSONANT})|{INIT_CLITIC})){LETTER}*
EXTENSION = (html?|doc|docx?|pptx?|xlsx?|pdf|jpe?g|mp[34]|ogg|png|avi|txt|xml|aac|HTML?|DOCX?|PPTX?|XLSX?|GIF|JPE?G|TXT)
FNAME = ({LETTER}:[\\/]{LETTER})?({LETTER}|[\\_/-])+\.{EXTENSION}
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
index 1f2e63e..d98d768 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
@@ -581,6 +581,16 @@
}
@Test
+ public void wordsCannotStartWithOmissions () {
+ DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
+ String[] tokens = tok.tokenize("'Ddorf 'Kudamm");
+ assertEquals("'", tokens[0]);
+ assertEquals("Ddorf", tokens[1]);
+ assertEquals("'", tokens[2]);
+ assertEquals("Kudamm", tokens[3]);
+ }
+
+ @Test
public void germanTokenizerDoesNOTSeparateGermanContractions () {
DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
String[] tokens = tok.tokenize("mach's macht's was'n ist's haste willste kannste biste kriegste");