Omission words cannot start with whitespace

Fixes a bug that interpreted a single quotation mark at the beginning
of a word not as a token, but as the beginning of an omission.
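
A minimal sketch of the behavior after this fix, mirroring the new
test case added below:

    DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
    String[] tokens = tok.tokenize("'Ddorf 'Kudamm");
    // each leading apostrophe is now emitted as a token of its own:
    // {"'", "Ddorf", "'", "Kudamm"}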

Change-Id: I97ca48755eeecdee1029e90da4df72121d05c688
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
index 1bc4599..c3abeb6 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
@@ -534,7 +534,7 @@
 EMOTICON = ( [<>]?[BX;8:=][o\-\']?[DdPp()\/3>oO*]+|<\/?3+|ಠ_ಠ|\(-.-\)|\(T_T\)|\(♥_♥\)|\)\':|\)-:|\(-:|\)=|\)o:|\)x|:\'C|:\/|:<|:C|:[|=\(|=\)|=D|=P|>:|D\':|D:|\:|]:|x\(|\^\^|o.O|oO|\\{o}\/|\\m\/|:;\)\)|_\)\)|\*_\*|._.|:wink:|>_<|\*<:-\)|[:;]\)|[;;]" "\))
 
 LC_CONSONANT = [bcdfgjklmnpqrstvwxs]
-OMISSIONWORD = ({p}resqu'île|{a}ujourd\'hui|{q}uelqu\'une?|[^\P{L}Qq]{LETTER}?[^dcjlmnstDCJLNMST][\'`]|{LETTER}+\*\*+{LETTER}*|{LETTER}+\*{LETTER}+|!(!({LETTER}+[\'`]{LC_CONSONANT})|{INIT_CLITIC})){LETTER}*
+OMISSIONWORD = ({p}resqu'île|{a}ujourd\'hui|{q}uelqu\'une?|[^\P{L}Qq]{LETTER}?[^dcjlmnstDCJLNMST{WHITESPACE}][\'`]|{LETTER}+\*\*+{LETTER}*|{LETTER}+\*{LETTER}+|!(!({LETTER}+[\'`]{LC_CONSONANT})|{INIT_CLITIC})){LETTER}*
 
 EXTENSION = (html?|doc|docx?|pptx?|xlsx?|pdf|jpe?g|mp[34]|ogg|png|avi|txt|xml|aac|HTML?|DOCX?|PPTX?|XLSX?|GIF|JPE?G|TXT)
 FNAME = ({LETTER}:[\\/]{LETTER})?({LETTER}|[\\_/-])+\.{EXTENSION}
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
index 1f2e63e..d98d768 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
@@ -581,6 +581,16 @@
     }
 
     @Test
+    public void wordsCannotStartWithOmissions () {
+        DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
+        String[] tokens = tok.tokenize("'Ddorf 'Kudamm");
+        assertEquals("'", tokens[0]);
+        assertEquals("Ddorf", tokens[1]);
+        assertEquals("'", tokens[2]);
+        assertEquals("Kudamm", tokens[3]);
+    }
+
+    @Test
     public void germanTokenizerDoesNOTSeparateGermanContractions () {
         DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
         String[] tokens = tok.tokenize("mach's macht's was'n ist's haste willste kannste biste kriegste");