Add Wikipedia emoji template support

Resolves #114

Change-Id: I55d4e8d18d1a290b4f918cabbe19ed757c630037
diff --git a/CHANGELOG.md b/CHANGELOG.md
index baa09ba..64bfb78 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,7 @@
 
 * Fixed genderstern and omission asterisk breaking after hyphens (issue #115)
 * Added emoji complex support (issue #116)
+* Added Wikipedia emoji template support (issue #114)
 
 ## 2.2.5
 
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
index b58aa41..785a215 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
@@ -573,8 +573,8 @@
 
 WORD = ({IRISH_O}?{ALPHANUM}+|[Qq]ur{Q}an)
 
-// pragmas used for anonymization etc.
-PRAGMA = \[_[A-Z\-]+_\]
+// pragmas used for anonymization etc.; optional content parameter must be free of brackets and whitespace (issue #114)
+PRAGMA = \[_[A-Z\-]+(:[^\[\]\s]+)?_\]
 
 %include language-specific_/*$target.language$*/.jflex-macro
 
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
index 402bf85..411f147 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
@@ -373,6 +373,28 @@
         assertEquals(5, tokens.length);
     }
 
+    // Regression test for https://github.com/KorAP/KorAP-Tokenizer/issues/114
+    @Test
+    public void testTokenizerWikipediaEmojiTemplate () {
+        DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
+
+        // Test Wikipedia emoji template from the issue
+        String[] tokens = tok.tokenize("Ein Smiley [_EMOJI:{{S|;)}}_] hier");
+        assertEquals("Ein", tokens[0]);
+        assertEquals("Smiley", tokens[1]);
+        assertEquals("[_EMOJI:{{S|;)}}_]", tokens[2]); // Should be one token
+        assertEquals("hier", tokens[3]);
+        assertEquals(4, tokens.length);
+
+        // Test simple pragma still works
+        tokens = tok.tokenize("Name: [_ANONYMIZED_] Ende");
+        assertEquals("Name", tokens[0]);
+        assertEquals(":", tokens[1]);
+        assertEquals("[_ANONYMIZED_]", tokens[2]); // Should be one token
+        assertEquals("Ende", tokens[3]);
+        assertEquals(4, tokens.length);
+    }
+
     @Test
     // Probably interpreted as HOST
     public void testTokenizerFileExtension1 () {