Add Wikipedia emoji template support
Resolves #114
Change-Id: I55d4e8d18d1a290b4f918cabbe19ed757c630037
diff --git a/CHANGELOG.md b/CHANGELOG.md
index baa09ba..64bfb78 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,7 @@
* Fixed genderstern and omission asterisk breaking after hyphens (issue #115)
* Added emoji complex support (issue #116)
+* Added Wikipedia emoji template support (issue #114)
## 2.2.5
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
index b58aa41..785a215 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
@@ -573,8 +573,8 @@
WORD = ({IRISH_O}?{ALPHANUM}+|[Qq]ur{Q}an)
-// pragmas used for anonymization etc.
-PRAGMA = \[_[A-Z\-]+_\]
+// Pragmas used for anonymization etc., e.g. [_ANONYMIZED_]; may carry a ":"-separated content parameter free of square brackets, e.g. [_EMOJI:{{S|;)}}_] (issue #114)
+PRAGMA = \[_[A-Z\-]+(:[^\[\]]+)?_\]
%include language-specific_/*$target.language$*/.jflex-macro
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
index 402bf85..411f147 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
@@ -373,6 +373,28 @@
assertEquals(5, tokens.length);
}
+ // Regression test for https://github.com/KorAP/KorAP-Tokenizer/issues/114 (pragmas with a content parameter)
+ @Test
+ public void testTokenizerWikipediaEmojiTemplate () {
+ DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
+
+ // A pragma whose content parameter is a Wikipedia emoji template must not be split
+ String[] tokens = tok.tokenize("Ein Smiley [_EMOJI:{{S|;)}}_] hier");
+ assertEquals("Ein", tokens[0]);
+ assertEquals("Smiley", tokens[1]);
+ assertEquals("[_EMOJI:{{S|;)}}_]", tokens[2]); // Whole pragma, template included, is one token
+ assertEquals("hier", tokens[3]);
+ assertEquals(4, tokens.length);
+
+ // A plain pragma without a content parameter must still come out as a single token
+ tokens = tok.tokenize("Name: [_ANONYMIZED_] Ende");
+ assertEquals("Name", tokens[0]);
+ assertEquals(":", tokens[1]);
+ assertEquals("[_ANONYMIZED_]", tokens[2]); // Unchanged behavior: one token
+ assertEquals("Ende", tokens[3]);
+ assertEquals(4, tokens.length);
+ }
+
@Test
// Probably interpreted as HOST
public void testTokenizerFileExtension1 () {