Add support for Wikipedia Emoji Templates: EMOWIKI
diff --git a/Readme.md b/Readme.md
index 7f59850..5c7a9f4 100644
--- a/Readme.md
+++ b/Readme.md
@@ -1,6 +1,6 @@
# conllu2cmc
-Reads CoNLL-U format from stdin and annotates emojis, emoticons, hashtags, URLs, email addresses, action words, and @names with their corresponding STTS-IBK POS tag (Beißwenger/Bartsch/Evert/Würzner 2016). Writes CoNLL-U format to stdout.
+Reads CoNLL-U format from stdin and annotates emojis, emoticons, hashtags, URLs, email addresses, action words, @names, and Wikipedia emoji templates with their corresponding STTS-IBK POS tag (Beißwenger/Bartsch/Evert/Würzner 2016). Writes CoNLL-U format to stdout.
## Usage
diff --git a/src/index.js b/src/index.js
index ab05072..370a2ba 100755
--- a/src/index.js
+++ b/src/index.js
@@ -6,6 +6,7 @@
const emailRegex = /^\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/;
const addressRegex = /^@[a-zA-Z0-9]+/;
const actionWordRegex = /^:[^:]+:$/;
+const wikiEmojiRegex = /^\[_EMOJI:[^\]]+\]$/; // whole-token match: "[_EMOJI:" + one or more non-"]" characters + closing "]", e.g. [_EMOJI:{{cool}}_]
const optionDefinitions = [
{ name: 'sparse', alias: 's', type: Boolean, description: 'Print only the files, lines that have POS annotations.'},
@@ -103,7 +104,9 @@
const word = columns[1];
var new_tag = null;
- if (word.match(emojiRegex)) {
+ if (word.match(wikiEmojiRegex)) {
+ new_tag = 'EMOWIKI';
+ } else if (word.match(emojiRegex)) {
new_tag = 'EMOIMG';
} else if(word.match(actionWordRegex)) {
new_tag = 'AKW';
diff --git a/test/test.js b/test/test.js
index 4f18d56..a6be349 100644
--- a/test/test.js
+++ b/test/test.js
@@ -40,4 +40,26 @@
expect(lines_count).toBe(6202);
done();
});
+ test('Regression test for issue #114: Wikipedia emoji templates', (done) => {
+ // Pipe a three-token CoNLL-U sentence through the CLI; tokens 1 and 3 are Wikipedia emoji templates, token 2 is an ordinary word
+ const testInput = `# foundry = base
+# text_id = test-114
+# text = [_EMOJI:{{S|;)}}_] and [_EMOJI:{{cool}}_]
+1 [_EMOJI:{{S|;)}}_] _ _ _ _ _ _ _ _
+2 and _ CCONJ _ _ _ _ _ _
+3 [_EMOJI:{{cool}}_] _ _ _ _ _ _ _ _
+
+`;
+ const { execSync } = require('child_process');
+ const stdout = execSync('node src/index.js', { input: testInput }).toString(); // no CLI flags; testInput is supplied as the child's stdin
+
+ // Each template token must keep its form, leave the following column as "_", and carry EMOWIKI in the two columns after that
+ expect(stdout).toContain('[_EMOJI:{{S|;)}}_]\t_\tEMOWIKI\tEMOWIKI');
+ expect(stdout).toContain('[_EMOJI:{{cool}}_]\t_\tEMOWIKI\tEMOWIKI');
+
+ // EMOWIKI must appear nowhere else in the output: two tag columns (0-based indices 3 and 4) per template token
+ var emowiki_count = (stdout.match(/EMOWIKI/g) || []).length;
+ expect(emowiki_count).toBe(4); // 2 templates × 2 columns = 4
+ done();
+ });
});