Add unmodified emojis as lemma annotation
Change-Id: Ibea3dc4d74571fe13dd969894694a59a5bfcf918
diff --git a/src/index.js b/src/index.js
index 78d3c9b..7f2b2ce 100755
--- a/src/index.js
+++ b/src/index.js
@@ -8,6 +8,21 @@
const actionWordRegex = /^:[^:]+:$/;
const wikiEmojiRegex = /^\[_EMOJI:[^\]]+\]$/;
+// Reduce an emoji string to one base code point: skin tone modifiers, zero-width joiners and variation selectors are removed, then the first remaining code point is returned ('' if nothing remains).
+function getBaseEmoji(emoji) {
+ const stripped = emoji
+ // Drop skin tone modifiers (U+1F3FB-U+1F3FF); the `u` flag lets the class match these astral code points as single characters
+ .replace(/[\u{1F3FB}-\u{1F3FF}]/gu, '')
+ // Drop zero-width joiners (U+200D), leaving the parts of a joined sequence as adjacent code points
+ .replace(/\u200D/g, '')
+ // Drop variation selectors (U+FE0E text presentation, U+FE0F emoji presentation)
+ .replace(/[\uFE0E\uFE0F]/g, '');
+
+ // Spreading a string iterates by code point (not UTF-16 unit), so chars[0] is one whole character. NOTE(review): multi-code-point emoji (ZWJ sequences, flags) are cut to their first code point - confirm this is the intended lemma.
+ const chars = [...stripped];
+ return chars.length > 0 ? chars[0] : stripped;
+}
+
const optionDefinitions = [
{ name: 'sparse', alias: 's', type: Boolean, description: 'Print only the files, lines that have POS annotations.'},
{ name: 'help', alias: 'h', type: Boolean, description: 'Print this usage guide.'},
@@ -103,6 +118,12 @@
const columns = line.trim().split('\t');
const word = columns[1];
+ // Guard clause: if the line has no word column (undefined or empty), just output the line as-is
+ if (!word) {
+ process.stdout.write(`${line}\n`);
+ return;
+ }
+
var new_tag = null;
if (word.match(wikiEmojiRegex)) {
new_tag = 'EMOWIKI';
@@ -123,6 +144,11 @@
if (new_tag) {
columns[4] = new_tag;
columns[5] = '_';
+ // For EMOIMG tokens, set lemma to the base emoji (without modifiers)
+ if (new_tag === 'EMOIMG') {
+ columns[2] = getBaseEmoji(word);
+ columns[3] = 'EMOIMG';
+ }
if (global.standalone) {
process.stdout.write(fileheader);
process.stdout.write(header);