#!/usr/bin/env node

// Token-level patterns for computer-mediated-communication (CMC) phenomena.
// Every pattern is anchored at the start of the token.

// ASCII emoticons such as :-) ;) <3 and :shortcode: forms.
// NOTE(review): the `\D` inside the character class of the third alternative
// matches ANY non-digit character, so a token such as "a:" is accepted as an
// emoticon. A literal `D` (as in "D:") was probably intended; confirm before
// changing, because it alters the tagging output.
const emoticonRegex = /^(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)/;
// "#" followed by at least one ASCII letter or digit.
const hashtagRegex = /^#[a-zA-Z0-9]+/;
// ftp://, ftps://, http:// or https:// followed by non-whitespace.
const urlRegex = /^(ftp|http)s?:\/\/[^\s]+/;
// E-mail address. The top-level-domain class is `[A-Za-z]`; a `|` inside a
// character class is a literal pipe, not an alternation, so it must not be
// listed there.
const emailRegex = /^\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/;
// "@" followed by at least one ASCII letter or digit.
const addressRegex = /^@[a-zA-Z0-9]+/;
// A whole token wrapped in colons, e.g. ":lach:".
const actionWordRegex = /^:[^:]+:$/;
// A whole token of the form "[_EMOJI:...]".
const wikiEmojiRegex = /^\[_EMOJI:[^\]]+\]$/;
Marc Kupietzb43a5182024-02-03 18:09:10 +010010
// Load emoji metadata from ./emoji_data.json; entries are looked up by emoji
// string. Falls back to an empty table when the data cannot be loaded.
let emojiData = {};
try {
  emojiData = require('./emoji_data.json');
} catch (e) {
  // A missing file is expected (e.g. during initial setup before the script
  // that produces it has run) and stays silent. Anything else, such as a
  // corrupt JSON file, is reported on stderr so that it does not go
  // unnoticed; stdout is left untouched.
  if (!e || e.code !== 'MODULE_NOT_FOUND') {
    console.error(`Warning: could not load ./emoji_data.json: ${e && e.message}`);
  }
}
18
/**
 * Reduce an emoji (sequence) to its base emoji.
 *
 * Skin tone modifiers (U+1F3FB-U+1F3FF), zero-width joiners (U+200D) and
 * variation selectors (U+FE0E, U+FE0F) are removed, then the leading emoji of
 * what remains is returned. Two kinds of sequences only form an emoji
 * together and are therefore kept as a unit instead of being cut down to
 * their first code point:
 *   - flags: a pair of regional indicator symbols (U+1F1E6-U+1F1FF)
 *   - keycaps: a digit, "#" or "*" followed by U+20E3
 *
 * @param {string} emoji - the emoji token
 * @returns {string} the base emoji, or the stripped input if nothing is left
 */
function getBaseEmoji(emoji) {
  const stripped = emoji
    // Remove skin tone modifiers (U+1F3FB-U+1F3FF)
    .replace(/[\u{1F3FB}-\u{1F3FF}]/gu, '')
    // Remove zero-width joiners (U+200D)
    .replace(/\u200D/g, '')
    // Remove variation selectors (U+FE0F, U+FE0E)
    .replace(/[\uFE0E\uFE0F]/g, '');

  // A lone regional indicator is not an emoji; keep the pair.
  const flag = /^[\u{1F1E6}-\u{1F1FF}]{2}/u.exec(stripped);
  if (flag) {
    return flag[0];
  }

  // Keep the combining enclosing keycap with its base character.
  const keycap = /^[0-9#*]\u20E3/.exec(stripped);
  if (keycap) {
    return keycap[0];
  }

  // Spread iterates by code point, so astral emoji are not split in half.
  const [first] = [...stripped];
  return first !== undefined ? first : stripped;
}
33
// Command-line options. The same list is used for argument parsing and for
// the "Options" section of the usage text.
const optionDefinitions = [
  { name: 'sparse', alias: 's', type: Boolean, description: 'Print only the files, lines that have POS annotations.' },
  { name: 'help', alias: 'h', type: Boolean, description: 'Print this usage guide.' },
];

// Sections of the usage text shown for --help.
const sections = [
  {
    header: 'conllu-cmc',
    content: 'Reads CoNLL-U format from stdin and annotates emojis, emoticons, hashtags, URLs, email addresses, @addresses, and action words. Writes CoNLL-U format to stdout.',
  },
  {
    header: 'Synopsis',
    content: '$ conllu-cmc [-s] < input.conllu > output.conllu',
  },
  {
    header: 'Options',
    optionList: optionDefinitions,
  },
];
53
const getUsage = require('command-line-usage');
const commandLineArgs = require('command-line-args');

// Parse the command line. Invalid arguments are a failure: the error message
// and the usage text go to stderr (stdout is reserved for CoNLL-U output) and
// the process exits with a non-zero status so that callers can detect it.
let options;
try {
  options = commandLineArgs(optionDefinitions);
} catch (e) {
  console.error(e.message);
  console.error(getUsage(sections));
  process.exit(1);
}

// An explicit --help request prints the usage text and exits successfully.
if (options.help) {
  const usage = getUsage(sections);
  console.log(usage);
  process.exit(0);
}
70
const EmojiRegex = require('emoji-regex');
const emojiRegex = EmojiRegex();
const readline = require('readline');

// State shared across input lines by parseConllu:
//   header     - comment lines of the current sentence held back in sparse mode
//   fileheader - "# filename" / "# text_id" lines held back in sparse mode
//   standalone - true once sparse output has been switched on
// fileheader is initialised here so that appending to it or printing it
// before any "# filename" line has been seen works on an empty string.
global.header = '';
global.fileheader = '';
global.standalone = false;

// Line-by-line reader over stdin; terminal mode is disabled.
const rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout,
  terminal: false,
});
84
85
/**
 * Decide which CMC tag, if any, applies to a token.
 *
 * The checks run in a fixed order and the first match wins.
 *
 * @param {string} word - token form (second CoNLL-U column)
 * @param {string} upos - current value of the fourth column ('' if absent)
 * @returns {string|null} the tag, or null if the token is not a CMC item
 */
function detectCmcTag(word, upos) {
  if (word.match(wikiEmojiRegex)) return 'EMOWIKI';
  // String.prototype.match is used on purpose: it does not depend on the
  // lastIndex state a global regex would carry between calls.
  if (word.match(emojiRegex)) return 'EMOIMG';
  if (word.match(actionWordRegex)) return 'AKW';
  if (word.match(emoticonRegex)) return 'EMOASC';
  if (word.match(hashtagRegex)) return 'HST';
  if (word.match(urlRegex)) return 'URL';
  if (word.match(emailRegex)) return 'EML';
  // Tokens already analysed as proper names are not re-tagged as addresses.
  if (!/^(NE|PROPN)/.test(upos) && word.match(addressRegex)) return 'ADR';
  return null;
}

/**
 * Process one line of CoNLL-U input and write the result to stdout.
 *
 * - A "# foundry = base" line is rewritten to "# foundry = cmc"; other
 *   foundry lines pass through. With --sparse, seeing the base foundry line
 *   switches on sparse ("standalone") output.
 * - Normal mode: every line is echoed; token rows recognised as CMC items
 *   get the tag in column 5 and "_" in column 6.
 * - Sparse mode: only tagged token rows are printed, preceded (once) by the
 *   held-back file and sentence comment lines. "# eof"/"# eot" lines are
 *   always printed, and a blank line is printed after a sentence whose
 *   header buffer is empty.
 * - EMOIMG rows additionally get the base emoji as lemma, EMOIMG in column 4
 *   and, if metadata is available, a g=|s=|q=|v=|n= string in column 6.
 *
 * @param {string} line - one input line without its trailing newline
 * @returns {void}
 */
function parseConllu(line) {
  // Anchored so that a token row whose form is e.g. "#foundry" is not
  // mistaken for the foundry comment line.
  if (/^#\s*foundry/.test(line)) {
    if (/=\s*base/.test(line)) {
      if (options.sparse) {
        global.standalone = true;
      }
      process.stdout.write('# foundry = cmc\n');
    } else {
      process.stdout.write(`${line}\n`);
    }
    return;
  }

  if (global.standalone) {
    if (/^#\s*filename/.test(line)) {
      global.fileheader = `${line}\n`;
      return;
    }
    if (/^#\s*text_id/.test(line)) {
      // Tolerate a text_id line that is not preceded by a filename line.
      global.fileheader = `${global.fileheader || ''}${line}\n`;
      return;
    }
    if (/^#\s*eo[ft]/.test(line)) {
      process.stdout.write(`${line}\n`);
      return;
    }
    if (line.startsWith('#')) {
      global.header = `${global.header || ''}${line}\n`;
      return;
    }
    if (line.trim() === '') {
      // An empty header buffer at the end of a sentence gets the
      // terminating blank line.
      if (global.header === '') {
        process.stdout.write('\n');
      }
      global.header = '';
      return;
    }
  } else if (!/^\d/.test(line)) {
    process.stdout.write(`${line}\n`);
    return;
  }

  const columns = line.trim().split('\t');
  const word = columns[1];
  // Guard clause: without a token form there is nothing to classify.
  if (!word) {
    process.stdout.write(`${line}\n`);
    return;
  }

  // Rows with fewer than four columns have no tag to inspect.
  const newTag = detectCmcTag(word, columns[3] || '');
  if (!newTag) {
    if (!global.standalone) {
      process.stdout.write(`${line}\n`);
    }
    return;
  }

  columns[4] = newTag;
  columns[5] = '_';
  if (newTag === 'EMOIMG') {
    // Lemma is the base emoji (without modifiers).
    const base = getBaseEmoji(word);
    columns[2] = base;
    columns[3] = 'EMOIMG';

    // Metadata lookup: exact form first, then the base emoji.
    const data = emojiData[word] || emojiData[base];
    if (data) {
      // g=group|s=subgroup|q=qualified|v=version|n=name
      columns[5] = `g=${data.g}|s=${data.s}|q=${data.q}|v=${data.v}|n=${data.n}`;
    }
  }

  if (global.standalone) {
    // Emit the held-back context once, then clear it so that further hits in
    // the same sentence/file do not repeat it.
    process.stdout.write(global.fileheader || '');
    process.stdout.write(global.header || '');
    global.header = '';
    global.fileheader = '';
  }
  process.stdout.write(`${columns.join('\t')}\n`);
}
180
// Annotate each input line as it arrives.
rl.on('line', parseConllu);
// important to exit, otherwise the process will hang
rl.on('close', () => process.exit(0));