blob: 3d34cd25e7cdad1158dfe08038af02b3c53033fb [file] [log] [blame]
Marc Kupietzb43a5182024-02-03 18:09:10 +01001#!/usr/bin/env node
2
// Token-level patterns for computer-mediated-communication (CMC) phenomena.
// Every pattern is anchored at the start of the token.

// ASCII emoticons: ":shortcode:", hearts ("<3", "</3"), mouth-first faces such
// as "(-:" or "D:", and eyes-first faces such as ":-)" or "8-D". The lookahead
// requires whitespace, one of "!.?" or the end of the token after the emoticon.
// The mouth-first class lists a literal "D"; an escaped "\D" there would accept
// every non-digit character and tag tokens like "x:" as emoticons.
const emoticonRegex = /^(\:\w+\:|\<[\/\\]?3|[\(\)\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)/;
// "#" followed by ASCII letters/digits containing at least one letter.
const hashtagRegex = /^#[a-zA-Z0-9]*[a-zA-Z][a-zA-Z0-9]*$/;
// ftp://, ftps://, http:// or https:// followed by non-whitespace.
const urlRegex = /^(ftp|http)s?:\/\/[^\s]+/;
// Email address; the top-level domain consists of at least two ASCII letters.
const emailRegex = /^\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/;
// "@" followed by ASCII letters/digits.
const addressRegex = /^@[a-zA-Z0-9]+/;
// Whole token enclosed in colons, e.g. ":grins:".
const actionWordRegex = /^:[^:]+:$/;
// Whole token of the form "[_EMOJI:...]".
const wikiEmojiRegex = /^\[_EMOJI:[^\]]+\]$/;
Marc Kupietzb43a5182024-02-03 18:09:10 +010010
// Load emoji metadata; entries are looked up by emoji string.
let emojiData = {};
try {
  emojiData = require('./emoji_data.json');
} catch (e) {
  // A missing file is expected (e.g. during initial setup before the data has
  // been generated) and stays silent. Any other failure, such as malformed
  // JSON, would otherwise silently drop all emoji metadata, so report it on
  // stderr and keep running with the empty fallback.
  if (!e || e.code !== 'MODULE_NOT_FOUND') {
    console.error(`Warning: could not load emoji_data.json: ${e && e.message}`);
  }
}
18
/**
 * Reduce an emoji (sequence) to its base emoji.
 *
 * Skin tone modifiers (U+1F3FB-U+1F3FF), zero-width joiners (U+200D) and
 * variation selectors (U+FE0E, U+FE0F) are removed, then the first code point
 * of the remainder is returned. Two cases keep a second code point because a
 * single code point would not be the emoji any more:
 *  - flags: a pair of regional indicators (U+1F1E6-U+1F1FF),
 *  - keycaps: a base character followed by U+20E3.
 *
 * @param {string} emoji - Emoji or emoji sequence.
 * @returns {string} The base emoji. If stripping leaves nothing (e.g. the
 *   input is only a skin tone modifier), the input is returned unchanged so
 *   callers never receive an empty string for a non-empty input.
 */
function getBaseEmoji(emoji) {
  const stripped = emoji
    .replace(/[\u{1F3FB}-\u{1F3FF}]/gu, '')
    .replace(/\u200D/g, '')
    .replace(/[\uFE0E\uFE0F]/g, '');

  // Spread iterates by code point, so astral-plane emoji are not split into
  // surrogate halves.
  const [first, second] = [...stripped];
  if (first === undefined) {
    return emoji;
  }

  if (second !== undefined) {
    const regionalIndicator = /^[\u{1F1E6}-\u{1F1FF}]$/u;
    const isFlag = regionalIndicator.test(first) && regionalIndicator.test(second);
    const isKeycap = second === '\u20E3';
    if (isFlag || isKeycap) {
      return first + second;
    }
  }
  return first;
}
33
// Command-line options; the same list is used for argument parsing and for
// the "Options" section of the usage guide.
const optionDefinitions = [
  { name: 'sparse', alias: 's', type: Boolean, description: 'Print only the files, lines that have POS annotations.' },
  { name: 'help', alias: 'h', type: Boolean, description: 'Print this usage guide.' },
];

// Sections of the usage guide printed for --help.
const sections = [
  {
    header: 'conllu-cmc',
    content: 'Reads CoNLL-U format from stdin and annotates emojis, emoticons, hashtags, URLs, email addresses, @addresses, and action words. Writes CoNLL-U format to stdout.',
  },
  {
    header: 'Synopsis',
    content: '$ conllu-cmc [-s] < input.conllu > output.conllu',
  },
  {
    header: 'Options',
    optionList: optionDefinitions,
  },
];
53
const getUsage = require('command-line-usage');
const commandLineArgs = require('command-line-args');

// Parse the command line. A parse failure (e.g. an unknown option) is
// reported on stderr and falls through to the usage guide.
let options;
let invalidArguments = false;
try {
  options = commandLineArgs(optionDefinitions);
} catch (e) {
  console.error(e.message);
  options = { help: true };
  invalidArguments = true;
}

if (options.help) {
  const usage = getUsage(sections);
  if (invalidArguments) {
    // Usage shown because of bad arguments is an error: keep stdout clean
    // for pipelines and signal failure through the exit status.
    console.error(usage);
    process.exit(1);
  }
  console.log(usage);
  process.exit(0);
}
70
const EmojiRegex = require('emoji-regex');
const emojiRegex = EmojiRegex();
const { once } = require('events');
const readline = require('readline');

// Parser state shared across input lines:
//  - header:     comment lines buffered in sparse mode until a token of the
//                current sentence is actually annotated
//  - fileheader: buffered "# filename" / "# text_id" lines (sparse mode)
//  - standalone: set once "# foundry = base" has been seen with --sparse
global.header = '';
global.fileheader = '';
global.standalone = false;

// Line reader over stdin. crlfDelay: Infinity makes "\r\n" always count as a
// single line break, even when the two characters arrive in separate, delayed
// chunks; otherwise a spurious empty line (a sentence boundary in CoNLL-U)
// could be produced.
const rl = readline.createInterface({
  input: process.stdin,
  terminal: false,
  crlfDelay: Infinity,
});
85
86
/**
 * Write text to stdout, honouring backpressure.
 *
 * When `process.stdout.write` reports that its buffer is full, the returned
 * promise only resolves after the stream has emitted 'drain'.
 *
 * @param {string} text - Text to write.
 * @returns {Promise<void>}
 */
async function writeOutput(text) {
  if (!process.stdout.write(text)) {
    await once(process.stdout, 'drain');
  }
}
92
93
/**
 * Determine the CMC tag for a token.
 *
 * The checks run in a fixed order; the first matching pattern wins.
 *
 * @param {string} word - Token form (CoNLL-U column 2).
 * @returns {string|null} The tag, or null if the token is not a CMC item.
 */
function cmcTagFor(word) {
  // String.prototype.match is used throughout because emojiRegex may carry
  // the global flag, which would make RegExp.prototype.test stateful.
  if (word.match(wikiEmojiRegex)) return 'EMOWIKI';
  if (word.match(emojiRegex)) return 'EMOIMG';
  if (word.match(actionWordRegex)) return 'AKW';
  if (word.match(emoticonRegex)) return 'EMOASC';
  if (word.match(hashtagRegex)) return 'HST';
  if (word.match(urlRegex)) return 'URL';
  if (word.match(emailRegex)) return 'EML';
  if (word.match(addressRegex)) return 'ADR';
  return null;
}

/**
 * Process one line of CoNLL-U input and write the resulting output.
 *
 * - A "# foundry" comment is rewritten to "# foundry = cmc" when it names the
 *   base foundry (and, with --sparse, switches to sparse mode); other foundry
 *   comments pass through unchanged.
 * - Normal mode: every line is echoed; token lines whose form is a CMC item
 *   get the tag in column 5 and "_" (or emoji metadata) in column 6.
 * - Sparse mode: comment lines are buffered and only written, together with
 *   the annotated token line, when a token of the sentence receives a tag.
 *   Untagged tokens are dropped.
 *
 * @param {string} line - One input line without its line terminator.
 * @returns {Promise<void>}
 */
async function parseConllu(line) {
  // Only comment lines can be foundry headers; without the "#" check a token
  // line containing the hashtag "#foundry" would be mistaken for one.
  if (line.startsWith('#') && /#\s*foundry/.test(line)) {
    if (/=\s*base/.test(line)) {
      if (options.sparse) {
        global.standalone = true;
      }
      await writeOutput('# foundry = cmc\n');
    } else {
      await writeOutput(`${line}\n`);
    }
    return;
  }

  if (global.standalone) {
    if (/^#\s*filename/.test(line)) {
      // A new file starts: restart the buffered file header.
      global.fileheader = `${line}\n`;
      return;
    }
    if (/^#\s*text_id/.test(line)) {
      global.fileheader += `${line}\n`;
      return;
    }
    if (/^#\s*eo[ft]/.test(line)) {
      // End-of-file / end-of-text markers are always passed through.
      await writeOutput(`${line}\n`);
      return;
    }
    if (line.startsWith('#')) {
      global.header += `${line}\n`;
      return;
    }
    if (line.trim() === '') {
      // An empty header buffer means it was flushed for this sentence (or
      // there was none), so emit the sentence separator. A non-empty buffer
      // means nothing was written for this sentence; discard it.
      if (global.header === '') {
        await writeOutput('\n');
      }
      global.header = '';
      return;
    }
  } else if (!/^\d/.test(line)) {
    // Normal mode: anything that is not a token line is echoed verbatim.
    await writeOutput(`${line}\n`);
    return;
  }

  const columns = line.trim().split('\t');
  const word = columns[1];
  // Guard clause: without a token form there is nothing to classify.
  if (!word) {
    await writeOutput(`${line}\n`);
    return;
  }

  const tag = cmcTagFor(word);
  if (tag === null) {
    if (!global.standalone) {
      await writeOutput(`${line}\n`);
    }
    return;
  }

  // Pad short rows so that assigning columns 5 and 6 cannot leave holes,
  // which would be joined as empty (invalid) fields.
  while (columns.length < 6) {
    columns.push('_');
  }
  columns[4] = tag;
  columns[5] = '_';

  if (tag === 'EMOIMG') {
    // The lemma is the base form of the first emoji found in the token, so
    // leading non-emoji characters never end up as the lemma.
    const matched = word.match(emojiRegex);
    const base = getBaseEmoji(matched[0]);
    columns[2] = base;

    // Look up emoji metadata: exact token first, then the base emoji.
    const data = emojiData[word] || emojiData[base];
    if (data) {
      // g=group|s=subgroup|q=qualified|v=version|n=name
      columns[5] = `g=${data.g}|s=${data.s}|q=${data.q}|v=${data.v}|n=${data.n}`;
    }
  }

  if (global.standalone) {
    // First annotated token of this sentence: flush the buffered headers.
    await writeOutput(global.fileheader);
    await writeOutput(global.header);
    global.header = '';
    global.fileheader = '';
  }
  await writeOutput(`${columns.join('\t')}\n`);
}
188
/**
 * Read stdin line by line and process each line in order.
 *
 * Each line is awaited before the next one is read so that output order and
 * stdout backpressure are preserved.
 *
 * @returns {Promise<void>}
 */
async function main() {
  for await (const line of rl) {
    await parseConllu(line);
  }
}

// Any failure while reading or writing is reported on stderr and results in
// a non-zero exit status.
main().catch((error) => {
  console.error(error);
  process.exit(1);
});