#!/usr/bin/env node

// Version string printed by the -V/--version option, read from the package manifest.
const { version: packageVersion } = require('../package.json');

// Token classifiers, applied to the word form (second column) of a token line.

// ASCII emoticons such as :-) ;P <3 (: D: and :shortcode: forms, followed by
// whitespace, sentence punctuation or the end of the token.
// The "mouth first" alternative lists a literal D (as in "D:"); it must not be
// written as \D, which would accept any non-digit character.
const emoticonRegex = /^(\:\w+\:|\<[\/\\]?3|[\(\)\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)/;
// Hashtags: '#' followed only by letters, marks and numbers, with at least one letter.
const hashtagRegex = /^#(?=.*\p{L})[\p{L}\p{M}\p{N}]+$/u;
// URLs with an http, https, ftp or ftps scheme.
const urlRegex = /^(ftp|http)s?:\/\/[^\s]+/;
// Email addresses; the top-level domain consists of at least two ASCII letters.
const emailRegex = /^\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/;
// @-addresses such as @user1.
const addressRegex = /^@[a-zA-Z0-9]+/;
// Action words: a whole token enclosed in colons, e.g. :lach:
const actionWordRegex = /^:[^:]+:$/;
// Wiki emoji placeholders of the form [_EMOJI:...].
const wikiEmojiRegex = /^\[_EMOJI:[^\]]+\]$/;

// Load emoji metadata, keyed by emoji string.
// A missing file is expected (e.g. during initial setup, before the script
// that creates it has run) and is tolerated silently. Any other failure, such
// as malformed JSON, is reported on stderr so that silently missing emoji
// annotations can be traced; processing continues without metadata.
let emojiData = {};
try {
  emojiData = require('./emoji_data.json');
} catch (e) {
  if (e.code !== 'MODULE_NOT_FOUND') {
    console.error(`Warning: could not load emoji_data.json: ${e.message}`);
  }
}

/**
 * Reduce an emoji sequence to its base emoji.
 *
 * Skin tone modifiers (U+1F3FB-U+1F3FF), zero-width joiners (U+200D) and
 * variation selectors (U+FE0E, U+FE0F) are removed, then the first emoji of
 * what remains is returned. Flags (two regional indicators) and keycaps
 * (#, * or a digit followed by U+20E3) consist of two code points and are
 * returned whole; every other input yields its first code point.
 *
 * @param {string} emoji - Emoji sequence, possibly with modifiers.
 * @returns {string} The base emoji, or the stripped input if nothing is left.
 */
function getBaseEmoji(emoji) {
  const stripped = emoji
    .replace(/[\u{1F3FB}-\u{1F3FF}]/gu, '')
    .replace(/\u200D/g, '')
    .replace(/[\uFE0E\uFE0F]/g, '');

  // Flags and keycaps only exist as a pair of code points; cutting them down
  // to the first code point would leave a lone regional indicator or a plain
  // ASCII character instead of an emoji.
  const pair = stripped.match(/^(?:[\u{1F1E6}-\u{1F1FF}]{2}|[#*0-9]\u20E3)/u);
  if (pair) {
    return pair[0];
  }

  // Spreading iterates by code point, so emoji outside the BMP are not split
  // into surrogate halves.
  const [first] = [...stripped];
  return first === undefined ? stripped : first;
}

// Command-line options; used both for argument parsing and for the "Options"
// section of the usage guide.
const optionDefinitions = [
  {
    name: 'sparse',
    alias: 's',
    type: Boolean,
    description: 'Print only the files, lines that have POS annotations.',
  },
  {
    name: 'version',
    alias: 'V',
    type: Boolean,
    description: 'Print the current package version.',
  },
  {
    name: 'help',
    alias: 'h',
    type: Boolean,
    description: 'Print this usage guide.',
  },
];

// Sections of the usage guide, in display order.
const sections = [
  {
    header: 'conllu-cmc',
    content: 'Reads CoNLL-U format from stdin and annotates emojis, emoticons, hashtags, URLs, email addresses, @addresses, and action words. Writes CoNLL-U format to stdout.',
  },
  {
    header: 'Synopsis',
    // One invocation example per line.
    content: [
      '$ conllu-cmc [-s] < input.conllu > output.conllu',
      '$ cmc-tagger [-s] < input.conllu > output.conllu',
      '$ conllu-cmc -V',
      '$ cmc-tagger -V',
    ].join('\n'),
  },
  {
    header: 'Options',
    optionList: optionDefinitions,
  },
];

const getUsage = require('command-line-usage');
const commandLineArgs = require('command-line-args');

// Parse the command line. If parsing fails, the error message and the usage
// guide are written to stderr and the process exits with status 1, so that
// callers can detect the failure and the usage text does not end up in
// redirected output. An explicit --help prints the guide to stdout and exits
// with status 0.
let options;
let invalidArguments = false;
try {
  options = commandLineArgs(optionDefinitions);
} catch (e) {
  console.error(e.message);
  options = { help: true };
  invalidArguments = true;
}

if (options.help) {
  const usage = getUsage(sections);
  if (invalidArguments) {
    console.error(usage);
    process.exit(1);
  }
  console.log(usage);
  process.exit(0);
}

if (options.version) {
  console.log(packageVersion);
  process.exit(0);
}

const EmojiRegex = require('emoji-regex');
const emojiRegex = EmojiRegex();
const { once } = require('events');
const readline = require('readline');

// Parser state shared between successive calls to parseConllu().
// header: comment lines collected for the current sentence in standalone mode.
global.header = '';
// fileheader: "# filename" / "# text_id" lines collected in standalone mode.
global.fileheader = '';
// standalone: switched on when --sparse is given and a "foundry = base" line is read.
global.standalone = false;

// Line reader over stdin.
const rl = readline.createInterface({
  input: process.stdin,
  terminal: false,
  // Always treat "\r\n" as one line break, even if the two characters arrive
  // in separate chunks; otherwise a spurious empty line could be reported.
  crlfDelay: Infinity,
});


/**
 * Write text to stdout, honouring backpressure.
 *
 * @param {string} text - Text to write.
 * @returns {Promise<void>} Resolves right after the write if stdout accepted
 *   the data without signalling backpressure, otherwise once stdout has
 *   emitted 'drain'.
 */
async function writeOutput(text) {
  const accepted = process.stdout.write(text);
  if (accepted) {
    return;
  }
  await once(process.stdout, 'drain');
}


/**
 * Determine the CMC tag for a word form.
 *
 * The patterns are tried in a fixed order and the first match wins.
 *
 * @param {string} word - Word form of a token.
 * @returns {string|null} The tag, or null if no pattern matches.
 */
function classifyCmcToken(word) {
  // String#match is used throughout (rather than RegExp#test) so that a
  // pattern carrying the global flag cannot leak lastIndex state between tokens.
  if (word.match(wikiEmojiRegex)) {
    return 'EMOWIKI';
  }
  if (word.match(emojiRegex)) {
    return 'EMOIMG';
  }
  if (word.match(actionWordRegex)) {
    return 'AKW';
  }
  if (word.match(emoticonRegex)) {
    return 'EMOASC';
  }
  if (word.match(hashtagRegex)) {
    return 'HST';
  }
  if (word.match(urlRegex)) {
    return 'URL';
  }
  if (word.match(emailRegex)) {
    return 'EML';
  }
  if (word.match(addressRegex)) {
    return 'ADR';
  }
  return null;
}

/**
 * Process one line of CoNLL-U input and write the result to stdout.
 *
 * - A "# foundry = base" line is replaced by "# foundry = cmc"; with --sparse
 *   it also switches on standalone mode. Other foundry lines pass through.
 * - Normal mode: every line is echoed; token lines whose word form is
 *   recognised get column 5 (index 4) set to the tag and column 6 (index 5)
 *   set to '_' or, for emoji with known metadata, to the metadata string.
 * - Standalone mode: comment lines are buffered and only written in front of
 *   the first annotated token they belong to; unannotated tokens are dropped.
 *   "# eof"/"# eot" lines are always written.
 *
 * @param {string} line - One input line without its line terminator.
 * @returns {Promise<void>} Resolves when all output for the line is written.
 */
async function parseConllu(line) {
  if (line.match('#\\s*foundry')) {
    if (line.match('=\\s*base')) {
      if (options.sparse) {
        global.standalone = true;
      }
      await writeOutput('# foundry = cmc\n');
    } else {
      await writeOutput(`${line}\n`);
    }
    return;
  }

  if (global.standalone) {
    if (line.match('^#\\s*filename')) {
      // A new file starts: discard any pending file header.
      global.fileheader = `${line}\n`;
      return;
    }
    if (line.match('^#\\s*text_id')) {
      global.fileheader += `${line}\n`;
      return;
    }
    if (line.match('^#\\s*eo[ft]')) {
      await writeOutput(`${line}\n`);
      return;
    }
    if (line.match('^#')) {
      global.header += `${line}\n`;
      return;
    }
    if (line.trim().match('^$')) {
      // The header buffer is emptied when a token of the sentence is written,
      // so an empty buffer here means the separating blank line is needed.
      // A non-empty buffer belongs to a sentence without output and is dropped.
      if (global.header === '') {
        await writeOutput('\n');
      }
      global.header = '';
      return;
    }
  } else if (!line.match('^\\d+')) {
    // Normal mode: anything that is not a token line passes through unchanged.
    await writeOutput(`${line}\n`);
    return;
  }

  const columns = line.trim().split('\t');
  const word = columns[1];

  // Without a word form there is nothing to classify; echo the line as-is.
  if (!word) {
    await writeOutput(`${line}\n`);
    return;
  }

  const newTag = classifyCmcToken(word);
  if (!newTag) {
    if (!global.standalone) {
      await writeOutput(`${line}\n`);
    }
    return;
  }

  columns[4] = newTag;
  columns[5] = '_';

  if (newTag === 'EMOIMG') {
    // The emoji pattern is applied unanchored, so the token may contain other
    // characters before the emoji. Derive the lemma from the first matched
    // emoji, not from the first code point of the whole token.
    const [firstEmoji] = word.match(emojiRegex);
    const base = getBaseEmoji(firstEmoji);
    columns[2] = base;

    // Look up emoji metadata: exact token first, then the base emoji.
    const data = emojiData[word] || emojiData[base];
    if (data) {
      // g=group|s=subgroup|q=qualified|v=version|n=name
      columns[5] = `g=${data.g}|s=${data.s}|q=${data.q}|v=${data.v}|n=${data.n}`;
    }
  }

  if (global.standalone) {
    // Flush the buffered headers in front of the first annotated token.
    await writeOutput(global.fileheader);
    await writeOutput(global.header);
    global.header = '';
    global.fileheader = '';
  }
  await writeOutput(`${columns.join('\t')}\n`);
}

/**
 * Feed every line read from stdin through parseConllu(), one at a time, so
 * that the output order matches the input order.
 *
 * @returns {Promise<void>} Resolves when the input is exhausted.
 */
async function main() {
  for await (const inputLine of rl) {
    await parseConllu(inputLine);
  }
}

// Report any failure on stderr and exit with status 1.
function reportFatalError(error) {
  console.error(error);
  process.exit(1);
}

main().catch(reportFatalError);