blob: 7f2b2ced8ee2047a1639042570aaaee39136977d [file] [log] [blame]
Marc Kupietzb43a5182024-02-03 18:09:10 +01001#!/usr/bin/env node
2
// Token-level patterns used to recognise CMC (computer-mediated communication)
// phenomena. All of them are anchored at the start of the token.

// ASCII emoticons such as :-) ;P <3 or :word:, which must be followed by
// whitespace, one of ! . ?, or the end of the token.
// NOTE(review): the first character class of the third alternative contains
// `\D`, which matches ANY non-digit (so e.g. "a:" also matches); a literal
// `D` may have been intended — confirm before changing.
const emoticonRegex = /^(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)/;
// '#' followed by at least one ASCII letter or digit.
const hashtagRegex = /^#[a-zA-Z0-9]+/;
// ftp, ftps, http or https URL up to the next whitespace.
const urlRegex = /^(ftp|http)s?:\/\/[^\s]+/;
// E-mail address. The top-level domain is letters only: inside a character
// class `|` is a literal pipe, not an alternation, so it must not be listed.
const emailRegex = /^\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/;
// '@' followed by at least one ASCII letter or digit (user addressing).
const addressRegex = /^@[a-zA-Z0-9]+/;
// Whole token wrapped in colons, e.g. ":grins:".
const actionWordRegex = /^:[^:]+:$/;
// Whole token of the form "[_EMOJI:...]".
const wikiEmojiRegex = /^\[_EMOJI:[^\]]+\]$/;
Marc Kupietzb43a5182024-02-03 18:09:10 +010010
/**
 * Reduces an emoji string to its base form.
 *
 * Skin tone modifiers (U+1F3FB-U+1F3FF), zero-width joiners (U+200D) and
 * variation selectors (U+FE0E, U+FE0F) are removed; the first code point of
 * what remains is returned.
 *
 * @param {string} emoji - Token text to reduce.
 * @returns {string} The first remaining code point, or the (empty) stripped
 *   string when nothing is left.
 */
function getBaseEmoji(emoji) {
  // One pass removes all three kinds of modifier characters.
  const bare = emoji.replace(/[\u{1F3FB}-\u{1F3FF}\u200D\uFE0E\uFE0F]/gu, '');

  // String iteration walks code points, so a surrogate pair stays intact.
  for (const codePoint of bare) {
    return codePoint;
  }
  return bare;
}
25
// Command-line options understood by the tool; also rendered in the usage text.
const optionDefinitions = [
  {
    name: 'sparse',
    alias: 's',
    type: Boolean,
    description: 'Print only the files, lines that have POS annotations.',
  },
  {
    name: 'help',
    alias: 'h',
    type: Boolean,
    description: 'Print this usage guide.',
  },
];

// Sections of the usage guide that is printed when help is requested.
const sections = [
  {
    header: 'conllu-cmc',
    content: 'Reads CoNLL-U format from stdin and annotates emojis, emoticons, hashtags, URLs, email addresses, @addresses, and action words. Writes CoNLL-U format to stdout.',
  },
  {
    header: 'Synopsis',
    content: '$ conllu-cmc [-s] < input.conllu > output.conllu',
  },
  {
    header: 'Options',
    optionList: optionDefinitions,
  },
];
45
const getUsage = require('command-line-usage');
const commandLineArgs = require('command-line-args');

// Parse the command line. If parsing throws (e.g. on an unknown option), the
// error message goes to stderr and the usage guide is shown instead of running.
let options;
let optionsInvalid = false;
try {
  options = commandLineArgs(optionDefinitions);
} catch (e) {
  console.error(e.message);
  options = { help: true };
  optionsInvalid = true;
}

if (options.help) {
  console.log(getUsage(sections));
  // An explicit --help is a success; usage shown because of bad options is
  // an error and must be visible to calling scripts via the exit status.
  process.exit(optionsInvalid ? 1 : 0);
}
62
const EmojiRegex = require('emoji-regex');
const emojiRegex = EmojiRegex();
const readline = require('readline');

// Shared state for sparse ("standalone") output, kept on the global object:
//   header     - buffered comment lines of the current sentence
//   fileheader - buffered "# filename" / "# text_id" lines of the current text
//   standalone - true once sparse output has been switched on
// fileheader is initialised here as well so that appending to it or reading it
// before any "# filename" line was seen yields '' instead of "undefined" or a
// ReferenceError.
global.header = '';
global.fileheader = '';
global.standalone = false;

// Line-by-line reader over stdin; terminal handling is disabled.
const rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout,
  terminal: false,
});
77
/**
 * Chooses the CMC tag for a single token.
 *
 * The checks run in a fixed order and the first match wins: wiki emoji,
 * emoji, action word, ASCII emoticon, hashtag, URL, e-mail, @address.
 *
 * @param {string} word - Token form (CoNLL-U column 2).
 * @param {string} upos - Existing tag from column 4; a token whose tag starts
 *   with NE or PROPN is never tagged as an @address.
 * @returns {?string} The tag, or null when the token is not a CMC item.
 */
function classifyCmcToken(word, upos) {
  // String#match is used throughout (not RegExp#test) so that a regex carrying
  // the global flag cannot leak lastIndex state between tokens.
  if (word.match(wikiEmojiRegex)) return 'EMOWIKI';
  if (word.match(emojiRegex)) return 'EMOIMG';
  if (word.match(actionWordRegex)) return 'AKW';
  if (word.match(emoticonRegex)) return 'EMOASC';
  if (word.match(hashtagRegex)) return 'HST';
  if (word.match(urlRegex)) return 'URL';
  if (word.match(emailRegex)) return 'EML';
  if (!upos.match('^(NE|PROPN)') && word.match(addressRegex)) return 'ADR';
  return null;
}

/**
 * Processes one line of CoNLL-U input and writes the result to stdout.
 *
 * - A "# foundry" comment is rewritten to "# foundry = cmc" when it names the
 *   base foundry (this also switches on sparse output if --sparse was given);
 *   any other foundry comment is echoed unchanged.
 * - Normal mode: every line is echoed; token lines that are recognised as CMC
 *   items get their tag written to column 5 and column 6 reset to '_'. Emoji
 *   tokens additionally get their base emoji as lemma and EMOIMG in column 4.
 * - Sparse mode: comment lines are buffered and only emitted, together with
 *   the annotated token line, once a token in that sentence was tagged.
 *
 * @param {string} line - One input line without its trailing newline.
 * @returns {void}
 */
function parseConllu(line) {
  // Anchored so that a token line containing e.g. the hashtag "#foundry" is
  // not mistaken for the foundry comment.
  if (line.match('^#\\s*foundry')) {
    if (line.match('=\\s*base')) {
      if (options.sparse) {
        global.standalone = true;
      }
      process.stdout.write("# foundry = cmc\n");
    } else {
      process.stdout.write(`${line}\n`);
    }
    return;
  }

  if (global.standalone) {
    if (line.match('^#\\s*filename')) {
      // A new file starts: replace the buffered file header.
      global.fileheader = `${line}\n`;
      return;
    }
    if (line.match('^#\\s*text_id')) {
      global.fileheader = `${global.fileheader || ''}${line}\n`;
      return;
    }
    if (line.match('^#\\s*eo[ft]')) {
      // End-of-file / end-of-text markers are always passed through.
      process.stdout.write(`${line}\n`);
      return;
    }
    if (line.match('^#')) {
      global.header = `${global.header || ''}${line}\n`;
      return;
    }
    if (line.trim().match('^$')) {
      // An empty header buffer means it was flushed for this sentence, i.e.
      // the sentence produced output and needs its terminating blank line.
      if (!global.header) {
        process.stdout.write("\n");
      }
      global.header = '';
      return;
    }
  } else if (!line.match('^\\d+')) {
    // Normal mode: anything that is not a token line is echoed unchanged.
    process.stdout.write(`${line}\n`);
    return;
  }

  const columns = line.trim().split('\t');
  const word = columns[1];
  // Guard clause: without a word form there is nothing to classify.
  if (!word) {
    process.stdout.write(`${line}\n`);
    return;
  }

  // columns[3] is missing on truncated lines; treat that as "no tag".
  const tag = classifyCmcToken(word, columns[3] || '');

  if (tag === null) {
    if (!global.standalone) {
      process.stdout.write(`${line}\n`);
    }
    return;
  }

  // Pad truncated lines so that writing columns 5 and 6 leaves no empty fields.
  while (columns.length < 6) {
    columns.push('_');
  }
  columns[4] = tag;
  columns[5] = '_';
  // For EMOIMG tokens, set the lemma to the base emoji (without modifiers).
  if (tag === 'EMOIMG') {
    columns[2] = getBaseEmoji(word);
    columns[3] = 'EMOIMG';
  }

  if (global.standalone) {
    // First annotated token of the sentence: emit the buffered headers once.
    process.stdout.write(global.fileheader || '');
    process.stdout.write(global.header || '');
    global.header = '';
    global.fileheader = '';
  }
  process.stdout.write(`${columns.join('\t')}\n`);
}
163
// Feed every input line to parseConllu and terminate explicitly once the
// input is closed (per the original author, the process would otherwise hang).
rl
  .on('line', parseConllu)
  .on('close', () => {
    process.exit(0);
  });