#!/usr/bin/env node

// Token-level patterns for computer-mediated-communication (CMC) items.
// All of them are anchored at the start of the token they are matched against.

// ASCII emoticons: `:shortcode:`, hearts (`<3`, `</3`, `<\3`), mouth-first
// forms such as `(-:` or `D:`, and eyes-first forms such as `:-)` or `;P`.
// The trailing lookahead requires whitespace, sentence punctuation or the end
// of the token right after the emoticon.
// The mouth-first class lists a literal `D`; a `\D` escape at that position
// would match every non-digit character and make the other members pointless.
const emoticonRegex = /^(\:\w+\:|\<[\/\\]?3|[\(\)\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)/;
// `#` followed by at least one ASCII letter or digit.
const hashtagRegex = /^#[a-zA-Z0-9]+/;
// ftp/ftps/http/https URLs, up to the first whitespace character.
const urlRegex = /^(ftp|http)s?:\/\/[^\s]+/;
// E-mail addresses. The top-level domain is letters only; inside a character
// class `|` is a literal pipe, not an alternation, so it must not appear there.
const emailRegex = /^\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/;
// `@` followed by at least one ASCII letter or digit.
const addressRegex = /^@[a-zA-Z0-9]+/;
// Whole-token action words written between colons, e.g. `:lach:`.
const actionWordRegex = /^:[^:]+:$/;
// Whole-token emoji placeholders of the form `[_EMOJI:...]`.
const wikiEmojiRegex = /^\[_EMOJI:[^\]]+\]$/;

// Command-line options, in the shape consumed by both `command-line-args`
// (parsing) and `command-line-usage` (help output).
const optionDefinitions = [
  {
    name: 'sparse',
    alias: 's',
    type: Boolean,
    description: 'Print only the files, lines that have POS annotations.',
  },
  {
    name: 'help',
    alias: 'h',
    type: Boolean,
    description: 'Print this usage guide.',
  },
];

// Sections of the usage guide that is printed when help is requested.
const sections = [
  {
    header: 'conllu-cmc',
    content: 'Reads CoNLL-U format from stdin and annotates emojis, emoticons, hashtags, URLs, email addresses, @addresses, and action words. Writes CoNLL-U format to stdout.',
  },
  {
    header: 'Synopsis',
    content: '$ conllu-cmc [-s] < input.conllu > output.conllu',
  },
  {
    header: 'Options',
    optionList: optionDefinitions,
  },
];

const getUsage = require('command-line-usage');
const commandLineArgs = require('command-line-args');

// Parse the command line. If parsing throws (e.g. on an option that is not
// declared in optionDefinitions), report the message and fall back to showing
// the usage guide.
let options;
let optionsInvalid = false;
try {
  options = commandLineArgs(optionDefinitions);
} catch (e) {
  console.error(e.message);
  options = { help: true };
  optionsInvalid = true;
}

if (options.help) {
  const usage = getUsage(sections);
  if (optionsInvalid) {
    // A rejected command line is an error: keep the guide off stdout (which
    // carries the CoNLL-U stream) and signal failure to the calling shell.
    console.error(usage);
    process.exit(1);
  }
  console.log(usage);
  process.exit(0);
}

const EmojiRegex = require('emoji-regex');
const emojiRegex = EmojiRegex();
const readline = require('readline');

// Mutable state shared across successive parseConllu() calls.
// header:     comment lines buffered since the last blank line (sparse mode).
// fileheader: buffered `# filename` / `# text_id` lines (sparse mode). It is
//             initialised here so that it is never concatenated or written
//             while still undefined.
// standalone: switched on by parseConllu() when --sparse is given and a
//             `# foundry = base` line is seen.
global.header = '';
global.fileheader = '';
global.standalone = false;

// Line-by-line reader over stdin; `terminal: false` treats the streams as
// plain (non-TTY) streams.
const rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout,
  terminal: false,
});

/**
 * Pick the CMC tag for a token, or null when none of the patterns applies.
 * The checks run in a fixed priority order; the first match wins.
 *
 * @param {string} word - Token form (second CoNLL-U column).
 * @param {string} upos - Current value of the fourth column; a token already
 *   tagged NE/PROPN is never re-tagged as an @address.
 * @returns {string|null} The tag to write, or null.
 */
function cmcTagFor(word, upos) {
  // String.prototype.match is used for every pattern (rather than
  // RegExp.prototype.test) so that a pattern carrying the `g` flag cannot leak
  // lastIndex state from one token to the next.
  // NOTE(review): emoji-regex presumably returns a global regex — confirm.
  if (word.match(wikiEmojiRegex)) {
    return 'EMOWIKI';
  }
  if (word.match(emojiRegex)) {
    return 'EMOIMG';
  }
  if (word.match(actionWordRegex)) {
    return 'AKW';
  }
  if (word.match(emoticonRegex)) {
    return 'EMOASC';
  }
  if (word.match(hashtagRegex)) {
    return 'HST';
  }
  if (word.match(urlRegex)) {
    return 'URL';
  }
  if (word.match(emailRegex)) {
    return 'EML';
  }
  if (!/^(NE|PROPN)/.test(upos) && word.match(addressRegex)) {
    return 'ADR';
  }
  return null;
}

/**
 * Process one input line and write the result to stdout.
 *
 * Normal mode: every line is echoed; token lines whose form is recognised as a
 * CMC item get columns 4 and 5 replaced by the CMC tag and column 6 by `_`.
 *
 * Sparse ("standalone") mode, entered when --sparse is set and a
 * `# foundry = base` line is seen: comment lines are buffered and only emitted
 * in front of the first re-tagged token that follows them; untagged tokens are
 * dropped. `# eof` / `# eot` lines are always passed through.
 *
 * Reads `options.sparse`; reads and writes `global.standalone`,
 * `global.header` and `global.fileheader`.
 *
 * @param {string} line - One line of input without its line terminator.
 * @returns {void}
 */
function parseConllu(line) {
  // Anchored at the start of the line so that a token such as `#foundry`
  // inside a token line is not mistaken for the foundry comment.
  if (/^#\s*foundry/.test(line)) {
    if (/=\s*base/.test(line)) {
      if (options.sparse) {
        global.standalone = true;
      }
      process.stdout.write('# foundry = cmc\n');
    } else {
      process.stdout.write(`${line}\n`);
    }
    return;
  }

  if (global.standalone) {
    if (/^#\s*filename/.test(line)) {
      global.fileheader = `${line}\n`;
      return;
    }
    if (/^#\s*text_id/.test(line)) {
      // `|| ''` keeps the literal text "undefined" out of the output when no
      // `# filename` line has been seen before.
      global.fileheader = `${global.fileheader || ''}${line}\n`;
      return;
    }
    if (/^#\s*eo[ft]/.test(line)) {
      process.stdout.write(`${line}\n`);
      return;
    }
    if (line.startsWith('#')) {
      global.header = `${global.header || ''}${line}\n`;
      return;
    }
    if (line.trim() === '') {
      // The header buffer is cleared whenever a tagged token was written, so
      // an empty buffer at this point means the blank separator is still due.
      if (global.header === '') {
        process.stdout.write('\n');
      }
      global.header = '';
      return;
    }
  } else if (!/^\d+/.test(line)) {
    // Normal mode: anything that is not a token line is copied verbatim.
    process.stdout.write(`${line}\n`);
    return;
  }

  const columns = line.trim().split('\t');
  const word = columns[1];

  // A line without a second tab-separated column is not a usable token line:
  // echo it in normal mode, drop it in sparse mode.
  if (word === undefined) {
    if (!global.standalone) {
      process.stdout.write(`${line}\n`);
    }
    return;
  }

  const newTag = cmcTagFor(word, columns[3] || '');
  if (newTag === null) {
    if (!global.standalone) {
      process.stdout.write(`${line}\n`);
    }
    return;
  }

  columns[3] = newTag;
  columns[4] = newTag;
  columns[5] = '_';
  if (global.standalone) {
    // Flush the buffered comment lines once, in front of the first tagged
    // token they belong to.
    process.stdout.write(global.fileheader || '');
    process.stdout.write(global.header || '');
    global.header = '';
    global.fileheader = '';
  }
  process.stdout.write(`${columns.join('\t')}\n`);
}

// Feed every input line through the annotator.
rl.on('line', parseConllu);
// Exit explicitly once input is exhausted; otherwise the process would hang.
rl.on('close', () => process.exit(0));