blob: ab05072f2077c609e8ceaaa937dcf5944be23463 [file] [log] [blame]
Marc Kupietzb43a5182024-02-03 18:09:10 +01001#!/usr/bin/env node
2
// Token patterns used to recognise CMC (computer-mediated communication)
// phenomena. All of them are anchored at the start of the token; only
// actionWordRegex is also anchored at the end.

// ASCII emoticons such as :-) ;) <3 or :name: shortcodes, followed by
// whitespace, sentence punctuation or the end of the token.
// NOTE(review): `\D` inside the third alternative's character class matches
// ANY non-digit character, so tokens like "a:" also match. Possibly a literal
// "D" was intended; left unchanged pending confirmation.
const emoticonRegex = /^(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)/;

// "#" followed by at least one ASCII letter or digit.
const hashtagRegex = /^#[a-zA-Z0-9]+/;

// http(s):// or ftp(s):// followed by non-whitespace characters.
const urlRegex = /^(ftp|http)s?:\/\/[^\s]+/;

// local-part@domain.tld with a TLD of at least two ASCII letters.
// The TLD class used to be `[A-Z|a-z]`, which also accepted a literal "|".
const emailRegex = /^\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/;

// "@" followed by at least one ASCII letter or digit (addressing a user).
const addressRegex = /^@[a-zA-Z0-9]+/;

// A whole token of the form :text: (no inner colon), e.g. an action word.
const actionWordRegex = /^:[^:]+:$/;
Marc Kupietzb43a5182024-02-03 18:09:10 +01009
// Command line switches; the same list is used to parse the arguments and
// to render the "Options" part of the usage text.
const optionDefinitions = [
  {
    name: 'sparse',
    alias: 's',
    type: Boolean,
    description: 'Print only the files, lines that have POS annotations.',
  },
  {
    name: 'help',
    alias: 'h',
    type: Boolean,
    description: 'Print this usage guide.',
  },
];

// Sections of the usage guide, in display order.
const sections = [
  {
    header: 'conllu2cmc',
    content: 'Reads CoNLL-U format from stdin and annotates emojis, emoticons, hashtags, URLs, email addresses, @addresses, and action words. Writes CoNLL-U format to stdout.',
  },
  {
    header: 'Synopsis',
    content: '$ conllu2cmc [-s] < input.conllu > output.conllu',
  },
  {
    header: 'Options',
    optionList: optionDefinitions,
  },
];
29
30const getUsage = require('command-line-usage')
31const commandLineArgs = require('command-line-args')
32
// Parse the command line. `options` is read later by parseConllu
// (options.sparse), so it stays a module-level binding.
let options;
let optionError = null;
try {
  options = commandLineArgs(optionDefinitions);
} catch (e) {
  // Invalid arguments: remember the error and fall back to showing the usage.
  optionError = e;
  options = { help: true };
}

if (options.help) {
  const usage = getUsage(sections);
  if (optionError) {
    // Bad invocation: keep stdout (the CoNLL-U data channel) clean and
    // signal failure to the caller.
    console.error(optionError.message);
    console.error(usage);
    process.exit(1);
  }
  // Help was requested explicitly; that is a successful run.
  console.log(usage);
  process.exit(0);
}
46
// The emoji-regex module exports a factory; the regex it returns is what
// parseConllu uses to detect emoji tokens.
const EmojiRegex = require('emoji-regex');
const emojiRegex = EmojiRegex();
const readline = require('readline');

// State shared between successive parseConllu calls (one call per line).
// header:     comment lines buffered in standalone mode until a token of the
//             current sentence is annotated.
// fileheader: buffered "# filename" / "# text_id" lines. It must start out as
//             a string: appending to an unset value would produce
//             "undefined…", and reading an undeclared global throws.
// standalone: set once a "# foundry = base" line is seen with --sparse.
global.header = '';
global.fileheader = '';
global.standalone = false;

// Line-by-line reader over stdin; terminal mode is disabled.
const rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout,
  terminal: false,
});
60
61
/**
 * Picks the CMC tag for a token. The patterns are tried in a fixed order and
 * the first match wins.
 *
 * @param {string} word - token form (CoNLL-U column 2).
 * @param {string} pos - current tag in column 4; tokens already tagged
 *   NE/PROPN are never re-tagged as addresses.
 * @returns {string|null} the new tag, or null when no pattern applies.
 */
function cmcTagFor(word, pos) {
  if (word.match(emojiRegex)) {
    return 'EMOIMG';
  }
  if (word.match(actionWordRegex)) {
    return 'AKW';
  }
  if (word.match(emoticonRegex)) {
    return 'EMOASC';
  }
  if (word.match(hashtagRegex)) {
    // This branch used to assign nothing, so hashtags were never annotated.
    return 'HST';
  }
  if (word.match(urlRegex)) {
    return 'URL';
  }
  if (word.match(emailRegex)) {
    return 'EML';
  }
  if (!pos.match('^(NE|PROPN)') && word.match(addressRegex)) {
    return 'ADR';
  }
  return null;
}

/**
 * Processes one line of CoNLL-U input and writes the result to stdout.
 *
 * Default mode: every line is echoed; token lines whose form matches one of
 * the CMC patterns get columns 4 and 5 replaced by the CMC tag and column 6
 * set to "_".
 *
 * Standalone mode (entered when a "# foundry = base" line is seen while
 * options.sparse is set): only annotated token lines are written, preceded
 * once by the buffered file header and the buffered comment lines of their
 * sentence. "# eof"/"# eot" lines are always written, and a blank line is
 * written only after a sentence that produced output.
 *
 * @param {string} line - one input line without its trailing newline.
 * @returns {void}
 */
function parseConllu(line) {
  // The foundry declaration is rewritten from "base" to "cmc".
  if (line.match('#\\s*foundry')) {
    if (line.match('=\\s*base')) {
      if (options.sparse) {
        global.standalone = true;
      }
      process.stdout.write('# foundry = cmc\n');
    } else {
      process.stdout.write(`${line}\n`);
    }
    return;
  }

  if (global.standalone) {
    if (line.match('^#\\s*filename')) {
      // A new file starts: restart the file header buffer.
      global.fileheader = `${line}\n`;
      return;
    }
    if (line.match('^#\\s*text_id')) {
      // Tolerate a text_id that is not preceded by a filename line.
      global.fileheader = `${global.fileheader || ''}${line}\n`;
      return;
    }
    if (line.match('^#\\s*eo[ft]')) {
      process.stdout.write(`${line}\n`);
      return;
    }
    if (line.match('^#')) {
      global.header += `${line}\n`;
      return;
    }
    if (line.trim().match('^$')) {
      // The header buffer is emptied when a sentence produced output, so an
      // empty buffer means the sentence separator has to be written as well.
      if (global.header === '') {
        process.stdout.write('\n');
      }
      global.header = '';
      return;
    }
  } else if (!line.match('^\\d+')) {
    // Default mode: everything that is not a token line passes through.
    process.stdout.write(`${line}\n`);
    return;
  }

  const columns = line.trim().split('\t');
  const word = columns[1];

  // A line without a tab-separated FORM column cannot be classified; treat it
  // like an unannotated line instead of crashing on `undefined.match`.
  if (word === undefined) {
    if (!global.standalone) {
      process.stdout.write(`${line}\n`);
    }
    return;
  }

  const newTag = cmcTagFor(word, columns[3] || '');

  if (newTag === null) {
    if (!global.standalone) {
      process.stdout.write(`${line}\n`);
    }
    return;
  }

  columns[3] = columns[4] = newTag;
  columns[5] = '_';
  if (global.standalone) {
    // Emit the pending headers once, ahead of the first annotated token.
    process.stdout.write(global.fileheader || '');
    process.stdout.write(global.header);
    global.header = '';
    global.fileheader = '';
  }
  process.stdout.write(`${columns.join('\t')}\n`);
}
134
// Annotate each input line as it arrives.
rl.on('line', (inputLine) => {
  parseConllu(inputLine);
});

// Terminate explicitly once the input is exhausted; without this the process
// would keep hanging.
rl.on('close', () => {
  process.exit(0);
});