blob: 28603a919dc4950cff85b295bcb12426eb4dcaf7 [file] [log] [blame]
Marc Kupietzb43a5182024-02-03 18:09:10 +01001#!/usr/bin/env node
2
// Patterns used to classify the surface form of a token. Every pattern is
// anchored at the start of the token.

// ASCII emoticons such as ":-)", "<3" or ":smile:"; must be followed by
// whitespace, one of "!.?" or the end of the token.
// NOTE(review): `\D` inside the first character class matches ANY non-digit
// character, not just a literal "D" -- confirm that this is intended.
const emoticonRegex = /^(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)/;

// "#" followed by at least one ASCII letter or digit.
const hashtagRegex = /^#[a-zA-Z0-9]+/;

// ftp://, ftps://, http:// or https:// followed by non-whitespace characters.
const urlRegex = /^(ftp|http)s?:\/\/[^\s]+/;

// E-mail address. The top-level domain is letters only: inside a character
// class "|" is a literal character, not an alternation, so it must not be
// listed there.
const emailRegex = /^\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/;

// "@" followed by at least one ASCII letter or digit.
const addressRegex = /^@[a-zA-Z0-9]+/;

// Whole token enclosed in colons, e.g. ":lach:".
const actionWordRegex = /^:.*:$/;
9
// Command-line options understood by the tool; passed to the argument parser
// and rendered in the "Options" section of the usage guide.
const optionDefinitions = [
  {
    name: 'sparse',
    alias: 's',
    type: Boolean,
    description: 'Print only the files, lines that have POS annotations.',
  },
  {
    name: 'help',
    alias: 'h',
    type: Boolean,
    description: 'Print this usage guide.',
  },
];

// Sections of the usage guide, in display order.
const sections = [
  {
    header: 'conllu2cmc',
    content: 'Reads CoNLL-U format from stdin and annotates emojis, emoticons, hashtags, URLs, email addresses, @addresses, and action words. Writes CoNLL-U format to stdout.',
  },
  {
    header: 'Synopsis',
    content: '$ conllu2cmc [-s] < input.conllu > output.conllu',
  },
  {
    header: 'Options',
    optionList: optionDefinitions,
  },
];
29
const getUsage = require('command-line-usage');
const commandLineArgs = require('command-line-args');

// Parsed command-line options. If the arguments cannot be parsed, the error
// is reported and the usage guide is shown instead of processing any input.
let options;
let hasArgumentError = false;
try {
  options = commandLineArgs(optionDefinitions);
} catch (e) {
  console.error(e.message);
  options = { help: true };
  hasArgumentError = true;
}

if (options.help) {
  const usage = getUsage(sections);
  if (hasArgumentError) {
    // stdout carries the CoNLL-U output, so keep diagnostics on stderr and
    // signal the failure through the exit status.
    console.error(usage);
    process.exit(1);
  }
  // An explicitly requested usage guide is not an error.
  console.log(usage);
  process.exit(0);
}
46
const EmojiRegex = require('emoji-regex');
const emojiRegex = EmojiRegex();
const readline = require('readline');

// State shared with the line handler through the global object:
//   header     - comment lines buffered in sparse mode until a token of the
//                current sentence gets annotated
//   fileheader - buffered "# filename" / "# text_id" lines (sparse mode);
//                initialised here so it is never appended to or written
//                while still undefined
//   standalone - true once sparse output has been switched on
global.header = '';
global.fileheader = '';
global.standalone = false;

// Line-by-line reader over stdin.
const rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout,
  terminal: false,
});
61
/**
 * Determines the CMC tag for a token, or null if none applies.
 * The checks run in a fixed order; the first matching pattern wins.
 *
 * @param {string} word - Surface form of the token (CoNLL-U column 2).
 * @param {string} upos - Current value of CoNLL-U column 4.
 * @returns {string|null} The tag to assign, or null to leave the token as is.
 */
function cmcTagFor(word, upos) {
  // String#match is used on purpose: unlike RegExp#test it is not affected by
  // a leftover lastIndex should one of the patterns carry the global flag.
  if (word.match(emojiRegex)) {
    return 'EMOIMG';
  }
  if (word.match(actionWordRegex)) {
    return 'AKW';
  }
  if (word.match(emoticonRegex)) {
    return 'EMOASC';
  }
  if (word.match(hashtagRegex)) {
    return 'HST';
  }
  if (word.match(urlRegex)) {
    return 'URL';
  }
  if (word.match(emailRegex)) {
    return 'EML';
  }
  // Tokens already tagged as proper nouns are not re-tagged as @addresses.
  if (!/^(NE|PROPN)/.test(upos) && word.match(addressRegex)) {
    return 'ADR';
  }
  return null;
}

/**
 * Handles one line of CoNLL-U input and writes the result to stdout.
 *
 * - A "# foundry" comment is rewritten to "# foundry = cmc" when it names the
 *   "base" foundry (this also switches on sparse output if options.sparse is
 *   set); any other foundry comment is passed through unchanged.
 * - In normal mode every line is echoed; token lines that match one of the
 *   CMC patterns get columns 4 and 5 replaced by the tag and column 6 by "_".
 * - In sparse mode (global.standalone) only annotated token lines are
 *   written, preceded once by the buffered file and sentence comment lines.
 *
 * Reads and updates global.standalone, global.header and global.fileheader.
 *
 * @param {string} line - One input line without its trailing newline.
 * @returns {void}
 */
function parseConllu(line) {
  // Anchored at the line start so that a token such as "#foundry" in a token
  // line is not mistaken for the foundry comment.
  if (/^#\s*foundry/.test(line)) {
    if (/=\s*base/.test(line)) {
      if (options.sparse) {
        global.standalone = true;
      }
      process.stdout.write('# foundry = cmc\n');
    } else {
      process.stdout.write(`${line}\n`);
    }
    return;
  }

  if (global.standalone) {
    if (/^#\s*filename/.test(line)) {
      // A new file starts: restart the file-level header buffer.
      global.fileheader = `${line}\n`;
      return;
    }
    if (/^#\s*text_id/.test(line)) {
      global.fileheader = `${global.fileheader || ''}${line}\n`;
      return;
    }
    if (line.startsWith('#')) {
      global.header += `${line}\n`;
      return;
    }
    if (line.trim() === '') {
      // An empty header buffer means the sentence header was already flushed
      // because a token got annotated, so terminate that sentence.
      if (global.header === '') {
        process.stdout.write('\n');
      }
      global.header = '';
      return;
    }
  } else if (!/^\d/.test(line)) {
    // Not a token line: echo unchanged.
    process.stdout.write(`${line}\n`);
    return;
  }

  const columns = line.trim().split('\t');
  if (columns.length < 4) {
    // Malformed token line without word/POS columns: nothing to annotate.
    if (!global.standalone) {
      process.stdout.write(`${line}\n`);
    }
    return;
  }

  const tag = cmcTagFor(columns[1], columns[3]);
  if (tag === null) {
    if (!global.standalone) {
      process.stdout.write(`${line}\n`);
    }
    return;
  }

  columns[3] = tag;
  columns[4] = tag;
  columns[5] = '_';
  if (global.standalone) {
    // Emit the buffered comment lines once, ahead of the first annotated
    // token they belong to.
    process.stdout.write(global.fileheader || '');
    process.stdout.write(global.header || '');
    global.header = '';
    global.fileheader = '';
  }
  process.stdout.write(`${columns.join('\t')}\n`);
}
131
// Annotate every input line as it arrives.
rl.on('line', (inputLine) => parseConllu(inputLine));

// Exit explicitly once the input is exhausted -- important, otherwise the
// process will hang.
rl.on('close', function onInputClosed() {
  process.exit(0);
});