blob: cd7f11f1bc665fa05c1aa17f05f545e08196a26c [file] [log] [blame]
#!/usr/bin/env node
// Patterns for computer-mediated-communication (CMC) phenomena. Each one is matched
// against a single token and is anchored at the start of that token.

// ASCII emoticons: `:name:` shortcodes, hearts (`<3`, `</3`, `<\3`) and smileys in
// either orientation, followed by whitespace, `!`, `.`, `?` or the end of the token.
// NOTE(review): in the first character class, `\\\D` is an escaped backslash followed
// by `\D` (any non-digit), so that class accepts far more than the characters listed.
// A literal `D` may have been intended — confirm before tightening the pattern.
const emoticonRegex = /^(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)/;

// Hashtags: `#` followed by at least one ASCII letter or digit.
const hashtagRegex = /^#[a-zA-Z0-9]+/;

// URLs with an ftp, ftps, http or https scheme.
const urlRegex = /^(ftp|http)s?:\/\/[^\s]+/;

// E-mail addresses. The top-level domain must consist of letters only; the class is
// `[A-Za-z]` rather than `[A-Z|a-z]`, because `|` inside a character class is a literal
// pipe and would wrongly be accepted as part of the domain.
const emailRegex = /^\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/;

// Addressing terms: `@` followed by at least one ASCII letter or digit.
const addressRegex = /^@[a-zA-Z0-9]+/;

// Action words: the whole token is wrapped in colons, e.g. `:lach:`.
const actionWordRegex = /^:[^:]+:$/;

// Wiki emoji placeholders: the whole token has the form `[_EMOJI:...]`.
const wikiEmojiRegex = /^\[_EMOJI:[^\]]+\]$/;
// Builds the definition of one boolean command-line switch.
const booleanFlag = (name, alias, description) => ({
  name,
  alias,
  type: Boolean,
  description,
});

// Command-line switches understood by the tool; also rendered in the usage guide.
const optionDefinitions = [
  booleanFlag('sparse', 's', 'Print only the files, lines that have POS annotations.'),
  booleanFlag('help', 'h', 'Print this usage guide.'),
];

// Sections of the usage guide, in display order: description, synopsis, options.
const sections = [
  {
    header: 'conllu-cmc',
    content:
      'Reads CoNLL-U format from stdin and annotates emojis, emoticons, hashtags, URLs, email addresses, @addresses, and action words. Writes CoNLL-U format to stdout.',
  },
  {
    header: 'Synopsis',
    content: '$ conllu-cmc [-s] < input.conllu > output.conllu',
  },
  {
    header: 'Options',
    optionList: optionDefinitions,
  },
];
const getUsage = require('command-line-usage')
const commandLineArgs = require('command-line-args')
// Parse the command line. A malformed invocation is reported on stderr and is then
// handled exactly like an explicit --help request.
const options = (() => {
  try {
    return commandLineArgs(optionDefinitions);
  } catch (err) {
    console.error(err.message);
    return { help: true };
  }
})();

// Print the usage guide and stop. The exit status is 1 both for --help and for a
// command line that failed to parse.
if (options.help) {
  console.log(getUsage(sections));
  process.exit(1);
}
// The emoji-regex package exports a factory; it is called once here and the resulting
// regex is reused for every token.
const EmojiRegex = require('emoji-regex');
const emojiRegex = EmojiRegex();
// Line-by-line reader used to consume the input from stdin.
const readline = require('readline');
// Parser state shared across input lines (kept on `global`, where parseConllu reads it).
// header:     comment lines of the current sentence, buffered in sparse mode.
// fileheader: `# filename` / `# text_id` lines, buffered in sparse mode. It must start
//             as an empty string: appending to an unset value would produce the text
//             "undefined...", and reading it before any assignment would throw.
// standalone: true once sparse output has been switched on.
global.header = '';
global.fileheader = '';
global.standalone = false;
// Settings for the line reader: consume stdin, attach stdout as the output stream,
// and disable terminal (TTY) handling.
const lineReaderSettings = {
  terminal: false,
  input: process.stdin,
  output: process.stdout,
};

// Reader that emits one 'line' event per input line.
const rl = readline.createInterface(lineReaderSettings);
/**
 * Picks the CMC tag for a token, or null when none applies.
 *
 * The checks run in a fixed order and the first match wins: wiki emoji, emoji,
 * action word, ASCII emoticon, hashtag, URL, e-mail address, @address.
 *
 * @param {string} word - Token text (second column of the token line).
 * @param {string} upos - Existing tag from the fourth column; an @address is not
 *   re-tagged when this starts with NE or PROPN.
 * @returns {string|null} The new tag, or null to leave the token unchanged.
 */
function classifyToken(word, upos) {
  if (word.match(wikiEmojiRegex)) {
    return 'EMOWIKI';
  }
  // String#match (not RegExp#test) is used on purpose: should emojiRegex carry the
  // global flag, match() does not leak lastIndex state from one token to the next.
  if (word.match(emojiRegex)) {
    return 'EMOIMG';
  }
  if (word.match(actionWordRegex)) {
    return 'AKW';
  }
  if (word.match(emoticonRegex)) {
    return 'EMOASC';
  }
  if (word.match(hashtagRegex)) {
    return 'HST';
  }
  if (word.match(urlRegex)) {
    return 'URL';
  }
  if (word.match(emailRegex)) {
    return 'EML';
  }
  if (!/^(NE|PROPN)/.test(upos) && word.match(addressRegex)) {
    return 'ADR';
  }
  return null;
}

/**
 * Processes one line of CoNLL-U input and writes the result to stdout.
 *
 * - A `# foundry = base` comment is rewritten to `# foundry = cmc`; when the sparse
 *   option is set it also switches on sparse ("standalone") output. Other foundry
 *   comments are passed through unchanged.
 * - Normal mode: every line is echoed; token lines that receive a CMC tag get the tag
 *   in columns 4 and 5 and `_` in column 6.
 * - Sparse mode: only annotated tokens are printed, preceded once by the buffered
 *   file-level and sentence-level comments. `# eof` / `# eot` comments are always
 *   printed. A blank line is printed only when no sentence comments are pending.
 *
 * Lines that look like tokens but have no word column are passed through in normal
 * mode and dropped in sparse mode instead of raising an error.
 *
 * State is kept in global.header, global.fileheader and global.standalone.
 *
 * @param {string} line - One input line without its trailing newline.
 * @returns {void}
 */
function parseConllu(line) {
  if (/#\s*foundry/.test(line)) {
    if (/=\s*base/.test(line)) {
      if (options.sparse) {
        global.standalone = true;
      }
      process.stdout.write('# foundry = cmc\n');
    } else {
      process.stdout.write(`${line}\n`);
    }
    return;
  }

  if (global.standalone) {
    if (/^#\s*filename/.test(line)) {
      // A new file starts: discard any previously buffered file-level comments.
      global.fileheader = `${line}\n`;
      return;
    }
    if (/^#\s*text_id/.test(line)) {
      // `|| ''` guards against the buffer never having been initialised.
      global.fileheader = `${global.fileheader || ''}${line}\n`;
      return;
    }
    if (/^#\s*eo[ft]/.test(line)) {
      process.stdout.write(`${line}\n`);
      return;
    }
    if (/^#/.test(line)) {
      global.header = `${global.header || ''}${line}\n`;
      return;
    }
    if (line.trim() === '') {
      // An empty buffer means the sentence comments were already flushed (or there
      // were none), so the sentence separator is emitted.
      if (!global.header) {
        process.stdout.write('\n');
      }
      global.header = '';
      return;
    }
  } else if (!/^\d+/.test(line)) {
    process.stdout.write(`${line}\n`);
    return;
  }

  const columns = line.trim().split('\t');
  const word = columns[1];
  const tag = word === undefined ? null : classifyToken(word, columns[3] || '');

  if (tag === null) {
    // Unannotated (or malformed) lines are echoed in normal mode, dropped in sparse mode.
    if (!global.standalone) {
      process.stdout.write(`${line}\n`);
    }
    return;
  }

  columns[3] = tag;
  columns[4] = tag;
  columns[5] = '_';
  if (global.standalone) {
    // Flush pending comments once, ahead of the first annotated token they belong to.
    process.stdout.write(global.fileheader || '');
    process.stdout.write(global.header || '');
    global.fileheader = '';
    global.header = '';
  }
  process.stdout.write(`${columns.join('\t')}\n`);
}
// Run every input line through the annotator, and exit explicitly once the input is
// exhausted — without the exit call the process would hang.
rl.on('line', parseConllu).on('close', () => {
  process.exit(0);
});