| #!/usr/bin/env node |
| |
// Token-level patterns used to recognise computer-mediated-communication
// (CMC) phenomena. Each pattern is anchored at the start of the token.

// ASCII emoticons such as :-) ;) <3 or :name: style shortcuts, followed by
// whitespace, sentence punctuation, or the end of the token.
// NOTE(review): inside the first character class, `\\\D` is an escaped
// backslash followed by `\D` (any non-digit), so that class accepts every
// non-digit character. Possibly a literal `D` was intended -- confirm before
// changing, since it alters which tokens are tagged.
const emoticonRegex = /^(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)/;

// "#" followed by at least one ASCII letter or digit.
const hashtagRegex = /^#[a-zA-Z0-9]+/;

// ftp://, ftps://, http:// or https:// followed by non-whitespace characters.
const urlRegex = /^(ftp|http)s?:\/\/[^\s]+/;

// E-mail address. The top-level domain is letters only: the previous class
// `[A-Z|a-z]` also accepted a literal "|" (e.g. "a@b.c|d").
const emailRegex = /^\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/;

// "@" followed by at least one ASCII letter or digit.
const addressRegex = /^@[a-zA-Z0-9]+/;

// A whole token wrapped in colons with no colon inside, e.g. ":grins:".
const actionWordRegex = /^:[^:]+:$/;

// A whole token of the form "[_EMOJI:...]".
const wikiEmojiRegex = /^\[_EMOJI:[^\]]+\]$/;
| |
// Command-line flags. This list is passed to the argument parser and is also
// rendered in the "Options" section of the usage guide below.
const optionDefinitions = [
  {
    name: 'sparse',
    alias: 's',
    type: Boolean,
    description: 'Print only the files, lines that have POS annotations.',
  },
  {
    name: 'help',
    alias: 'h',
    type: Boolean,
    description: 'Print this usage guide.',
  },
];

// Sections of the usage guide, in display order.
const sections = [
  {
    header: 'conllu-cmc',
    content: 'Reads CoNLL-U format from stdin and annotates emojis, emoticons, hashtags, URLs, email addresses, @addresses, and action words. Writes CoNLL-U format to stdout.',
  },
  {
    header: 'Synopsis',
    content: '$ conllu-cmc [-s] < input.conllu > output.conllu',
  },
  {
    header: 'Options',
    optionList: optionDefinitions,
  },
];
| |
| const getUsage = require('command-line-usage') |
| const commandLineArgs = require('command-line-args') |
| |
// Parse the command line. A malformed command line is reported on stderr and
// then handled exactly like an explicit --help request.
let options;
try {
  options = commandLineArgs(optionDefinitions);
} catch (parseError) {
  console.error(parseError.message);
  options = { help: true };
}

// Print the usage guide and stop before any input is read.
if (options.help) {
  console.log(getUsage(sections));
  process.exit(1);
}
| |
const EmojiRegex = require('emoji-regex');
const emojiRegex = EmojiRegex();
const readline = require('readline');

// State shared between successive parseConllu calls.
//   header     - comment lines buffered for the current sentence (sparse mode)
//   fileheader - buffered "# filename" / "# text_id" lines (sparse mode);
//                initialised here so that appending to it or writing it can
//                never see `undefined`
//   standalone - set once "# foundry = base" is seen while --sparse is active
global.header = '';
global.fileheader = '';
global.standalone = false;

// Line-oriented reader over stdin; terminal handling is switched off.
const rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout,
  terminal: false,
});
| |
| |
/**
 * Chooses the CMC tag for a single token.
 *
 * The checks run in a fixed order and the first match wins: wiki emoji,
 * image emoji, action word, ASCII emoticon, hashtag, URL, e-mail address,
 * @address. The @address tag is withheld when the existing tag starts with
 * NE or PROPN.
 *
 * @param {string} word - Token form (second CoNLL-U column).
 * @param {string} pos - Existing tag (fourth CoNLL-U column).
 * @returns {?string} The new tag, or null when no pattern applies.
 */
function cmcTagFor(word, pos) {
  if (word.match(wikiEmojiRegex)) {
    return 'EMOWIKI';
  }
  if (word.match(emojiRegex)) {
    return 'EMOIMG';
  }
  if (word.match(actionWordRegex)) {
    return 'AKW';
  }
  if (word.match(emoticonRegex)) {
    return 'EMOASC';
  }
  if (word.match(hashtagRegex)) {
    // This branch used to be empty, so hashtags were never annotated.
    return 'HST';
  }
  if (word.match(urlRegex)) {
    return 'URL';
  }
  if (word.match(emailRegex)) {
    return 'EML';
  }
  if (!pos.match('^(NE|PROPN)') && word.match(addressRegex)) {
    return 'ADR';
  }
  return null;
}

/**
 * Processes one line of CoNLL-U input and writes the result to stdout.
 *
 * Normal mode: every line is echoed; token lines that match a CMC pattern
 * get columns 4 and 5 replaced by the new tag and column 6 set to "_".
 *
 * Sparse ("standalone") mode, entered when --sparse is set and a
 * "# foundry = base" line is seen: comment lines are buffered and only
 * emitted together with the first annotated token that follows them;
 * tokens without an annotation are dropped.
 *
 * Reads and updates global.header, global.fileheader and global.standalone.
 *
 * @param {string} line - One input line without its line terminator.
 * @returns {void}
 */
function parseConllu(line) {
  // Anchored at line start so that a token line containing "#foundry"
  // (for example a hashtag) is not mistaken for the foundry comment.
  if (line.match('^#\\s*foundry')) {
    if (line.match('=\\s*base')) {
      if (options.sparse) {
        global.standalone = true;
      }
      process.stdout.write('# foundry = cmc\n');
    } else {
      process.stdout.write(`${line}\n`);
    }
    return;
  }

  if (global.standalone) {
    if (line.match('^#\\s*filename')) {
      global.fileheader = `${line}\n`;
      return;
    }
    if (line.match('^#\\s*text_id')) {
      // Fall back to '' so a text_id line that arrives before any filename
      // line does not produce the text "undefined".
      global.fileheader = `${global.fileheader || ''}${line}\n`;
      return;
    }
    if (line.match('^#\\s*eo[ft]')) {
      process.stdout.write(`${line}\n`);
      return;
    }
    if (line.match('^#')) {
      global.header = `${global.header || ''}${line}\n`;
      return;
    }
    if (line.trim().match('^$')) {
      // An empty header buffer means it was flushed with an annotated token
      // of this sentence, so the sentence-terminating blank line is emitted.
      if (!global.header) {
        process.stdout.write('\n');
      }
      global.header = '';
      return;
    }
  } else if (!line.match('^\\d+')) {
    process.stdout.write(`${line}\n`);
    return;
  }

  const columns = line.trim().split('\t');
  // A line with fewer than four columns has no form/tag pair to inspect;
  // treat it as "no annotation" instead of throwing on a missing column.
  const newTag = columns.length >= 4 ? cmcTagFor(columns[1], columns[3]) : null;

  if (newTag === null) {
    if (!global.standalone) {
      process.stdout.write(`${line}\n`);
    }
    return;
  }

  columns[3] = newTag;
  columns[4] = newTag;
  columns[5] = '_';
  if (global.standalone) {
    // Emit the buffered headers once, in front of the first annotated token.
    process.stdout.write(global.fileheader || '');
    process.stdout.write(global.header || '');
    global.header = '';
    global.fileheader = '';
  }
  process.stdout.write(`${columns.join('\t')}\n`);
}
| |
// Hand every line read from stdin to the annotator.
rl.on('line', parseConllu);

// Exit explicitly once the input is exhausted; otherwise the process will hang.
rl.on('close', function exitAfterInput() {
  process.exit(0);
});