Initial import
diff --git a/src/index.js b/src/index.js
new file mode 100755
index 0000000..28603a9
--- /dev/null
+++ b/src/index.js
@@ -0,0 +1,133 @@
+#!/usr/bin/env node
+
+const emoticonRegex = /^(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)/; // ASCII emoticons at token start, e.g. ":-)" or "<3". NOTE(review): `\D` in the third alternative matches ANY non-digit; a literal `D` may have been intended - confirm before changing
+const hashtagRegex = /^#[a-zA-Z0-9]+/; // "#" followed by ASCII letters/digits
+const urlRegex = /^(ftp|http)s?:\/\/[^\s]+/; // ftp(s):// or http(s):// followed by non-whitespace
+const emailRegex = /^\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/; // TLD is letters only: inside [...] a "|" is a literal pipe, not alternation
+const addressRegex = /^@[a-zA-Z0-9]+/; // "@" followed by ASCII letters/digits
+const actionWordRegex = /^:.*:$/; // whole token wrapped in colons, e.g. ":grins:"
+
+// Command-line flags. The same list is handed to the argument parser and shown in the
+// "Options" section of the usage guide.
+const optionDefinitions = [
+  { name: 'sparse', alias: 's', type: Boolean, description: 'Print only the files, lines that have POS annotations.' },
+  { name: 'help', alias: 'h', type: Boolean, description: 'Print this usage guide.' },
+];
+
+// Usage guide sections, in display order: tool description, synopsis, option list.
+const sections = [
+  {
+    header: 'conllu2cmc',
+    content: 'Reads CoNLL-U format from stdin and annotates emojis, emoticons, hashtags, URLs, email addresses, @addresses, and action words. Writes CoNLL-U format to stdout.',
+  },
+  {
+    header: 'Synopsis',
+    content: '$ conllu2cmc [-s] < input.conllu > output.conllu',
+  },
+  { header: 'Options', optionList: optionDefinitions },
+];
+
+const getUsage = require('command-line-usage')
+const commandLineArgs = require('command-line-args')
+
+// Parse argv. A rejected command line is reported on stderr and then handled like --help.
+let options;
+try {
+  options = commandLineArgs(optionDefinitions);
+} catch (e) {
+  console.error(e.message);
+  options = { help: true, invalid: true };
+}
+if (options.help) {
+  console.log(getUsage(sections));
+  // An explicit --help is a successful run; only a rejected command line is a failure.
+  process.exit(options.invalid ? 1 : 0);
+}
+
+const EmojiRegex = require('emoji-regex');
+const emojiRegex = EmojiRegex();
+const readline = require('readline');
+// State shared with parseConllu across input lines.
+global.header = '';         // comment lines buffered in sparse mode
+global.fileheader = '';     // buffered "# filename" / "# text_id" lines; must start as a string because parseConllu appends to it and writes it
+global.standalone = false;  // set once "# foundry = base" is seen while --sparse is given
+
+const rl = readline.createInterface({
+  input: process.stdin,
+  output: process.stdout,
+  terminal: false,
+});
+
+
+/**
+ * Processes one CoNLL-U input line: re-tags CMC tokens and writes the result to stdout.
+ * Outside sparse mode every line is echoed; matching tokens get columns 4 and 5 (UPOS and
+ * XPOS in CoNLL-U) replaced and column 6 reset to "_", and "# foundry = base" becomes
+ * "# foundry = cmc". In sparse mode (--sparse, switched on by that foundry line) comment
+ * lines are buffered and written only ahead of the next re-tagged token; untagged tokens
+ * are dropped.
+ * @param {string} line - one input line, without its line terminator
+ */
+function parseConllu(line) {
+  // Anchored like the other comment checks, so a token such as "#foundry" is not caught here.
+  if (line.match('^#\\s*foundry')) {
+    if (line.match('=\\s*base')) {
+      if (options.sparse) global.standalone = true;
+      process.stdout.write("# foundry = cmc\n");
+    } else {
+      process.stdout.write(`${line}\n`);
+    }
+    return;
+  }
+  if (global.standalone) {
+    if (line.match('^#\\s*filename')) {
+      global.fileheader = `${line}\n`;
+      return;
+    } else if (line.match('^#\\s*text_id')) {
+      // fileheader may still be unset when no "# filename" line came first.
+      global.fileheader = `${global.fileheader || ''}${line}\n`;
+      return;
+    } else if (line.match('^#')) {
+      global.header += `${line}\n`;
+      return;
+    } else if (line.trim().match('^$')) {
+      if (global.header === '') process.stdout.write("\n");
+      global.header = '';
+      return;
+    }
+  } else if (!line.match('^\\d+')) {
+    process.stdout.write(`${line}\n`);
+    return;
+  }
+
+  const columns = line.trim().split('\t');
+  const word = columns[1];
+  if (word === undefined || columns[3] === undefined) {
+    // Not a tab-separated token line: nothing to tag, so echo it unless in sparse mode.
+    if (!global.standalone) process.stdout.write(`${line}\n`);
+    return;
+  }
+
+  // The first matching pattern wins, so order matters (":x:" is an action word, not an emoticon).
+  const rules = [[emojiRegex, 'EMOIMG'], [actionWordRegex, 'AKW'], [emoticonRegex, 'EMOASC'],
+    [hashtagRegex, 'HST'], [urlRegex, 'URL'], [emailRegex, 'EML']];
+  const hit = rules.find(([pattern]) => word.match(pattern));
+  let new_tag = hit ? hit[1] : null;
+  // "@name" tokens are tagged unless column 4 already starts with NE or PROPN.
+  if (!new_tag && !columns[3].match("^(NE|PROPN)") && word.match(addressRegex)) new_tag = 'ADR';
+  if (new_tag) {
+    columns[3] = columns[4] = new_tag;
+    columns[5] = '_';
+    if (global.standalone) {
+      // Flush the buffered comments once, ahead of the first tagged token.
+      process.stdout.write((global.fileheader || '') + global.header);
+      global.header = global.fileheader = '';
+    }
+    process.stdout.write(columns.join('\t') + '\n');
+  } else if (!global.standalone) {
+    process.stdout.write(`${line}\n`);
+  }
+}
+
+// Feed every input line to parseConllu; exit explicitly at end of input, otherwise the process will hang.
+rl.on('line', parseConllu).on('close', () => process.exit(0));