#!/usr/bin/env node

// Token classifiers. Every pattern is anchored at the start of the token.

// ASCII emoticons such as ":-)", ";P" or "<3"; the lookahead requires the
// emoticon to be followed by whitespace, one of "!.?" or the end of the token.
// NOTE(review): `\D` inside the third alternative's character class matches
// any non-digit, which makes the other characters listed there redundant;
// possibly a literal `D` was intended — confirm before changing.
const emoticonRegex = /^(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)/;
// "#" followed by at least one ASCII letter or digit.
const hashtagRegex = /^#[a-zA-Z0-9]+/;
// ftp://, ftps://, http:// or https:// followed by non-whitespace.
const urlRegex = /^(ftp|http)s?:\/\/[^\s]+/;
// local-part@domain.tld; the TLD is at least two ASCII letters. (The class
// used to be `[A-Z|a-z]`, which also accepted a literal "|" in the TLD.)
const emailRegex = /^\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/;
// "@" followed by at least one ASCII letter or digit.
const addressRegex = /^@[a-zA-Z0-9]+/;
// A whole token wrapped in colons, e.g. ":grins:".
const actionWordRegex = /^:[^:]+:$/;
// A whole token of the form "[_EMOJI:...]".
const wikiEmojiRegex = /^\[_EMOJI:[^\]]+\]$/;

/**
 * Reduce an emoji (possibly a modified or joined sequence) to its base emoji.
 *
 * Skin tone modifiers (U+1F3FB-U+1F3FF), zero-width joiners (U+200D) and
 * variation selectors (U+FE0E, U+FE0F) are removed; the first code point of
 * what remains is returned.
 *
 * @param {string} emoji - Emoji text to reduce.
 * @returns {string} The first remaining code point, or '' when nothing is left.
 */
function getBaseEmoji(emoji) {
  const withoutSkinTone = emoji.replace(/[\u{1F3FB}-\u{1F3FF}]/gu, '');
  const withoutJoiners = withoutSkinTone.replace(/\u200D/g, '');
  const bare = withoutJoiners.replace(/[\uFE0E\uFE0F]/g, '');

  // String destructuring iterates by code point, so an astral-plane emoji is
  // taken whole rather than as half of a surrogate pair. The default only
  // applies to an empty string, in which case that empty string is returned.
  const [firstCodePoint = bare] = bare;
  return firstCodePoint;
}

// Command-line options: used both for argument parsing and for the "Options"
// section of the usage guide below.
const optionDefinitions = [
  {
    name: 'sparse',
    alias: 's',
    type: Boolean,
    description: 'Print only the files, lines that have POS annotations.',
  },
  {
    name: 'help',
    alias: 'h',
    type: Boolean,
    description: 'Print this usage guide.',
  },
];

// Sections of the usage guide, in display order.
const sections = [
  {
    header: 'conllu-cmc',
    content: 'Reads CoNLL-U format from stdin and annotates emojis, emoticons, hashtags, URLs, email addresses, @addresses, and action words. Writes CoNLL-U format to stdout.',
  },
  {
    header: 'Synopsis',
    content: '$ conllu-cmc [-s] < input.conllu > output.conllu',
  },
  {
    header: 'Options',
    optionList: optionDefinitions,
  },
];

const getUsage = require('command-line-usage')
const commandLineArgs = require('command-line-args')

// Parse the command line. An unknown or malformed option is a usage error:
// the parser's message and the usage guide go to stderr and the process exits
// with a non-zero status, so that a bad invocation inside a
// `... < in > out` pipeline neither writes help text into the output file nor
// reports success. An explicit --help still prints to stdout and exits 0.
let options;
let hasUsageError = false;
try {
  options = commandLineArgs(optionDefinitions);
} catch (e) {
  console.error(e.message);
  options = { help: true };
  hasUsageError = true;
}

if (options.help) {
  const usage = getUsage(sections);
  if (hasUsageError) {
    console.error(usage);
    process.exit(1);
  }
  console.log(usage);
  process.exit(0);
}

const EmojiRegex = require('emoji-regex');
const emojiRegex = EmojiRegex();
const readline = require('readline');

// State shared across input lines while parsing:
//   header     - comment lines buffered for the current sentence
//   fileheader - buffered "# filename" / "# text_id" comment lines
//   standalone - true once sparse output has been switched on
// fileheader is initialised here as well, so that appending to it or writing
// it before any "# filename" line was seen yields an empty string instead of
// "undefined" or a ReferenceError.
global.header = '';
global.fileheader = '';
global.standalone = false;

// Line-oriented, non-interactive reader over stdin.
const rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout,
  terminal: false,
});


/**
 * Decide which CMC tag, if any, applies to a token.
 *
 * The checks run in a fixed order and the first match wins. `@name` tokens are
 * only tagged when the token's fourth column does not start with NE or PROPN.
 *
 * @param {string} word - Token form (second CoNLL-U column).
 * @param {string} upos - Fourth CoNLL-U column ('' when the line has none).
 * @returns {?string} The tag to assign, or null when the token is left alone.
 */
function classifyToken(word, upos) {
  if (word.match(wikiEmojiRegex)) {
    return 'EMOWIKI';
  }
  // Keep String#match here: emojiRegex may carry the g flag, and match()
  // does not depend on a stale lastIndex the way test()/exec() would.
  if (word.match(emojiRegex)) {
    return 'EMOIMG';
  }
  if (word.match(actionWordRegex)) {
    return 'AKW';
  }
  if (word.match(emoticonRegex)) {
    return 'EMOASC';
  }
  if (word.match(hashtagRegex)) {
    // This branch previously assigned nothing, so hashtags were never tagged.
    return 'HST';
  }
  if (word.match(urlRegex)) {
    return 'URL';
  }
  if (word.match(emailRegex)) {
    return 'EML';
  }
  if (!upos.match("^(NE|PROPN)") && word.match(addressRegex)) {
    return 'ADR';
  }
  return null;
}

/**
 * Process one line of CoNLL-U input and write the result to stdout.
 *
 * - A "# foundry" comment naming "base" is rewritten to "# foundry = cmc";
 *   with --sparse this also switches on sparse ("standalone") output. Any
 *   other foundry comment is echoed unchanged.
 * - Normal mode: every line is echoed; token lines that match one of the
 *   token patterns get the tag in column 5 and "_" in column 6.
 * - Sparse mode: comment lines are buffered and only written, followed by the
 *   token line, once a token of that sentence receives a tag; untagged token
 *   lines are dropped. "# eof"/"# eot" comments are always echoed.
 *
 * @param {string} line - One input line without its trailing newline.
 * @returns {void}
 */
function parseConllu(line) {
  if (line.match('#\\s*foundry')) {
    if (line.match('=\\s*base')) {
      if (options.sparse) {
        global.standalone = true;
      }
      process.stdout.write("# foundry = cmc\n");
    } else {
      process.stdout.write(`${line}\n`);
    }
    return;
  }

  if (global.standalone) {
    if (line.match('^#\\s*filename')) {
      global.fileheader = `${line}\n`;
      return;
    }
    if (line.match('^#\\s*text_id')) {
      // fileheader may still be unset if no "# filename" line came first.
      global.fileheader = `${global.fileheader || ''}${line}\n`;
      return;
    }
    if (line.match('^#\\s*eo[ft]')) {
      process.stdout.write(`${line}\n`);
      return;
    }
    if (line.match('^#')) {
      global.header += `${line}\n`;
      return;
    }
    if (line.trim().match('^$')) {
      // An empty header buffer means it was flushed for a tagged token of
      // this sentence, so the sentence needs its terminating blank line.
      if (global.header === '') {
        process.stdout.write("\n");
      }
      global.header = '';
      return;
    }
  } else if (!line.match('^\\d+')) {
    process.stdout.write(`${line}\n`);
    return;
  }

  const columns = line.trim().split('\t');

  const word = columns[1];
  // Guard clause: if word is undefined, just output the line as-is
  if (!word) {
    process.stdout.write(`${line}\n`);
    return;
  }

  // Short token lines have no fourth column; treat it as empty instead of
  // throwing on undefined.
  const newTag = classifyToken(word, columns[3] || '');
  if (!newTag) {
    if (!global.standalone) {
      process.stdout.write(`${line}\n`);
    }
    return;
  }

  columns[4] = newTag;
  columns[5] = '_';
  // For EMOIMG tokens, set lemma to the base emoji (without modifiers)
  if (newTag === 'EMOIMG') {
    columns[2] = getBaseEmoji(word);
    columns[3] = 'EMOIMG';
  }
  if (global.standalone) {
    // Flush the buffered comments once, ahead of the first tagged token.
    process.stdout.write(global.fileheader || '');
    process.stdout.write(global.header || '');
    global.header = '';
    global.fileheader = '';
  }
  process.stdout.write(`${columns.join('\t')}\n`);
}

// Hand every input line to the annotator as it arrives.
rl.on('line', (inputLine) => {
  parseConllu(inputLine);
});

// End of input: exit explicitly, otherwise the process will hang.
rl.on('close', () => {
  process.exit(0);
});
