package de.ids_mannheim.korapxmltools

import WorkerPool
import org.w3c.dom.Document
import org.w3c.dom.Element
import org.w3c.dom.NodeList
import org.xml.sax.InputSource
import picocli.CommandLine
import picocli.CommandLine.*
import java.io.File
import java.io.InputStream
import java.io.InputStreamReader
import java.util.*
import java.util.concurrent.Callable
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.ExecutorService
import java.util.concurrent.Executors
import java.util.logging.Level
import java.util.logging.Logger
import java.util.stream.IntStream
import java.util.zip.ZipFile
import javax.xml.parsers.DocumentBuilder
import javax.xml.parsers.DocumentBuilderFactory
import kotlin.math.min
import kotlin.system.exitProcess

@Command(
    name = "KorapXml2Conllu",
    mixinStandardHelpOptions = true,
    version = ["KorapXml2Conllu 2.0-alpha-01"],
    description = ["Converts KorAP-XML <https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml> base or " +
            "morpho zips to CoNLL(-U) format with all information necessary for " +
            "reconstruction in comment lines."]
)

class KorapXml2Conllu : Callable<Int> {
    /**
     * True when the `COMPATIBILITY_MODE` environment variable is set (to any value).
     * In that mode [printConlluToken] prints the xpos value in the upos column whenever upos is "_".
     */
    val COMPATIBILITY_MODE = System.getenv("COMPATIBILITY_MODE") != null

    /** Positional arguments: the zip files handed to [korapxml2conllu] by [call]. */
    @Parameters(arity = "1..*", description = ["At least one zip file name"])
    var zipFileNames: Array<String>? = null

    /** Optional regex; documents whose `docid` attribute contains no match are skipped. */
    @Option(
        names = ["--sigle-pattern", "-p"],
        paramLabel = "PATTERN",
        description = ["Extract only documents with sigle matching the pattern (regex)"]
    )
    var siglePattern: String? = null

    /**
     * Regex that must match a complete "element/attribute" name from structure.xml for that
     * attribute to be emitted as a comment line. Empty (the default) disables the extraction.
     */
    @Option(
        names = ["--extract-attributes-regex", "-e"],
        paramLabel = "REGEX",
        description = ["Extract additional attribute values from structure.xml and writes them as comment line in front of the first covered token.",
            "Example: -e '(posting/id|div/id)'"]
    )
    var extractAttributesRegex: String = ""

    /** Declared for the command line only; not read anywhere else in this class. */
    @Option(
        names = ["--s-bounds-from-morpho"], description = ["Not yet implemented: s bounds from morpho"]
    )
    var sBoundsFromMorpho: Boolean = false

    /** Name of the java.util.logging level applied to the logger in [call]. */
    @Option(
        names = ["--log", "-l"],
        paramLabel = "LEVEL",
        description = ["Log level: one of SEVERE, WARNING, INFO, FINE, FINER, FINEST. Default: ${"$"}{DEFAULT-VALUE}"]
    )
    var logLevel: String = "WARNING"

    /** Number of CoNLL-U columns passed to [printConlluToken] for every token. */
    @Option(
        names = ["--columns", "-c"],
        paramLabel = "NUMBER",
        description = ["Number of columns. 1 means just the token. Default: ${"$"}{DEFAULT-VALUE}", "Possible values: 1-10"]
    )
    var columns: Int = 10

    /** Switches [processText] from CoNLL-U output to one-sentence-per-line plain token output. */
    @Option(
        names = ["--word2vec", "-w"],
        description = ["Print text in LM training format: tokens separated by space, sentences separated by newline"]
    )
    var lmTrainingData: Boolean = false

    /** Declared for the command line only; not read anywhere else in this class. */
    @Option(
        names = ["--token-separator", "-s"],
        paramLabel = "SEPARATOR",
        description = ["Not yet implemented: token separator"]
    )
    var tokenSeparator: String = "\n"

    /** Declared for the command line only; not read anywhere else in this class. */
    @Option(names = ["--offsets"], description = ["Not yet implemented: offsets"])
    var offsets: Boolean = false

    /** Declared for the command line only; not read anywhere else in this class. */
    @Option(names = ["--comments", "-C"], description = ["Not yet implemented: comments"])
    var comments: Boolean = false

    /**
     * Regexes applied to each header.xml; the first capture group of every regex that matches
     * becomes one metadata value of the document.
     */
    @Option(
        names = ["--extract-metadata-regex", "-m"],
        paramLabel = "REGEX",
        description = ["Extract metadata regexes.\nExample: -m '<textSigle>([^<]+)' -m '<creatDate>([^<]+)'"]
    )
    var extractMetadataRegex: MutableList<String> = mutableListOf()

    /** External command; when non-empty the output is pushed to a WorkerPool instead of stdout. */
    @Option(
        names = ["--annotate-with", "-A"],
        paramLabel = "COMMAND",
        description = ["Pipe output through command"]
    )
    var annotateWith: String = ""

    /** Size of the executor thread pool; the same number is handed to the WorkerPool. */
    @Option(
        names = ["--threads", "-T"],
        paramLabel = "THREADS",
        description = ["Maximum number of threads to use. Default: ${"$"}{DEFAULT-VALUE}"]
    )
    var threads: Int = Runtime.getRuntime().availableProcessors()

    /**
     * Command entry point invoked by picocli after option parsing.
     *
     * Applies the requested log level (falling back to WARNING when the name is not a valid
     * level) and then converts all zip files given on the command line.
     *
     * @return always 0
     */
    override fun call(): Int {
        LOGGER.level = try {
            // Locale.ROOT keeps the upper-casing locale-independent: with e.g. a Turkish default
            // locale "info" would become "İNFO", which Level.parse rejects.
            Level.parse(logLevel.uppercase(Locale.ROOT))
        } catch (e: IllegalArgumentException) {
            LOGGER.warning("Invalid log level: $logLevel. Defaulting to WARNING.")
            Level.WARNING
        }

        val zipFiles = requireNotNull(zipFileNames) { "At least one zip file name is required" }
        LOGGER.info("Processing zip files: " + zipFiles.joinToString(", "))

        korapxml2conllu(zipFiles)
        return 0
    }

    // Logger shared by all methods of this class; its level is set from --log in call().
    private val LOGGER: Logger = Logger.getLogger(KorapXml2Conllu::class.java.name)

    // Created in korapxml2conllu() only when --annotate-with is non-empty; processText() then
    // pushes its output to this pool instead of printing it.
    private var workerPool : WorkerPool? = null

    // Per-document state, keyed by the "docid" attribute of the XML root element.
    // processText() removes a document's entries from all of these maps after emitting it.
    // Content of the first <text> element of data.xml.
    val texts: ConcurrentHashMap<String, String> = ConcurrentHashMap()
    // Sentence spans read from structure.xml.
    val sentences: ConcurrentHashMap<String, Array<Span>> = ConcurrentHashMap()
    // Token spans read from tokens.xml or morpho.xml.
    val tokens: ConcurrentHashMap<String, Array<Span>> = ConcurrentHashMap()
    // Morphological annotations from morpho.xml; the inner key is "<from>-<to>".
    val morpho: ConcurrentHashMap<String, MutableMap<String, MorphoSpan>> = ConcurrentHashMap()
    // Zip entry name of the tokens.xml / morpho.xml the document was read from.
    val fnames: ConcurrentHashMap<String, String> = ConcurrentHashMap()
    // Values captured by the --extract-metadata-regex patterns in header.xml.
    val metadata: ConcurrentHashMap<String, Array<String>> = ConcurrentHashMap()
    // Comment lines built from structure.xml attributes; the inner key is the span's "from" offset.
    val extraFeatures: ConcurrentHashMap<String, MutableMap<String, String>> = ConcurrentHashMap()
    // Set when more than one zip file is processed or a morpho.xml entry is seen; while true a
    // document is only emitted early once its morpho annotations are present.
    // NOTE(review): written from worker threads without @Volatile — confirm visibility is not an issue.
    var waitForMorpho: Boolean = false
    /**
     * Converts the given KorAP-XML zip files and emits the result document by document.
     *
     * If a single annotation zip (`<name>.<foundry>.zip`) is given and the corresponding base
     * zip (`<name>.zip`) exists, the base zip is processed as well. Every zip file is handled
     * by its own task on a fixed thread pool. Documents that could not be emitted while the
     * zips were being read are emitted afterwards in a final pass.
     *
     * @param args paths of the zip files to convert
     */
    fun korapxml2conllu(args: Array<String>) {
        val executor: ExecutorService = Executors.newFixedThreadPool(threads)

        if (annotateWith != "") {
            workerPool = WorkerPool(annotateWith, threads, LOGGER)
        }

        var zips: Array<String> = args
        if (args.size == 1 && args[0].matches(Regex(".*\\.([^/.]+)\\.zip$"))) {
            val baseZip = args[0].replace(Regex("\\.([^/.]+)\\.zip$"), ".zip")
            if (File(baseZip).exists()) {
                zips = arrayOf(baseZip, zips[0])
                LOGGER.info("Processing base zip file: $baseZip")
            }
        }
        waitForMorpho = zips.size > 1

        // The foundry depends only on the list of zip names, so it is determined once.
        val foundry = getFoundryFromZipFileNames(zips)
        zips.forEach { zipFilePath ->
            executor.submit {
                processZipFile(
                    zipFilePath,
                    foundry
                )
            }
        }

        executor.shutdown()
        // Block until all tasks are done instead of spinning on isTerminated, which kept one
        // core busy for the whole run.
        while (!executor.awaitTermination(1L, java.util.concurrent.TimeUnit.SECONDS)) {
            // still running; keep waiting
        }

        // Final pass: emit the documents that are still pending.
        texts.keys.sorted().parallelStream().forEach { docId ->
            if (!tokens.containsKey(docId)) {
                val morphoSpans = morpho[docId]
                if (morphoSpans == null) {
                    // Without tokens.xml and morpho.xml there is nothing to print for this text.
                    LOGGER.warning("Skipping $docId: neither token nor morpho annotations were found")
                    return@forEach
                }
                tokens[docId] = getTokenSpansFromMorho(morphoSpans)
            }
            processText(
                docId,
                fnames[docId]?.let { fileName -> getFoundryFromZipFileName(fileName) } ?: "base",
                true
            )
        }
        if (annotateWith.isNotEmpty()) {
            LOGGER.info("closing worker pool")
            workerPool?.close()
        }
    }


    /**
     * Derives token spans from the keys of a morpho map.
     *
     * Every key has the form "<from>-<to>"; the resulting spans are ordered by their start offset.
     *
     * @param morpho morphological annotations keyed by "<from>-<to>"
     * @return one span per key, sorted ascending by `from`
     */
    private fun getTokenSpansFromMorho(morpho: MutableMap<String, MorphoSpan>): Array<Span> {
        val spans = ArrayList<Span>(morpho.size)
        for (offsets in morpho.keys) {
            val parts = offsets.split("-")
            spans.add(Span(parts[0].toInt(), parts[1].toInt()))
        }
        spans.sortBy { span -> span.from }
        return spans.toTypedArray()
    }

    /**
     * Extracts the foundry name from a zip file name of the form `<name>.<foundry>.zip`.
     *
     * @param zipFileName file name or path of a zip file
     * @return the part between the last two dots before `zip`, or "base" when the name
     *         does not have that form
     */
    private fun getFoundryFromZipFileName(zipFileName: String): String {
        val foundryMatch = Regex(".*\\.([^/.]+)\\.zip$").matchEntire(zipFileName)
        return foundryMatch?.groupValues?.get(1) ?: "base"
    }

    /**
     * Returns the foundry of the first zip file name that denotes an annotation zip.
     *
     * @param zipFileNames zip file names in the order they should be inspected
     * @return the first foundry other than "base", or "base" if there is none
     */
    private fun getFoundryFromZipFileNames(zipFileNames: Array<String>): String =
        zipFileNames.asSequence()
            .map { name -> getFoundryFromZipFileName(name) }
            .firstOrNull { candidate -> candidate != "base" }
            ?: "base"

    /**
     * Reads one KorAP-XML zip file and stores what it finds in the per-document maps.
     *
     * Entries named data.xml, tokens.xml, structure.xml and morpho.xml are parsed as XML;
     * header.xml entries are scanned with the metadata regexes when any are configured.
     * As soon as all parts required for a document are available, the document is emitted
     * via [processText]. Failures are printed and do not stop the processing of other entries.
     *
     * NOTE(review): entries are processed in parallel, so two threads may both see a document
     * as complete and emit it twice — confirm whether that can happen in practice.
     *
     * @param zipFilePath path of the zip file to read
     * @param foundry foundry name written to the output header of emitted documents
     */
    private fun processZipFile(
        zipFilePath: String,
        foundry: String = "base",

    ) {
        try {
            // All patterns are compiled once per zip file rather than once per zip entry.
            val annotationFilePattern = Regex(".*(data|tokens|structure|morpho)\\.xml$")
            val headerFilePattern = Regex(".*/header\\.xml$")
            val leadingPathPattern = Regex(".*?/([^/]+\\.xml)$")
            val textSiglePattern = Regex("<textSigle>([^<]+)</textSigle>")
            val sigleFilter = siglePattern?.let { pattern -> Regex(pattern) }
            val metadataPatterns = extractMetadataRegex.map { pattern -> Regex(pattern) }

            ZipFile(zipFilePath).use { zipFile ->
                zipFile.stream().parallel().forEach { zipEntry ->
                    try {
                        if (zipEntry.name.matches(annotationFilePattern)) {
                            // A new builder per entry: entries are parsed on several threads.
                            val dbFactory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance()
                            val dBuilder: DocumentBuilder = dbFactory.newDocumentBuilder()
                            // Close the entry stream right after parsing instead of keeping it
                            // open until the whole zip file is closed.
                            val doc: Document = zipFile.getInputStream(zipEntry).use { inputStream ->
                                dBuilder.parse(InputSource(InputStreamReader(inputStream, "UTF-8")))
                            }

                            doc.documentElement.normalize()
                            val docId: String = doc.documentElement.getAttribute("docid")
                            if (sigleFilter != null && !sigleFilter.containsMatchIn(docId)) {
                                return@forEach
                            }
                            // Reduce the entry path to its last component, e.g. "tokens.xml".
                            val fileName = zipEntry.name.replace(leadingPathPattern, "$1")
                            when (fileName) {
                                "data.xml" -> {
                                    val textsList: NodeList = doc.getElementsByTagName("text")
                                    if (textsList.length > 0) {
                                        texts[docId] = textsList.item(0).textContent
                                    }
                                }

                                "structure.xml" -> {
                                    val spans: NodeList = doc.getElementsByTagName("span")
                                    if (extractAttributesRegex.isNotEmpty())
                                        extraFeatures[docId] = extractMiscSpans(spans)
                                    sentences[docId] = extractSentenceSpans(spans)
                                }

                                "tokens.xml" -> {
                                    // Keep a file name already recorded for this document (the
                                    // morpho.xml branch sets one). The former check used
                                    // ConcurrentHashMap.contains(), which tests values, not keys,
                                    // and therefore always overwrote the entry.
                                    fnames.putIfAbsent(docId, zipEntry.name)
                                    val tokenSpans: NodeList = doc.getElementsByTagName("span")
                                    tokens[docId] = extractSpans(tokenSpans)
                                }

                                "morpho.xml" -> {
                                    waitForMorpho = true
                                    fnames[docId] = zipEntry.name
                                    val fsSpans: NodeList = doc.getElementsByTagName("span")
                                    morpho[docId] = extractMorphoSpans(fsSpans)
                                    tokens[docId] = extractSpans(fsSpans)
                                }
                            }

                            // Emit the document once text, sentences and tokens are known and,
                            // if requested, morpho annotations and metadata have arrived too.
                            if (texts[docId] != null && sentences[docId] != null && tokens[docId] != null
                                && (!waitForMorpho || morpho[docId] != null)
                                && (extractMetadataRegex.isEmpty() || metadata.containsKey(docId))
                            ) {
                                processText(docId, foundry, waitForMorpho)
                            }
                        } else if (extractMetadataRegex.isNotEmpty() && zipEntry.name.matches(headerFilePattern)) {
                            val text = zipFile.getInputStream(zipEntry).bufferedReader().use { it.readText() }
                            // The text sigle with "/" replaced by "_" serves as the document id.
                            val docId = textSiglePattern.find(text)?.destructured?.component1()
                                ?.replace("/", "_")
                            LOGGER.info("Processing header file: " + zipEntry.name + " docId: " + docId)
                            val meta = metadataPatterns.mapNotNull { pattern ->
                                pattern.find(text)?.destructured?.component1()
                            }
                            if (meta.isNotEmpty() && docId != null) {
                                metadata[docId] = meta.toTypedArray()
                            }
                        }
                    } catch (e: Exception) {
                        e.printStackTrace()
                    }
                }
            }
        } catch (e: Exception) {
            e.printStackTrace()
        }
    }

    /**
     * Emits one document and removes it from all per-document maps.
     *
     * With `--word2vec` the tokens are written space separated, one sentence per line, each
     * line optionally prefixed with the tab separated metadata values. Otherwise CoNLL-U is
     * written: header comments, per-sentence offset comments, optional metadata and attribute
     * comments, and one line per token. The result goes to the worker pool when an annotation
     * command is configured, else to standard output.
     *
     * Tokens that start after the end of the last known sentence (or all tokens, when the
     * document has no sentence spans) are kept in the current block instead of failing with an
     * index error.
     *
     * @param docId id of the document to emit
     * @param foundry foundry name written to the `# foundry` header line
     * @param waitForMorpho whether morpho annotations should be looked up for the tokens
     */
    private fun processText(
        docId: String,
        foundry: String,
        waitForMorpho: Boolean,
    ) {
        var token_index = 0
        var real_token_index = 0
        var sentence_index = 0
        // Read the maps once so every token of this document is rendered from the same data.
        val sentenceSpans: Array<Span> = sentences[docId] ?: emptyArray()
        val tokenSpans: Array<Span> = tokens[docId] ?: emptyArray()
        val text: String? = texts[docId]
        val output: StringBuilder
        if (lmTrainingData) {
            output = StringBuilder()
            if (extractMetadataRegex.isNotEmpty()) {
                output.append(metadata[docId]?.joinToString("\t", postfix = "\t") ?: "")
            }
            tokenSpans.forEach { span ->
                token_index++
                // The index check keeps tokens behind the last sentence from running off the array.
                if (sentence_index < sentenceSpans.size && span.from >= sentenceSpans[sentence_index].to) {
                    // Turn the separator written after the previous token into the line break.
                    if (output.length > 0) {
                        output.setCharAt(output.length - 1, '\n')
                    } else {
                        output.append("\n")
                    }
                    if (extractMetadataRegex.isNotEmpty() && real_token_index < tokenSpans.size - 1) {
                        output.append(metadata[docId]?.joinToString("\t", postfix = "\t") ?: "")
                    }
                    sentence_index++
                }
                val form = checkNotNull(text) { "No text available for $docId" }.substring(span.from, span.to)
                output.append(form, " ")
                real_token_index++
            }
            // Drop the separator after the last token.
            if (output.length > 0) {
                output.deleteCharAt(output.length - 1)
            }
        } else {
            output = StringBuilder("# foundry = $foundry\n# filename = ${fnames[docId]}\n# text_id = $docId\n")
            // Offsets can only be reported when there is at least one token.
            if (tokenSpans.isNotEmpty()) {
                output.append(
                    tokenOffsetsInSentence(
                        sentences, docId, sentence_index, real_token_index, tokens
                    )
                )
            }
            if (extractMetadataRegex.isNotEmpty()) {
                output.append(metadata[docId]?.joinToString("\t", prefix = "# metadata=", postfix = "\n") ?: "")
            }
            val features = if (extractAttributesRegex.isNotEmpty()) extraFeatures[docId] else null
            val morphoSpans = if (waitForMorpho) morpho[docId] else null
            var previousSpanStart = 0
            tokenSpans.forEach { span ->
                token_index++
                // The index check keeps tokens behind the last sentence from running off the array.
                if (sentence_index < sentenceSpans.size && span.from >= sentenceSpans[sentence_index].to) {
                    output.append("\n")
                    sentence_index++
                    token_index = 1
                    output.append(
                        tokenOffsetsInSentence(
                            sentences, docId, sentence_index, real_token_index, tokens
                        )
                    )
                }
                if (features != null) {
                    // Emit (and consume) every attribute comment that starts at an offset
                    // between the previous token and this one.
                    for (i in previousSpanStart until span.from + 1) {
                        features.remove("$i")?.let { comment -> output.append(comment) }
                    }
                    previousSpanStart = span.from + 1
                }
                val form = checkNotNull(text) { "No text available for $docId" }.substring(span.from, span.to)
                val mfs = morphoSpans?.get("${span.from}-${span.to}")
                if (mfs != null) {
                    output.append(
                        printConlluToken(
                            token_index,
                            form,
                            mfs.lemma ?: "_",
                            mfs.upos ?: "_",
                            mfs.xpos ?: "_",
                            mfs.feats ?: "_",
                            mfs.head ?: "_",
                            mfs.deprel ?: "_",
                            mfs.deps ?: "_",
                            mfs.misc ?: "_",
                            columns
                        )
                    )
                } else {
                    output.append(
                        printConlluToken(
                            token_index, form, columns = columns
                        )
                    )
                }
                real_token_index++
            }
        }

        if (annotateWith != "") {
            workerPool?.pushToQueue(output.append("\n# eot\n").toString())
        } else {
            // Serialize printing so documents emitted by different threads do not interleave.
            synchronized(System.out) {
                println(output.toString())
            }
        }

        // The document is done: release everything that was collected for it.
        arrayOf(tokens, texts, sentences, morpho, fnames, metadata, extraFeatures).forEach { map ->
            map.remove(docId)
        }
    }


    /**
     * Formats one token as a CoNLL-U line terminated by a newline.
     *
     * With `columns == 1` only the token itself is written. Otherwise the first
     * `min(columns, 10)` of the ten fields are written tab separated. In compatibility mode
     * the xpos value is used for the upos column when upos is "_".
     *
     * @param token_index number of the token within its sentence
     * @param token surface form of the token
     * @param columns number of columns to write
     * @return the formatted line including the trailing newline
     */
    private fun printConlluToken(
        token_index: Int,
        token: String,
        lemma: String = "_",
        upos: String = "_",
        xpos: String = "_",
        feats: String = "_",
        head: String = "_",
        deprel: String = "_",
        deps: String = "_",
        misc: String = "_",
        columns: Int = 10
    ): String {
        if (columns == 1) {
            return "$token\n"
        }
        val myUpos = if (COMPATIBILITY_MODE && upos == "_") xpos else upos
        val fields = listOf(token_index, token, lemma, myUpos, xpos, feats, head, deprel, deps, misc)
        // A non-positive column count selects no field at all, leaving just the newline.
        val width = min(columns, 10).coerceAtLeast(0)
        return fields.take(width).joinToString("\t", postfix = "\n")
    }

    /**
     * Builds the `# start_offsets` / `# end_offsets` comment lines for one sentence.
     *
     * The start line lists the start offset of the token at [token_index] followed by the
     * start offsets of all tokens from that index on that end within the sentence; the end
     * line lists the sentence end offset followed by the end offsets of the same tokens.
     *
     * @param sentences sentence spans per document
     * @param docId id of the document
     * @param sentence_index index of the sentence within the document
     * @param token_index index of the first token of the sentence within the document
     * @param tokens token spans per document
     * @return the two comment lines, or an empty string when the document has no such sentence
     *         or no token at [token_index]
     */
    private fun tokenOffsetsInSentence(
        sentences: ConcurrentHashMap<String, Array<Span>>,
        docId: String,
        sentence_index: Int,
        token_index: Int,
        tokens: ConcurrentHashMap<String, Array<Span>>
    ): String {
        val sentenceSpans = sentences[docId]
        if (sentenceSpans == null || sentenceSpans.size <= sentence_index) {
            return ""
        }
        // Without a token at token_index there is no start offset to report; previously this
        // case failed with a NullPointerException or an index error.
        val tokenSpans = tokens[docId]
        if (tokenSpans == null || token_index !in tokenSpans.indices) {
            return ""
        }
        val sentenceEndOffset = sentenceSpans[sentence_index].to
        val start_offsets_string = StringBuilder()
        val end_offsets_string = StringBuilder()
        var i = token_index
        while (i < tokenSpans.size && tokenSpans[i].to <= sentenceEndOffset) {
            start_offsets_string.append(' ').append(tokenSpans[i].from)
            end_offsets_string.append(' ').append(tokenSpans[i].to)
            i++
        }
        return "# start_offsets = ${tokenSpans[token_index].from}${start_offsets_string}\n" +
                "# end_offsets = ${sentenceEndOffset}${end_offsets_string}\n"
    }

    /**
     * Converts the element nodes of a node list into spans.
     *
     * @param spans nodes carrying integer `from` and `to` attributes
     * @return one span per element node, in document order
     */
    private fun extractSpans(spans: NodeList): Array<Span> {
        val result = ArrayList<Span>()
        for (index in 0 until spans.length) {
            val element = spans.item(index) as? Element ?: continue
            val from = element.getAttribute("from").toInt()
            val to = element.getAttribute("to").toInt()
            result.add(Span(from, to))
        }
        return result.toTypedArray()
    }

    /**
     * Collects the morphological annotations of all spans that are not of type "alt".
     *
     * For every span the non-empty `<f>` features are inspected. The first value seen for
     * lemma, xpos (also `ctag`/`pos`), feats (also `msd`/`type`) and misc (`certainty`) wins,
     * while a later `upos` value replaces an earlier one. "UNKNOWN" and "<unknown>" are
     * replaced by "--" in lemma and xpos values.
     *
     * @param fsSpans the `<span>` nodes of a morpho.xml document
     * @return annotations keyed by "<from>-<to>"; a later span with the same key replaces an
     *         earlier one
     */
    private fun extractMorphoSpans(
        fsSpans: NodeList
    ): MutableMap<String, MorphoSpan> {
        val UNKNOWN = Regex("(UNKNOWN|<unknown>)")
        val res: MutableMap<String, MorphoSpan> = HashMap()
        for (spanIndex in 0 until fsSpans.length) {
            val node = fsSpans.item(spanIndex) as? Element ?: continue
            if (node.getAttribute("type") == "alt") continue
            val features = node.getElementsByTagName("f")
            val fs = MorphoSpan()
            val fromTo = "${node.getAttribute("from")}-${node.getAttribute("to")}"
            for (featureIndex in 0 until features.length) {
                val feature = features.item(featureIndex) as Element
                val value = feature.textContent.trim()
                if (value.isEmpty()) continue
                when (feature.getAttribute("name")) {
                    "lemma" -> if (fs.lemma == "_") fs.lemma = value.replace(UNKNOWN, "--")
                    "upos" -> fs.upos = value
                    "xpos", "ctag", "pos" -> if (fs.xpos == "_") fs.xpos = value.replace(UNKNOWN, "--")
                    "feats", "msd" -> if (fs.feats == "_") fs.feats = value
                    "type" -> if (fs.feats == "_") {
                        // A feature without a <symbol value="..."> child is skipped instead of
                        // aborting the whole document with a NullPointerException.
                        val symbol = feature.getElementsByTagName("symbol").item(0) as? Element
                        val symbolValue = symbol?.getAttributeNode("value")?.value?.trim()
                        if (symbolValue != null) fs.feats = symbolValue
                    }
                    "certainty" -> if (fs.misc == "_") fs.misc = value
                }
            }
            res[fromTo] = fs
        }
        return res
    }

    /**
     * Selects the sentence spans from the spans of a structure.xml document.
     *
     * A span counts as a sentence when the text content of its first `<f>` descendant is
     * exactly "s". Spans without any `<f>` descendant are ignored.
     *
     * @param spans the `<span>` nodes of a structure.xml document
     * @return the sentence spans in document order
     */
    private fun extractSentenceSpans(spans: NodeList): Array<Span> {
        val sentenceSpans = ArrayList<Span>()
        for (index in 0 until spans.length) {
            val element = spans.item(index) as? Element ?: continue
            // item(0) is null when the span has no <f> descendant; such a span is not a sentence.
            val firstFeature = element.getElementsByTagName("f").item(0)
            if (firstFeature?.textContent != "s") continue
            sentenceSpans.add(
                Span(element.getAttribute("from").toInt(), element.getAttribute("to").toInt())
            )
        }
        return sentenceSpans.toTypedArray()
    }

    /*
     <span id="s15" from="370" to="394" l="5">
      <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0">
        <f name="name">posting</f>
        <f name="attr">
          <fs type="attr">
            <f name="id">i.10894_1_3</f>
            <f name="indentLevel">0</f>
            <f name="who">WU00000000</f>
          </fs>
        </f>
      </fs>
    </span>

     */
    /**
     * Builds the attribute comment lines requested with `--extract-attributes-regex`.
     *
     * Only spans whose first two `<f>` descendants are named "name" and "attr" are considered.
     * For each `<f>` below the "attr" feature, "<element name>/<attribute name>" is matched
     * as a whole against the configured regex; every match yields one line
     * `# <element name>/<attribute name> = <value>`.
     *
     * @param spans the `<span>` nodes of a structure.xml document
     * @return comment lines keyed by the `from` attribute of the span they belong to; lines of
     *         several spans with the same start offset are concatenated
     */
    private fun extractMiscSpans(spans: NodeList): MutableMap<String, String> {
        val miscLocal: MutableMap<String, String> = HashMap()
        // Compiled at most once per call, and only when an attribute actually has to be tested.
        val attributePattern by lazy { Regex(extractAttributesRegex) }

        for (index in 0 until spans.length) {
            val node = spans.item(index) as? Element ?: continue
            val fields = node.getElementsByTagName("f")
            if (fields.length < 2) continue
            val nameField = fields.item(0) as Element
            val attrField = fields.item(1) as Element
            if (nameField.getAttribute("name") != "name" || attrField.getAttribute("name") != "attr") continue

            val elementName = nameField.textContent.trim()
            val from = node.getAttribute("from")
            val attributes = attrField.getElementsByTagName("f")
            val res = StringBuilder()
            for (attrIndex in 0 until attributes.length) {
                val attr = attributes.item(attrIndex) as Element
                val attrName = "$elementName/${attr.getAttribute("name")}"
                if (attrName.matches(attributePattern)) {
                    res.append("# $attrName = ${attr.textContent}\n")
                }
            }
            if (res.isNotEmpty()) {
                // Append to lines already collected for the same start offset.
                miscLocal[from] = (miscLocal[from] ?: "") + res.toString()
            }
        }
        return miscLocal
    }


    /**
     * A stretch of text given by two integer offsets, read from the `from` and `to`
     * attributes of a `<span>` element; both are used as `substring` bounds on the document text.
     */
    class Span(var from: Int, var to: Int)

    /**
     * Morphological annotation of one token; the fields are passed to printConlluToken in
     * this order after the token itself. Every field defaults to "_", the value that marks
     * a field as not yet set in extractMorphoSpans.
     */
    class MorphoSpan(
        var lemma: String? = "_",
        var upos: String? = "_",
        var xpos: String? = "_",
        var feats: String? = "_",
        var head: String? = "_",
        var deprel: String? = "_",
        var deps: String? = "_",
        var misc: String? = "_"
    )

}

/**
 * Program entry point: runs the command with the given arguments and terminates the JVM
 * with the exit code reported by picocli.
 */
fun main(args: Array<String>) {
    val exitCode = CommandLine(KorapXml2Conllu()).execute(*args)
    exitProcess(exitCode)
}

/**
 * Runs the command like [main] but returns the exit code to the caller instead of
 * terminating the JVM.
 */
fun debug(args: Array<String>): Int = CommandLine(KorapXml2Conllu()).execute(*args)
