Add --parse-with malt:<path/to/model> option

Change-Id: I3d0691241b3b4df0abd3c9f02a1f29d23bdd030f
diff --git a/app/build.gradle b/app/build.gradle
index c42b4ba..7c2b5e1 100644
--- a/app/build.gradle
+++ b/app/build.gradle
@@ -45,6 +45,7 @@
     testImplementation "org.jetbrains.kotlin:kotlin-test:2.0.0-Beta5"
 
     implementation 'com.github.kupietz:cistern:v1.0.2'
+    implementation 'org.maltparser:maltparser:1.9.2'
 }
 
 
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
index 3354211..cfa19e7 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
@@ -1,24 +1,31 @@
 package de.ids_mannheim.korapxmltools
 
-import kotlinx.coroutines.CoroutineScope
-import kotlinx.coroutines.Dispatchers
 import marmot.morph.MorphTagger
 import marmot.morph.Sentence
 import marmot.morph.Word
 import marmot.util.FileUtils
+import org.maltparser.MaltParserService
+import org.maltparser.core.exception.MaltChainedException
+import org.maltparser.core.syntaxgraph.DependencyStructure
 import java.util.logging.Logger
-import kotlin.jvm.Throws
 
-abstract class AnnotationToolBridge {
-    abstract val model: String
-    abstract val logger: Logger
+interface AnnotationToolBridge { // contract shared by the tagger and parser bridges declared in this file
+    val model: String // model path or name handed to the underlying tool
+    val logger: Logger // logger the bridge reports to
 
     @Throws(java.lang.ArrayIndexOutOfBoundsException::class, java.lang.Exception::class)
-    abstract fun tagSentence(sentenceTokens: MutableList<String>, sentenceTokenOffsets: MutableList<String>, morphoMap: MutableMap<String, KorapXml2Conllu.MorphoSpan>?)
+    fun tagSentence( // annotates one sentence and writes its results into morphoMap
+        sentenceTokens: MutableList<String>, // one entry per token of the sentence
+        sentenceTokenOffsets: MutableList<String>, // morphoMap key of the token at the same index
+        morphoMap: MutableMap<String, KorapXml2Conllu.MorphoSpan>? // annotation map updated in place; may be null
+    )
+}
 
-    fun tagText(tokens: Array<KorapXml2Conllu.Span>, sentenceSpans: Array<KorapXml2Conllu.Span>?,  text: String): MutableMap<String, KorapXml2Conllu.MorphoSpan> {
-        val coroutineScope = CoroutineScope(Dispatchers.Default)
+abstract class TaggerToolBridge : AnnotationToolBridge {
 
+    fun tagText(
+        tokens: Array<KorapXml2Conllu.Span>, sentenceSpans: Array<KorapXml2Conllu.Span>?, text: String
+    ): MutableMap<String, KorapXml2Conllu.MorphoSpan> {
         val sentence_tokens = mutableListOf<String>()
         val sentence_token_offsets = mutableListOf<String>()
         val morphoMap = mutableMapOf<String, KorapXml2Conllu.MorphoSpan>()
@@ -48,22 +55,118 @@
     }
 }
 
+/** Base class for dependency-parser bridges; [parseText] hands each sentence to [tagSentence]. */
+abstract class ParserToolBridge : AnnotationToolBridge {
+    /**
+     * Parses [text] sentence by sentence and returns the annotation map updated by [tagSentence].
+     * A null [morpho] is replaced by a fresh map; once [sentenceSpans] is exhausted (or null) the rest is one sentence.
+     */
+    fun parseText(
+        tokens: Array<KorapXml2Conllu.Span>,
+        morpho: MutableMap<String, KorapXml2Conllu.MorphoSpan>?,
+        sentenceSpans: Array<KorapXml2Conllu.Span>?,
+        text: String
+    ): MutableMap<String, KorapXml2Conllu.MorphoSpan> {
+        val annotations = morpho ?: mutableMapOf()
+        val rows = mutableListOf<String>()
+        val offsets = mutableListOf<String>()
+        var sentenceIndex = 0
+        tokens.forEach { span ->
+            if (span.from >= (sentenceSpans?.getOrNull(sentenceIndex)?.to ?: Int.MAX_VALUE)) {
+                parseSentence(rows, offsets, annotations)
+                rows.clear()
+                offsets.clear()
+                sentenceIndex++
+            }
+            val key = "${span.from}-${span.to}"
+            val tag = annotations[key]
+            val xpos = tag?.xpos ?: "_"
+            // Ten tab-separated columns; the token ID restarts at 1 in every sentence, xpos fills columns 4 and 5.
+            rows.add("${rows.size + 1}\t${text.substring(span.from, span.to)}\t_\t$xpos\t$xpos\t${tag?.feats ?: "_"}\t_\t_\t_\t_")
+            offsets.add(key)
+        }
+        parseSentence(rows, offsets, annotations)
+        return annotations
+    }
+
+    /** Runs [tagSentence] on a non-empty sentence; an index error is logged so the remaining sentences still get parsed. */
+    private fun parseSentence(rows: MutableList<String>, offsets: MutableList<String>, annotations: MutableMap<String, KorapXml2Conllu.MorphoSpan>) {
+        if (rows.isEmpty()) return
+        try {
+            tagSentence(rows, offsets, annotations)
+        } catch (e: ArrayIndexOutOfBoundsException) {
+            logger.warning("Tagging failed: ${e.message} ${e.stackTraceToString()} ${rows.joinToString(" ")}")
+        }
+    }
+}
+
 
 class AnnotationToolBridgeFactory {
     companion object {
-        fun getAnnotationToolBridge(taggerName: String, taggerModel: String, LOGGER: Logger): AnnotationToolBridge? {
-            if (taggerName == "marmot") {
-                return MarmotBridge(taggerModel, LOGGER)
-            } else {
-                LOGGER.warning("Unknown tagger $taggerName")
-                return null
+        // Foundry names accepted by --tag-with and --parse-with, respectively.
+        const val taggerFoundries = "marmot"
+        const val parserFoundries = "malt"
+
+        /** Returns the bridge registered for [foundry], loaded with [model]; unknown names are logged and yield null. */
+        fun getAnnotationToolBridge(foundry: String, model: String, LOGGER: Logger): AnnotationToolBridge? =
+            when (foundry) {
+                "marmot" -> MarmotBridge(model, LOGGER)
+                "malt" -> MaltParserBridge(model, LOGGER)
+                else -> null.also { LOGGER.severe("Unknown tagger $foundry") }
+            }
+    }
+}
+
+class MaltParserBridge(override val model: String, override val logger: Logger) : ParserToolBridge() { // ParserToolBridge backed by a MaltParserService
+    companion object {
+        fun getFoundry(): String { // same name the factory's `when` matches for this bridge
+            return "malt"
+        }
+    }
+
+    val tagger: MaltParserService // the parser service, despite the name
+
+    init {
+        logger.info("Initializing MaltParser with model $model")
+        tagger = MaltParserService()
+        if (model.contains("/")) { // model given with a directory part: split it into directory and model name
+            val dirName = model.substringBeforeLast("/")
+            val modelName = model.substringAfterLast("/")
+            logger.info("Loading model $modelName from $dirName")
+            tagger.initializeParserModel("-w $dirName -c $modelName -m parse") // presumably -w = working dir, -c = model name, -m = mode; confirm in MaltParser options
+        } else {
+            tagger.initializeParserModel("-c $model -m parse")
+        }
+        logger.info("Model $model loaded")
+    }
+
+
+    @Throws(MaltChainedException::class)
+    override fun tagSentence( // parses one sentence and stores head and deprel for its tokens in morpho
+        sentenceTokens: MutableList<String>, // tab-separated token rows as built by parseText
+        sentenceTokenOffsets: MutableList<String>, // morpho key of the token at the same position
+        morpho: MutableMap<String, KorapXml2Conllu.MorphoSpan>? // updated in place; nothing is stored when null
+    ) {
+        val result = tagger.parse(sentenceTokens.toTypedArray())
+
+        (result as DependencyStructure).edges.forEach { edge ->
+            val from = edge.source.index // index of the edge's source node; stored below as the head
+            val head = edge.target.index // NOTE(review): despite the name, this indexes the token that receives the head
+            val label = edge.toString()
+            if (label.contains("DEPREL:")) { // only edges whose string form carries a DEPREL label are stored
+                val rel = edge.toString().substringAfter("DEPREL:")
+                val old = morpho?.get(sentenceTokenOffsets[head - 1]) // node indices are one higher than list positions (presumably 0 is the root; TODO confirm)
+                morpho?.set(
+                    sentenceTokenOffsets[head - 1], KorapXml2Conllu.MorphoSpan( // keeps existing lemma/xpos/feats, adds head and deprel
+                        lemma = old?.lemma, xpos = old?.xpos, feats = old?.feats, head = from.toString(), deprel = rel
+                    )
+                )
             }
         }
     }
 }
 
-class MarmotBridge(override val model: String, override val logger: Logger) : AnnotationToolBridge() {
-
+class MarmotBridge(override val model: String, override val logger: Logger) : TaggerToolBridge() {
     val tagger: MorphTagger
 
     init {
@@ -84,8 +187,7 @@
         result = tagger.tag(sentence)  // LOGGER.info("Marmot tagger finished")// return
         for (i in 0 until result.size) {
             val taggedWord = KorapXml2Conllu.MorphoSpan(
-                xpos = result[i][0].split("|")[0],
-                feats = result[i][1]
+                xpos = result[i][0].split("|")[0], feats = result[i][1]
             )
             morphoMap?.set(sentenceTokenOffsets[i], taggedWord)
         }
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
index 5bf3fc5..a7ad96a 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
@@ -1,5 +1,7 @@
 package de.ids_mannheim.korapxmltools
 
+import de.ids_mannheim.korapxmltools.AnnotationToolBridgeFactory.Companion.parserFoundries
+import de.ids_mannheim.korapxmltools.AnnotationToolBridgeFactory.Companion.taggerFoundries
 import org.w3c.dom.Document
 import org.w3c.dom.Element
 import org.w3c.dom.NodeList
@@ -23,9 +25,9 @@
 import java.util.regex.Pattern
 import java.util.stream.IntStream
 import java.util.zip.ZipFile
+import javax.swing.text.html.parser.Parser
 import javax.xml.parsers.DocumentBuilder
 import javax.xml.parsers.DocumentBuilderFactory
-import kotlin.math.max
 import kotlin.math.min
 import kotlin.system.exitProcess
 
@@ -134,16 +136,16 @@
     @Option(
         names = ["--tag-with", "-t"],
         paramLabel = "TAGGER:MODEL",
-        description = ["Specify a tagger and a model: marmot:<path/to/model>."]
+        description = ["Specify a tagger and a model: ${taggerFoundries}:<path/to/model>."]
     )
     fun setTagWith(tagWith: String) {
         if (tagWith != null) {
-            val pattern: Pattern = Pattern.compile("(marmot):(.+)")
+            val pattern: Pattern = Pattern.compile("(${taggerFoundries}):(.+)")
             val matcher: Matcher = pattern.matcher(tagWith)
             if (!matcher.matches()) {
                 throw ParameterException(spec.commandLine(),
                     String.format("Invalid value `%s' for option '--tag-with': "+
-                        "value does not match the expected pattern marmot:<path/to/model>", tagWith))
+                        "value does not match the expected pattern ${taggerFoundries}:<path/to/model>", tagWith))
             } else {
                 taggerName = matcher.group(1)
                 taggerModel = matcher.group(2)
@@ -156,6 +158,33 @@
         }
     }
 
+    private var parserName: String? = null
+    private var parserModel: String? = null
+    /** Validates `--parse-with` (`parser:model`) and stores its parts; throws [ParameterException] on a bad pattern or missing model file. */
+    @Option(
+        names = ["--parse-with", "-P"],
+        paramLabel = "PARSER:MODEL",
+        description = ["Specify a parser and a model: ${parserFoundries}:<path/to/model>."]
+    )
+    fun setParseWith(parseWith: String) {
+        // The parser name must be one listed in parserFoundries; everything after the first colon is the model path.
+        val matcher: Matcher = Pattern.compile("(${parserFoundries}):(.+)").matcher(parseWith)
+        if (!matcher.matches()) {
+            throw ParameterException(spec.commandLine(),
+                String.format("Invalid value `%s' for option '--parse-with': "+
+                        "value does not match the expected pattern ${parserFoundries}:<path/to/model>", parseWith))
+        }
+        val modelPath = matcher.group(2)
+        // Reject a missing model here, before any parser bridge is created from these settings.
+        if (!File(modelPath).exists()) {
+            throw ParameterException(spec.commandLine(),
+                String.format("Invalid value for option '--parse-with': "+
+                        "model file '%s' does not exist", modelPath))
+        }
+        parserName = matcher.group(1)
+        parserModel = modelPath
+    }
+
 
     override fun call(): Int {
         val handler = ConsoleHandler()
@@ -191,9 +220,10 @@
     val metadata: ConcurrentHashMap<String, Array<String>> = ConcurrentHashMap()
     val extraFeatures: ConcurrentHashMap<String, MutableMap<String, String>> = ConcurrentHashMap()
     var waitForMorpho: Boolean = false
-    var annotationToolBridges: ConcurrentHashMap<Long, AnnotationToolBridge?> = ConcurrentHashMap()
+    var taggerToolBridges: ConcurrentHashMap<Long, TaggerToolBridge?> = ConcurrentHashMap()
+    var parserToolBridges: ConcurrentHashMap<Long, ParserToolBridge?> = ConcurrentHashMap()
     fun korapxml2conllu(args: Array<String>) {
-        val executor: ExecutorService = Executors.newFixedThreadPool(maxThreads)
+        Executors.newFixedThreadPool(maxThreads)
 
         if (annotateWith.isNotEmpty()) {
             annotationWorkerPool = AnnotationWorkerPool(annotateWith, maxThreads, LOGGER)
@@ -274,9 +304,13 @@
 
     fun processZipEntry(zipFile: ZipFile, foundry: String, zipEntry: java.util.zip.ZipEntry) {
         LOGGER.info("Processing ${zipEntry.name} in thread ${Thread.currentThread().id}")
-        if (taggerName != null && !annotationToolBridges.containsKey(Thread.currentThread().id)) {
-            annotationToolBridges[Thread.currentThread().id] =
-                AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER)
+        if (taggerName != null && !taggerToolBridges.containsKey(Thread.currentThread().id)) {
+            taggerToolBridges[Thread.currentThread().id] =
+                AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER) as TaggerToolBridge?
+        }
+        if (parserName != null && !parserToolBridges.containsKey(Thread.currentThread().id)) {
+            parserToolBridges[Thread.currentThread().id] =
+                AnnotationToolBridgeFactory.getAnnotationToolBridge(parserName!!, parserModel!!, LOGGER) as ParserToolBridge?
         }
 
         try {
@@ -405,8 +439,11 @@
                 output.append(metadata[docId]?.joinToString("\t", prefix = "# metadata=", postfix = "\n") ?: "")
             }
             var previousSpanStart = 0
-            if (annotationToolBridges[Thread.currentThread().id] != null) {
-                morpho[docId] = annotationToolBridges[Thread.currentThread().id]!!.tagText(tokens[docId]!!, sentences[docId], texts[docId]!!)
+            if (taggerToolBridges[Thread.currentThread().id] != null) {
+                morpho[docId] = taggerToolBridges[Thread.currentThread().id]!!.tagText(tokens[docId]!!, sentences[docId], texts[docId]!!)
+                if (parserToolBridges[Thread.currentThread().id] != null) {
+                    morpho[docId] = parserToolBridges[Thread.currentThread().id]!!.parseText(tokens[docId]!!, morpho[docId], sentences[docId], texts[docId]!!)
+                }
             }
             tokens[docId]?.forEach { span ->
                 token_index++