Provide a direct bridge to Java-based annotation tools

Change-Id: Ie8c1a66310229f454da1e45b50ecc689d07790cf
diff --git a/app/build.gradle b/app/build.gradle
index 7337882..31cfdc9 100644
--- a/app/build.gradle
+++ b/app/build.gradle
@@ -44,6 +44,9 @@
     // Use the Kotlin JUnit integration.
     testImplementation 'org.jetbrains.kotlin:kotlin-test-junit'
     testImplementation "org.jetbrains.kotlin:kotlin-test:2.0.0-Beta4"
+
+    implementation fileTree(dir: '../libs', include: ['*.jar'])
+    //implementation file('libs/marmot.jar')
 }
 
 
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
new file mode 100644
index 0000000..6c83e08
--- /dev/null
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
@@ -0,0 +1,89 @@
+package de.ids_mannheim.korapxmltools
+
+import kotlinx.coroutines.CoroutineScope
+import kotlinx.coroutines.Dispatchers
+import marmot.morph.MorphTagger
+import marmot.morph.Sentence
+import marmot.morph.Word
+import marmot.util.FileUtils
+import java.util.logging.Logger
+import kotlin.jvm.Throws
+
+/** Base class for annotation tools that tag a tokenized text sentence by sentence. */
+abstract class AnnotationToolBridge {
+    abstract val logger: Logger
+
+    /** Tags one sentence; implementations are expected to record their results in [morphoMap]. */
+    @Throws(java.lang.ArrayIndexOutOfBoundsException::class, java.lang.Exception::class)
+    abstract fun tagSentence(sentenceTokens: MutableList<String>, sentenceTokenOffsets: MutableList<String>, morphoMap: MutableMap<String, KorapXml2Conllu.MorphoSpan>?)
+
+    /** Tags [tokens] sentence by sentence, cutting at [sentenceSpans]; when those are null or used up, the
+     *  remaining tokens form one sentence. Returns the map that was handed to [tagSentence] for filling. */
+    fun tagText(tokens: Array<KorapXml2Conllu.Span>, sentenceSpans: Array<KorapXml2Conllu.Span>?,  text: String): MutableMap<String, KorapXml2Conllu.MorphoSpan> {
+        val sentenceTokens = mutableListOf<String>()
+        val sentenceTokenOffsets = mutableListOf<String>()
+        val morphoMap = mutableMapOf<String, KorapXml2Conllu.MorphoSpan>()
+        var sentenceIndex = 0
+        // Tags the buffered sentence (if any) and empties the buffers; a failure only costs that sentence.
+        fun flush() {
+            try {
+                if (sentenceTokens.isNotEmpty()) tagSentence(sentenceTokens, sentenceTokenOffsets, morphoMap)
+            } catch (e: ArrayIndexOutOfBoundsException) {
+                logger.warning("Tagging failed: ${e.message} ${e.stackTraceToString()} ${sentenceTokens.joinToString(" ")}")
+            }
+            sentenceTokens.clear()
+            sentenceTokenOffsets.clear()
+        }
+        tokens.forEach { span ->
+            if (span.from >= (sentenceSpans?.getOrNull(sentenceIndex)?.to ?: Int.MAX_VALUE)) { // getOrNull: no throw past the last span
+                flush()
+                sentenceIndex++
+            }
+            sentenceTokens.add(text.substring(span.from, span.to))
+            sentenceTokenOffsets.add("${span.from}-${span.to}")
+        }
+        flush()
+        return morphoMap
+    }
+}
+
+
+/** Factory for [AnnotationToolBridge] instances. */
+class AnnotationToolBridgeFactory {
+    companion object {
+        // NOTE: annotateWith is not evaluated here; a MarmotBridge is returned for every value.
+        fun getAnnotationToolBridge(annotateWith: String, LOGGER: Logger): AnnotationToolBridge? = MarmotBridge(LOGGER)
+    }
+}
+
+/**
+ * [AnnotationToolBridge] that tags sentences with a MarMoT [MorphTagger].
+ * @param logger receives the model-loading messages
+ * @param modelPath MarMoT model file to load; defaults to the location that used to be hard-coded
+ */
+class MarmotBridge(
+    override val logger: Logger,
+    modelPath: String = "/home/kupietz/KorAP/korapxml2conllu/libs/de.marmot",
+) : AnnotationToolBridge() {
+    val tagger: MorphTagger
+
+    init {
+        logger.info("Initializing MarMoT with model $modelPath")
+        tagger = FileUtils.loadFromFile(modelPath)
+        logger.info("Model $modelPath loaded")
+    }
+
+    /** Tags [sentenceTokens] and stores one MorphoSpan per token in [morphoMap] under the matching entry of [sentenceTokenOffsets]. */
+    @Throws(java.lang.ArrayIndexOutOfBoundsException::class, java.lang.Exception::class)
+    override fun tagSentence(
+        sentenceTokens: MutableList<String>,
+        sentenceTokenOffsets: MutableList<String>,
+        morphoMap: MutableMap<String, KorapXml2Conllu.MorphoSpan>?
+    ) {
+        val tagged: List<List<String>> = tagger.tag(Sentence(sentenceTokens.map { Word(it) }))
+        tagged.forEachIndexed { i, columns ->
+            // Column 0 is used as xpos (only the part before the first '|'), column 1 as feats.
+            morphoMap?.set(sentenceTokenOffsets[i], KorapXml2Conllu.MorphoSpan(xpos = columns[0].split("|")[0], feats = columns[1]))
+        }
+    }
+}
\ No newline at end of file
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
index 638cae0..f3a95e4 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
@@ -38,6 +38,7 @@
 
 class KorapXml2Conllu : Callable<Int> {
     val COMPATIBILITY_MODE = System.getenv("COMPATIBILITY_MODE") != null
+    val marmotBridge = null
 
     @Parameters(arity = "1..*", description = ["At least one zip file name"])
     var zipFileNames: Array<String>? = null
@@ -151,12 +152,17 @@
     val metadata: ConcurrentHashMap<String, Array<String>> = ConcurrentHashMap()
     val extraFeatures: ConcurrentHashMap<String, MutableMap<String, String>> = ConcurrentHashMap()
     var waitForMorpho: Boolean = false
-
+    var annotationToolBridge: AnnotationToolBridge? = null
     fun korapxml2conllu(args: Array<String>) {
         val executor: ExecutorService = Executors.newFixedThreadPool(threads)
 
-        if (annotateWith != "") {
-            annotationWorkerPool = AnnotationWorkerPool(annotateWith, threads, LOGGER)
+        if (annotateWith.isNotEmpty()) {
+            if (annotateWith.contains(".jar")) {
+                LOGGER.info("Annotating with jar file: $annotateWith")
+                annotationToolBridge = AnnotationToolBridgeFactory.getAnnotationToolBridge(annotateWith, LOGGER)
+            } else {
+                annotationWorkerPool = AnnotationWorkerPool(annotateWith, threads, LOGGER)
+            }
         }
 
         var zips: Array<String> = args
@@ -191,7 +197,7 @@
                 true
             )
         }
-        if (annotateWith.isNotEmpty()) {
+        if (annotationWorkerPool != null) {
             LOGGER.info("closing worker pool")
             annotationWorkerPool?.close()
         }
@@ -363,6 +369,9 @@
                 output.append(metadata[docId]?.joinToString("\t", prefix = "# metadata=", postfix = "\n") ?: "")
             }
             var previousSpanStart = 0
+            if (annotationToolBridge != null) {
+                morpho[docId] = annotationToolBridge!!.tagText(tokens[docId]!!, sentences[docId], texts[docId]!!)
+            }
             tokens[docId]?.forEach { span ->
                 token_index++
                 if (span.from >= sentences[docId]!![sentence_index].to) {
@@ -384,7 +393,7 @@
                     }
                     previousSpanStart = span.from+1
                 }
-                if (waitForMorpho && morpho[docId]?.containsKey("${span.from}-${span.to}") == true) {
+                if (morpho[docId]?.containsKey("${span.from}-${span.to}") == true) {
                     val mfs = morpho[docId]!!["${span.from}-${span.to}"]
 
                     output.append(
@@ -413,7 +422,7 @@
             }
         }
 
-        if (annotateWith != "") {
+        if (annotationWorkerPool != null) {
             annotationWorkerPool?.pushToQueue(output.append("\n# eot\n").toString())
         } else {
             synchronized(System.out) {
diff --git a/build.gradle b/build.gradle
index e69de29..f593f80 100644
--- a/build.gradle
+++ b/build.gradle
@@ -0,0 +1,5 @@
+repositories {
+    flatDir { // flat-directory repository: artifacts are looked up in the "libs" directory
+        dirs("libs")
+    }
+}