Provide a direct bridge to Java-based annotation tools
Change-Id: Ie8c1a66310229f454da1e45b50ecc689d07790cf
diff --git a/app/build.gradle b/app/build.gradle
index 7337882..31cfdc9 100644
--- a/app/build.gradle
+++ b/app/build.gradle
@@ -44,6 +44,9 @@
// Use the Kotlin JUnit integration.
testImplementation 'org.jetbrains.kotlin:kotlin-test-junit'
testImplementation "org.jetbrains.kotlin:kotlin-test:2.0.0-Beta4"
+
+ implementation fileTree(dir: '../libs', include: ['*.jar'])
+ //implementation file('libs/marmot.jar')
}
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
new file mode 100644
index 0000000..6c83e08
--- /dev/null
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
@@ -0,0 +1,89 @@
+package de.ids_mannheim.korapxmltools
+
+import kotlinx.coroutines.CoroutineScope
+import kotlinx.coroutines.Dispatchers
+import marmot.morph.MorphTagger
+import marmot.morph.Sentence
+import marmot.morph.Word
+import marmot.util.FileUtils
+import java.util.logging.Logger
+import kotlin.jvm.Throws
+
+/** Base class for Java-based annotation tools that are called directly and tag a text sentence by sentence. */
+abstract class AnnotationToolBridge {
+    abstract val logger: Logger
+    /** Tags one sentence; implementations are expected to store results in [morphoMap] keyed by [sentenceTokenOffsets]. */
+    @Throws(java.lang.ArrayIndexOutOfBoundsException::class, java.lang.Exception::class)
+    abstract fun tagSentence(sentenceTokens: MutableList<String>, sentenceTokenOffsets: MutableList<String>, morphoMap: MutableMap<String, KorapXml2Conllu.MorphoSpan>?)
+
+    /**
+     * Groups [tokens] into sentences along [sentenceSpans], tags each sentence with [tagSentence] and returns the
+     * results keyed by "from-to" offsets; a sentence failing with [ArrayIndexOutOfBoundsException] is logged and skipped.
+     */
+    fun tagText(tokens: Array<KorapXml2Conllu.Span>, sentenceSpans: Array<KorapXml2Conllu.Span>?, text: String): MutableMap<String, KorapXml2Conllu.MorphoSpan> {
+        val sentenceTokens = mutableListOf<String>()
+        val sentenceTokenOffsets = mutableListOf<String>()
+        val morphoMap = mutableMapOf<String, KorapXml2Conllu.MorphoSpan>()
+        var sentenceIndex = 0
+        fun flushSentence() {
+            try {
+                if (sentenceTokens.isNotEmpty()) tagSentence(sentenceTokens, sentenceTokenOffsets, morphoMap)
+            } catch (e: ArrayIndexOutOfBoundsException) {
+                logger.warning("Tagging failed: ${e.message} ${e.stackTraceToString()} ${sentenceTokens.joinToString(" ")}")
+            }
+            sentenceTokens.clear()
+            sentenceTokenOffsets.clear()
+        }
+        tokens.forEach { span ->
+            if (span.from >= (sentenceSpans?.getOrNull(sentenceIndex)?.to ?: Int.MAX_VALUE)) { // no span left: never split
+                flushSentence()
+                sentenceIndex++
+            }
+            sentenceTokens.add(text.substring(span.from, span.to))
+            sentenceTokenOffsets.add("${span.from}-${span.to}")
+        }
+        flushSentence()
+        return morphoMap
+    }
+}
+
+
+/** Factory for [AnnotationToolBridge] instances. */
+class AnnotationToolBridgeFactory {
+    companion object {
+        /** Returns a new [MarmotBridge] that logs to [LOGGER]; [annotateWith] is currently not evaluated. */
+        fun getAnnotationToolBridge(annotateWith: String, LOGGER: Logger): AnnotationToolBridge? = MarmotBridge(LOGGER)
+    }
+}
+
+/**
+ * [AnnotationToolBridge] that tags sentences with a MarMoT [MorphTagger] loaded from a model file.
+ * @param model path of the model file to load; the default is the path that used to be hard-coded here
+ */
+class MarmotBridge(
+    override val logger: Logger,
+    model: String = "/home/kupietz/KorAP/korapxml2conllu/libs/de.marmot",
+) : AnnotationToolBridge() {
+    val tagger: MorphTagger
+
+    init {
+        logger.info("Initializing MarMoT with model $model")
+        tagger = FileUtils.loadFromFile(model)
+        logger.info("Model $model loaded")
+    }
+
+    /** Tags [sentenceTokens] and stores one span per tagged word in [morphoMap], keyed by [sentenceTokenOffsets]. */
+    @Throws(java.lang.ArrayIndexOutOfBoundsException::class, java.lang.Exception::class)
+    override fun tagSentence(
+        sentenceTokens: MutableList<String>,
+        sentenceTokenOffsets: MutableList<String>,
+        morphoMap: MutableMap<String, KorapXml2Conllu.MorphoSpan>?
+    ) {
+        val result: List<List<String>> = tagger.tag(Sentence(sentenceTokens.map { Word(it) }))
+        result.forEachIndexed { i, tags ->
+            // tags[0], cut at the first '|', becomes xpos; tags[1] is passed on unchanged as feats.
+            val taggedWord = KorapXml2Conllu.MorphoSpan(xpos = tags[0].split("|")[0], feats = tags[1])
+            morphoMap?.set(sentenceTokenOffsets[i], taggedWord)
+        }
+    }
+}
\ No newline at end of file
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
index 638cae0..f3a95e4 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
@@ -38,6 +38,7 @@
class KorapXml2Conllu : Callable<Int> {
val COMPATIBILITY_MODE = System.getenv("COMPATIBILITY_MODE") != null
+ val marmotBridge = null
@Parameters(arity = "1..*", description = ["At least one zip file name"])
var zipFileNames: Array<String>? = null
@@ -151,12 +152,17 @@
val metadata: ConcurrentHashMap<String, Array<String>> = ConcurrentHashMap()
val extraFeatures: ConcurrentHashMap<String, MutableMap<String, String>> = ConcurrentHashMap()
var waitForMorpho: Boolean = false
-
+ var annotationToolBridge: AnnotationToolBridge? = null
fun korapxml2conllu(args: Array<String>) {
val executor: ExecutorService = Executors.newFixedThreadPool(threads)
- if (annotateWith != "") {
- annotationWorkerPool = AnnotationWorkerPool(annotateWith, threads, LOGGER)
+ if (annotateWith.isNotEmpty()) {
+ if (annotateWith.contains(".jar")) {
+ LOGGER.info("Annotating with jar file: $annotateWith")
+ annotationToolBridge = AnnotationToolBridgeFactory.getAnnotationToolBridge(annotateWith, LOGGER)
+ } else {
+ annotationWorkerPool = AnnotationWorkerPool(annotateWith, threads, LOGGER)
+ }
}
var zips: Array<String> = args
@@ -191,7 +197,7 @@
true
)
}
- if (annotateWith.isNotEmpty()) {
+ if (annotationWorkerPool != null) {
LOGGER.info("closing worker pool")
annotationWorkerPool?.close()
}
@@ -363,6 +369,9 @@
output.append(metadata[docId]?.joinToString("\t", prefix = "# metadata=", postfix = "\n") ?: "")
}
var previousSpanStart = 0
+ if (annotationToolBridge != null) {
+ morpho[docId] = annotationToolBridge!!.tagText(tokens[docId]!!, sentences[docId], texts[docId]!!)
+ }
tokens[docId]?.forEach { span ->
token_index++
if (span.from >= sentences[docId]!![sentence_index].to) {
@@ -384,7 +393,7 @@
}
previousSpanStart = span.from+1
}
- if (waitForMorpho && morpho[docId]?.containsKey("${span.from}-${span.to}") == true) {
+ if (morpho[docId]?.containsKey("${span.from}-${span.to}") == true) {
val mfs = morpho[docId]!!["${span.from}-${span.to}"]
output.append(
@@ -413,7 +422,7 @@
}
}
- if (annotateWith != "") {
+ if (annotationWorkerPool != null) {
annotationWorkerPool?.pushToQueue(output.append("\n# eot\n").toString())
} else {
synchronized(System.out) {
diff --git a/build.gradle b/build.gradle
index e69de29..f593f80 100644
--- a/build.gradle
+++ b/build.gradle
@@ -0,0 +1,5 @@
+// Flat-directory repository: dependencies may be resolved from jar files placed
+// in the "libs" directory.
+repositories {
+    flatDir { dirs("libs") }
+}