Add internal OpenNLP POS tagger
Change-Id: I4e177ac4465f571b5376924736f278ebd019a0ab
diff --git a/app/.idea/copilot/chatSessions/blobs/version b/app/.idea/copilot/chatSessions/blobs/version
new file mode 100644
index 0000000..720d64f
--- /dev/null
+++ b/app/.idea/copilot/chatSessions/blobs/version
Binary files differ
diff --git a/app/build.gradle b/app/build.gradle
index 693a7be..2f74157 100644
--- a/app/build.gradle
+++ b/app/build.gradle
@@ -52,9 +52,12 @@
implementation 'com.github.kupietz:cistern:v1.0.4'
implementation 'org.maltparser:maltparser:1.9.2'
+ implementation 'org.apache.opennlp:opennlp-tools:2.3.2'
+ implementation 'org.slf4j:slf4j-simple:2.1.0-alpha1'
}
+
application {
// Define the main class for the application.
mainClass = 'de.ids_mannheim.korapxmltools.KorapXml2ConlluKt'
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
index 4e81986..ab12d60 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
@@ -18,12 +18,13 @@
class AnnotationToolBridgeFactory {
companion object {
- const val taggerFoundries = "marmot"
+ const val taggerFoundries = "marmot|opennlp"
const val parserFoundries = "malt"
fun getAnnotationToolBridge(foundry: String, model: String, LOGGER: Logger): AnnotationToolBridge? {
when (foundry) {
"marmot" -> return MarmotBridge(model, LOGGER)
+ "opennlp" -> return OpenNlpBridge(model, LOGGER)
"malt" -> return MaltParserBridge(model, LOGGER)
else -> LOGGER.severe("Unknown tagger $foundry")
}
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
index 763ecaf..104572c 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
@@ -283,6 +283,7 @@
}
private fun processZipFile(zipFilePath: String, foundry: String = "base") {
+ LOGGER.info("Processing ${zipFilePath} in thread ${Thread.currentThread().id}")
if (zipFilePath.hasCorrespondingBaseZip()) {
val zips = arrayOf(zipFilePath, zipFilePath.correspondingBaseZip()!!)
Arrays.stream(zips).parallel().forEach { zip ->
@@ -304,6 +305,7 @@
}
private fun processZipFileSequentially(zipFilePath: String, foundry: String = "base") {
+ LOGGER.info("Processing ${zipFilePath} in thread ${Thread.currentThread().id}")
if (zipFilePath.hasCorrespondingBaseZip()) {
val zips = arrayOf(zipFilePath, zipFilePath.correspondingBaseZip()!!)
Arrays.stream(zips).parallel().forEach { zip ->
@@ -328,7 +330,7 @@
fun processZipEntry(zipFile: ZipFile, _foundry: String, zipEntry: ZipEntry, passedWaitForMorpho: Boolean) {
var foundry = _foundry
var waitForMorpho = passedWaitForMorpho
- LOGGER.info("Processing ${zipEntry.name} in thread ${Thread.currentThread().id}")
+ LOGGER.finer("Processing ${zipEntry.name} in thread ${Thread.currentThread().id}")
if (taggerName != null && !taggerToolBridges.containsKey(Thread.currentThread().id)) {
val tagger = AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER) as TaggerToolBridge?
taggerToolBridges[Thread.currentThread().id] = tagger
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/OpenNlpBridge.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/OpenNlpBridge.kt
new file mode 100644
index 0000000..694074e
--- /dev/null
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/OpenNlpBridge.kt
@@ -0,0 +1,51 @@
+package de.ids_mannheim.korapxmltools
+
+import opennlp.tools.postag.POSModel
+import opennlp.tools.postag.POSTaggerME
+import java.io.File
+import java.util.*
+import java.util.logging.Logger
+
+
+/**
+ * [TaggerToolBridge] that assigns POS tags with an OpenNLP [POSTaggerME].
+ * The [POSModel] is loaded once and cached in the companion object, so a
+ * [model] path passed after the first successful load is not read again.
+ */
+class OpenNlpBridge(override val model: String, override val logger: Logger) : TaggerToolBridge() {
+    override val foundry = "opennlp"
+    val tagger: POSTaggerME
+
+    init {
+        // Lock on the companion, not the per-instance `model` string, so all constructors share one monitor.
+        val sharedModel = synchronized(Companion) {
+            POSmodel ?: run {
+                logger.info("Initializing OpenNLP with model $model")
+                // `use` closes the file stream once the model has been read.
+                File(model).inputStream().use { POSModel(it) }.also {
+                    POSmodel = it
+                    logger.info("Model $model loaded")
+                }
+            }
+        }
+        tagger = POSTaggerME(sharedModel)
+    }
+
+    /** Tags [sentenceTokens]; stores xpos and "%.5f" probability in [morphoMap] keyed by [sentenceTokenOffsets]. */
+    override fun tagSentence(
+        sentenceTokens: MutableList<String>,
+        sentenceTokenOffsets: MutableList<String>,
+        morphoMap: MutableMap<String, KorapXml2Conllu.MorphoSpan>?
+    ) {
+        val tags = tagger.tag(sentenceTokens.toTypedArray())
+        val probs = tagger.probs()
+        tags.forEachIndexed { i, tag ->
+            val span = KorapXml2Conllu.MorphoSpan(xpos = tag, misc = "%.5f".format(Locale.ROOT, probs[i]))
+            morphoMap?.set(sentenceTokenOffsets[i], span)
+        }
+    }
+
+    companion object {
+        var POSmodel: POSModel? = null
+    }
+}
\ No newline at end of file