Add native treetagger support
Change-Id: I155da3c0d6b185bb718d9c685d45a3862fd9cc49
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
index c7ae61d..32060d5 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
@@ -18,7 +18,7 @@
class AnnotationToolBridgeFactory {
companion object {
- const val taggerFoundries = "marmot|opennlp|corenlp"
+ const val taggerFoundries = "marmot|opennlp|corenlp|treetagger"
const val parserFoundries = "malt|corenlp"
fun getAnnotationToolBridge(foundry: String, model: String, LOGGER: Logger): AnnotationToolBridge? {
@@ -27,6 +27,7 @@
"opennlp" -> return OpenNlpBridge(model, LOGGER)
"malt" -> return MaltParserBridge(model, LOGGER)
"corenlp" -> return CoreNLPBridge(model, LOGGER)
+ "treetagger", "tree_tagger" -> return null
else -> LOGGER.severe("Unknown tagger/parser $foundry")
}
return null
@@ -38,6 +39,7 @@
"marmot" -> return MarmotBridge(model, LOGGER)
"opennlp" -> return OpenNlpBridge(model, LOGGER)
"corenlp" -> return CoreNLPTaggerBridge(model, LOGGER)
+ "treetagger", "tree_tagger" -> return null
else -> LOGGER.severe("Unknown tagger $foundry")
}
return null
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index e037ded..b2d572d 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -308,6 +308,7 @@
private var taggerName: String? = null
private var taggerModel: String? = null
+ private var dockerLogMessage: String? = null
// Store model path resolutions for logging after logger initialization
private val modelPathResolutions: MutableList<Pair<String, String>> = mutableListOf()
@@ -316,7 +317,13 @@
private val defaultTaggerModels = mapOf(
"marmot" to "de.marmot",
"opennlp" to "de-pos-maxent.bin",
- "corenlp" to "german-fast.tagger"
+ "corenlp" to "german-fast.tagger",
+ "treetagger" to "german"
+ )
+
+ data class DockerTaggerConfig(val image: String, val defaultModel: String, val defaultArgs: String)
+ private val dockerTaggers = mapOf(
+ "treetagger" to DockerTaggerConfig("korap/conllu-treetagger", "german", "-p")
)
private val defaultParserModels = mapOf(
@@ -471,32 +478,76 @@
String.format(Locale.ROOT, "Invalid value `%s' for option '--tag-with': "+
"value does not match the expected pattern ${taggerFoundries}[:<path/to/model>]", tagWith))
} else {
+
taggerName = matcher.group(1)
- val originalModelPath = matcher.group(2) ?: defaultTaggerModels[taggerName]
-
- if (originalModelPath == null) {
- throw ParameterException(spec.commandLine(),
- String.format(Locale.ROOT, "No default model available for tagger '%s'", taggerName))
- }
-
- val resolvedModelPath = resolveModelPath(originalModelPath)
- if (resolvedModelPath != null) {
- taggerModel = resolvedModelPath
- if (resolvedModelPath != originalModelPath) {
- // Store for logging after logger initialization
- modelPathResolutions.add(originalModelPath to resolvedModelPath)
+ if (dockerTaggers.containsKey(taggerName)) {
+ val config = dockerTaggers[taggerName]!!
+ val modelPart = matcher.group(2)
+
+ var model = config.defaultModel
+ var args = config.defaultArgs
+
+ if (modelPart != null) {
+ // Split by first colon to separate model and args if present
+ // Format could be: "model" or "model:args" or ":args" (if model is empty, but regex group 2 implies presence)
+ // Actually regex is (foundry)(?::(.+))? so group 2 is everything after first colon
+ // We want to support:
+ // treetagger -> default model, default args
+ // treetagger:german -> german model, default args
+ // treetagger:german:-p -x -> german model, custom args
+
+ val parts = modelPart.split(":", limit = 2)
+ if (parts.isNotEmpty() && parts[0].isNotBlank()) {
+ model = parts[0]
+ }
+ if (parts.size > 1) {
+ val customArgs = parts[1]
+ args = if (config.defaultArgs.isNotBlank()) {
+ "${config.defaultArgs} $customArgs"
+ } else {
+ customArgs
+ }
+ }
}
+
+ taggerModel = model // For logging
+
+ // Construct Docker command
+ // We assume KORAPXMLTOOL_MODELS_PATH is set in the environment or we use a default?
+ // The user request said: "docker run -v $KORAPXMLTOOL_MODELS_PATH:/local/models ..."
+ // AnnotationWorkerPool uses /bin/sh -c, so environment variables should be expanded by the shell.
+
+ annotateWith = "docker run -v \${KORAPXMLTOOL_MODELS_PATH:-.}:/local/models --rm -i ${config.image} $args -l $model"
+ dockerLogMessage = "Configured Docker tagger '$taggerName' with command: $annotateWith"
+
} else {
- val defaultModelsPath = System.getenv("KORAPXMLTOOL_MODELS_PATH")
- val searchInfo = if (defaultModelsPath != null) {
- " (searched in current directory and KORAPXMLTOOL_MODELS_PATH='$defaultModelsPath')"
- } else {
- " (searched in current directory; KORAPXMLTOOL_MODELS_PATH defaults to ../lib/models relative to executable)"
+ val originalModelPath = matcher.group(2) ?: defaultTaggerModels[taggerName]
+
+ if (originalModelPath == null) {
+ throw ParameterException(spec.commandLine(),
+ String.format(Locale.ROOT, "No default model available for tagger '%s'", taggerName))
}
- throw ParameterException(spec.commandLine(),
- String.format(Locale.ROOT, "Invalid value for option '--tag-with': "+
- "model file '%s' does not exist%s", originalModelPath, searchInfo))
+
+ val resolvedModelPath = resolveModelPath(originalModelPath)
+
+ if (resolvedModelPath != null) {
+ taggerModel = resolvedModelPath
+ if (resolvedModelPath != originalModelPath) {
+ // Store for logging after logger initialization
+ modelPathResolutions.add(originalModelPath to resolvedModelPath)
+ }
+ } else {
+ val defaultModelsPath = System.getenv("KORAPXMLTOOL_MODELS_PATH")
+ val searchInfo = if (defaultModelsPath != null) {
+ " (searched in current directory and KORAPXMLTOOL_MODELS_PATH='$defaultModelsPath')"
+ } else {
+ " (searched in current directory; KORAPXMLTOOL_MODELS_PATH defaults to ../lib/models relative to executable)"
+ }
+ throw ParameterException(spec.commandLine(),
+ String.format(Locale.ROOT, "Invalid value for option '--tag-with': "+
+ "model file '%s' does not exist%s", originalModelPath, searchInfo))
+ }
}
}
}
@@ -577,7 +628,7 @@
modelPathResolutions.forEach { (original, resolved) ->
LOGGER.info("Resolved model path '$original' to '$resolved'")
}
-
+
// Validate input files exist before doing any processing
zipFileNames?.forEach { zipFile ->
if (!File(zipFile).exists()) {
@@ -1196,6 +1247,10 @@
}
annotationWorkerPool = AnnotationWorkerPool(annotateWith, maxThreads, LOGGER, handler)
}
+
+ if (dockerLogMessage != null) {
+ LOGGER.info(dockerLogMessage)
+ }
}
var zips: Array<String> = args
@@ -3746,12 +3801,12 @@
}
}
line.startsWith("# start_offsets =") -> {
- val offsetsStr = line.substring("# start_offsets =".length).trim()
- val allOffsets = offsetsStr.split(Regex("\\s+")).mapNotNull { it.toIntOrNull() }
- sentenceStartOffset = allOffsets.firstOrNull()
- currentStartOffsets = if (allOffsets.size > 1) allOffsets.drop(1) else allOffsets
- tokenIndexInSentence = 0
- }
+ val offsetsStr = line.substring("# start_offsets =".length).trim()
+ val allOffsets = offsetsStr.split(Regex("\\s+")).mapNotNull { it.toIntOrNull() }
+ sentenceStartOffset = allOffsets.firstOrNull()
+ currentStartOffsets = if (allOffsets.size > 1) allOffsets.drop(1) else allOffsets
+ tokenIndexInSentence = 0
+ }
line.startsWith("# end_offsets =") -> {
val offsetsStr = line.substring("# end_offsets =".length).trim()
val allOffsets = offsetsStr.split(Regex("\\s+")).mapNotNull { it.toIntOrNull() }
@@ -3782,17 +3837,17 @@
val deps = if (fields.size > 8) fields[8] else "_"
val misc = if (fields.size > 9) fields[9] else "_"
- if (currentStartOffsets != null && currentEndOffsets != null &&
- tokenIndexInSentence < currentStartOffsets.size &&
- tokenIndexInSentence < currentEndOffsets.size) {
+ if (currentStartOffsets != null && currentEndOffsets != null &&
+ tokenIndexInSentence < currentStartOffsets.size &&
+ tokenIndexInSentence < currentEndOffsets.size) {
- val spanFrom = currentStartOffsets[tokenIndexInSentence]
- val spanTo = currentEndOffsets[tokenIndexInSentence]
- val spanKey = "$spanFrom-$spanTo"
+ val spanFrom = currentStartOffsets[tokenIndexInSentence]
+ val spanTo = currentEndOffsets[tokenIndexInSentence]
+ val spanKey = "$spanFrom-$spanTo"
- morphoSpans[spanKey] = MorphoSpan(lemma, upos, xpos, feats, head, deprel, deps, misc)
- tokenIndexInSentence++
- }
+ morphoSpans[spanKey] = MorphoSpan(lemma, upos, xpos, feats, head, deprel, deps, misc)
+ tokenIndexInSentence++
+ }
}
}
}
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/DockerTaggerTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/DockerTaggerTest.kt
new file mode 100644
index 0000000..be60562
--- /dev/null
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/DockerTaggerTest.kt
@@ -0,0 +1,45 @@
+package de.ids_mannheim.korapxmltools
+
+import org.junit.Test
+import kotlin.test.assertTrue
+
+class DockerTaggerTest {
+
+
+
+ @Test
+ fun testTreeTaggerArgumentAppending() {
+ val tool = KorapXmlTool()
+ // We need to inject the configuration manually or ensure it's initialized
+ // KorapXmlTool initializes dockerTaggers in its property declaration or init block?
+ // Let's assume it's available.
+
+ // We can't easily call setTagWith because it's part of the picocli parsing or a method.
+ // But we can check if we can access the dockerTaggers map and simulate the logic
+ // OR better: use the tool instance to parse args if possible, but picocli does that.
+
+ // Actually, let's just use the fact that setTagWith is a public method (from the view_file output).
+ // We need to ensure dockerTaggers is populated.
+
+ // Let's try to call setTagWith directly.
+ try {
+ tool.setTagWith("treetagger:german:-x")
+
+ // Now check the annotateWith property
+ // We need to access the private property 'annotateWith' or 'dockerLogMessage'
+ // If they are private, we might need reflection.
+
+ val annotateWithField = KorapXmlTool::class.java.getDeclaredField("annotateWith")
+ annotateWithField.isAccessible = true
+ val annotateWith = annotateWithField.get(tool) as String
+
+ assertTrue(annotateWith.contains("-p -x"), "Should contain both default (-p) and custom (-x) args. Output: $annotateWith")
+ assertTrue(annotateWith.contains("-l german"), "Should contain model arg")
+
+ } catch (e: Exception) {
+ // If setTagWith fails (e.g. due to missing config), we might need to setup more.
+ // But dockerTaggers seems to be statically initialized or initialized in the class.
+ throw e
+ }
+ }
+}