Add native spaCy support
Change-Id: Ibd3d660d2fc27a142e8d5e013b8bbb400bff5b9c
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
index 32060d5..b1c7fb9 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
@@ -18,8 +18,8 @@
class AnnotationToolBridgeFactory {
companion object {
- const val taggerFoundries = "marmot|opennlp|corenlp|treetagger"
- const val parserFoundries = "malt|corenlp"
+ const val taggerFoundries = "marmot|opennlp|corenlp|treetagger|spacy"
+ const val parserFoundries = "malt|corenlp|spacy"
fun getAnnotationToolBridge(foundry: String, model: String, LOGGER: Logger): AnnotationToolBridge? {
when (foundry) {
@@ -28,6 +28,7 @@
"malt" -> return MaltParserBridge(model, LOGGER)
"corenlp" -> return CoreNLPBridge(model, LOGGER)
"treetagger", "tree_tagger" -> return null
+ "spacy" -> return null
else -> LOGGER.severe("Unknown tagger/parser $foundry")
}
return null
@@ -40,6 +41,7 @@
"opennlp" -> return OpenNlpBridge(model, LOGGER)
"corenlp" -> return CoreNLPTaggerBridge(model, LOGGER)
"treetagger", "tree_tagger" -> return null
+ "spacy" -> return null
else -> LOGGER.severe("Unknown tagger $foundry")
}
return null
@@ -50,6 +52,7 @@
when (foundry) {
"malt" -> return MaltParserBridge(model, LOGGER)
"corenlp" -> return CoreNLPBridge(model, LOGGER, taggerModel)
+ "spacy" -> return null
else -> LOGGER.severe("Unknown parser $foundry")
}
return null
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index b2d572d..01cf7dc 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -92,7 +92,13 @@
" ./build/bin/korapxmltool -t zip -T marmot:de.marmot -P malt:german.mco app/src/test/resources/goe.zip",
" # (uses KORAPXMLTOOL_MODELS_PATH if model not found in current directory)",
"",
- " Use external spaCy annotation (without dependencies):",
+ " Native Docker spaCy tagging (without dependencies):",
+ " ./build/bin/korapxmltool -t zip -T spacy app/src/test/resources/goe.zip",
+ "",
+ " Native Docker spaCy tagging and dependency parsing:",
+ " ./build/bin/korapxmltool -t zip -P spacy app/src/test/resources/goe.zip",
+ "",
+ " Use external spaCy annotation (legacy method):",
" ./build/bin/korapxmltool -j4 -A \"docker run -e SPACY_USE_DEPENDENCIES=False --rm -i korap/conllu2spacy:latest\" -t zip ./app/src/test/resources/goe.zip",
"",
" Generate Krill tar from wud24_sample with multiple annotation foundries:",
@@ -323,12 +329,14 @@
data class DockerTaggerConfig(val image: String, val defaultModel: String, val defaultArgs: String)
private val dockerTaggers = mapOf(
- "treetagger" to DockerTaggerConfig("korap/conllu-treetagger", "german", "-p")
+ "treetagger" to DockerTaggerConfig("korap/conllu-treetagger", "german", "-p"),
+ "spacy" to DockerTaggerConfig("korap/conllu-spacy", "de_core_news_lg", "")
)
private val defaultParserModels = mapOf(
"malt" to "german.mco",
- "corenlp" to "germanSR.ser.gz"
+ "corenlp" to "germanSR.ser.gz",
+ "spacy" to "de_core_news_lg"
)
// Calculate optimal thread count based on format, memory, and input characteristics
@@ -467,7 +475,7 @@
names = ["-T", "--tag-with"],
paramLabel = "TAGGER[:MODEL]",
description = ["Specify a tagger and optionally a model: ${taggerFoundries}[:<path/to/model>].",
- "If model is omitted, defaults are: marmot→de.marmot, opennlp→de-pos-maxent.bin, corenlp→german-fast.tagger"]
+ "If model is omitted, defaults are: marmot→de.marmot, opennlp→de-pos-maxent.bin, corenlp→german-fast.tagger, treetagger→german, spacy→de_core_news_lg"]
)
fun setTagWith(tagWith: String) {
// Pattern now makes the model part optional
@@ -518,7 +526,16 @@
// The user request said: "docker run -v $KORAPXMLTOOL_MODELS_PATH:/local/models ..."
// AnnotationWorkerPool uses /bin/sh -c, so environment variables should be expanded by the shell.
- annotateWith = "docker run -v \${KORAPXMLTOOL_MODELS_PATH:-.}:/local/models --rm -i ${config.image} $args -l $model"
+ // Each Docker tagger image has its own command-line format; all share the same
+ // "docker run" prefix, which mounts the models directory at /local/models.
+ val dockerRunPrefix = "docker run -v \${KORAPXMLTOOL_MODELS_PATH:-.}:/local/models --rm -i ${config.image}"
+ annotateWith = when (taggerName) {
+     // spaCy uses -m for the model and -d to disable dependencies (tagging only).
+     // Extra arguments are appended instead of being silently dropped; trimEnd keeps
+     // the command identical to the plain form when there are none.
+     "spacy" -> "$dockerRunPrefix -m $model -d $args".trimEnd()
+     // TreeTagger uses -l for language/model and -p in args
+     "treetagger" -> "$dockerRunPrefix $args -l $model"
+     else -> "$dockerRunPrefix $args $model"
+ }
dockerLogMessage = "Configured Docker tagger '$taggerName' with command: $annotateWith"
} else {
@@ -558,7 +575,7 @@
names = ["-P", "--parse-with"],
paramLabel = "PARSER[:MODEL]",
description = ["Specify a parser and optionally a model: ${parserFoundries}[:<path/to/model>].",
- "If model is omitted, defaults are: malt→german.mco, corenlp→germanSR.ser.gz"]
+ "If model is omitted, defaults are: malt→german.mco, corenlp→germanSR.ser.gz, spacy→de_core_news_lg"]
)
fun setParseWith(parseWith: String) {
// Pattern now makes the model part optional
@@ -570,31 +587,68 @@
"value does not match the expected pattern ${parserFoundries}[:<path/to/model>]", parseWith))
} else {
parserName = matcher.group(1)
- val originalModelPath = matcher.group(2) ?: defaultParserModels[parserName]
-
- if (originalModelPath == null) {
- throw ParameterException(spec.commandLine(),
- String.format(Locale.ROOT, "No default model available for parser '%s'", parserName))
- }
-
- val resolvedModelPath = resolveModelPath(originalModelPath)
- if (resolvedModelPath != null) {
- parserModel = resolvedModelPath
- if (resolvedModelPath != originalModelPath) {
- // Store for logging after logger initialization
- modelPathResolutions.add(originalModelPath to resolvedModelPath)
+ // Handle Docker parsers (like spaCy)
+ if (dockerTaggers.containsKey(parserName)) {
+ val config = dockerTaggers[parserName]!!
+ val modelPart = matcher.group(2)
+
+ var model = config.defaultModel
+ var args = config.defaultArgs
+
+ if (modelPart != null) {
+ val parts = modelPart.split(":", limit = 2)
+ if (parts.isNotEmpty() && parts[0].isNotBlank()) {
+ model = parts[0]
+ }
+ if (parts.size > 1) {
+ val customArgs = parts[1]
+ args = if (config.defaultArgs.isNotBlank()) {
+ "${config.defaultArgs} $customArgs"
+ } else {
+ customArgs
+ }
+ }
}
- } else {
- val defaultModelsPath = System.getenv("KORAPXMLTOOL_MODELS_PATH")
- val searchInfo = if (defaultModelsPath != null) {
- " (searched in current directory and KORAPXMLTOOL_MODELS_PATH='$defaultModelsPath')"
+
+ parserModel = model // For logging
+
+ // For spaCy parsing, do NOT add -d flag (parsing is enabled by default)
+ if (parserName == "spacy") {
+ // spaCy uses -m for model, no -d flag for parsing mode. Extra arguments given after the
+ // model are appended instead of being silently dropped; trimEnd keeps the command
+ // identical to the plain form when there are none.
+ annotateWith = "docker run -v \${KORAPXMLTOOL_MODELS_PATH:-.}:/local/models --rm -i ${config.image} -m $model $args".trimEnd()
} else {
- " (searched in current directory; KORAPXMLTOOL_MODELS_PATH defaults to ../lib/models relative to executable)"
+ annotateWith = "docker run -v \${KORAPXMLTOOL_MODELS_PATH:-.}:/local/models --rm -i ${config.image} $args $model"
}
- throw ParameterException(spec.commandLine(),
- String.format(Locale.ROOT, "Invalid value for option '--parse-with': "+
- "model file '%s' does not exist%s", originalModelPath, searchInfo))
+ dockerLogMessage = "Configured Docker parser '$parserName' with command: $annotateWith"
+
+ } else {
+ val originalModelPath = matcher.group(2) ?: defaultParserModels[parserName]
+
+ if (originalModelPath == null) {
+ throw ParameterException(spec.commandLine(),
+ String.format(Locale.ROOT, "No default model available for parser '%s'", parserName))
+ }
+
+ val resolvedModelPath = resolveModelPath(originalModelPath)
+
+ if (resolvedModelPath != null) {
+ parserModel = resolvedModelPath
+ if (resolvedModelPath != originalModelPath) {
+ // Store for logging after logger initialization
+ modelPathResolutions.add(originalModelPath to resolvedModelPath)
+ }
+ } else {
+ val defaultModelsPath = System.getenv("KORAPXMLTOOL_MODELS_PATH")
+ val searchInfo = if (defaultModelsPath != null) {
+ " (searched in current directory and KORAPXMLTOOL_MODELS_PATH='$defaultModelsPath')"
+ } else {
+ " (searched in current directory; KORAPXMLTOOL_MODELS_PATH defaults to ../lib/models relative to executable)"
+ }
+ throw ParameterException(spec.commandLine(),
+ String.format(Locale.ROOT, "Invalid value for option '--parse-with': "+
+ "model file '%s' does not exist%s", originalModelPath, searchInfo))
+ }
}
}
}
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/DockerTaggerTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/DockerTaggerTest.kt
index be60562..d96ec3a 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/DockerTaggerTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/DockerTaggerTest.kt
@@ -2,10 +2,56 @@
import org.junit.Test
import kotlin.test.assertTrue
+import kotlin.test.assertFalse
class DockerTaggerTest {
+ /** `-T spacy` must produce the tagging-only command: default model via `-m`, parsing disabled via `-d`. */
+ @Test
+ fun testSpacyTaggerConfiguration() {
+     val tool = KorapXmlTool()
+
+     // Test spaCy tagging (should add -d flag to disable parsing)
+     tool.setTagWith("spacy")
+
+     // Read the generated command from the annotateWith field via reflection.
+     val field = KorapXmlTool::class.java.getDeclaredField("annotateWith").apply { isAccessible = true }
+     val command = field.get(tool) as String
+     // Compare whole words so that "-d" cannot be matched inside another argument.
+     val words = command.split(" ")
+
+     assertTrue("-d" in words, "spaCy tagger should contain -d flag to disable parsing. Output: $command")
+     assertTrue(command.contains("-m de_core_news_lg"), "Should contain default model")
+     assertTrue(command.contains("korap/conllu-spacy"), "Should use correct Docker image")
+ }
+
+ /** `-P spacy` must produce the parsing command: default model via `-m` and no `-d` flag. */
+ @Test
+ fun testSpacyParserConfiguration() {
+     val tool = KorapXmlTool()
+
+     // Test spaCy parsing (should NOT add -d flag)
+     tool.setParseWith("spacy")
+
+     // Read the generated command from the annotateWith field via reflection.
+     val field = KorapXmlTool::class.java.getDeclaredField("annotateWith").apply { isAccessible = true }
+     val command = field.get(tool) as String
+     // Compare whole words: a substring check would fail spuriously if a model or image
+     // name ever contained "-d".
+     val words = command.split(" ")
+
+     assertFalse("-d" in words, "spaCy parser should NOT contain -d flag. Output: $command")
+     assertTrue(command.contains("-m de_core_news_lg"), "Should contain default model")
+     assertTrue(command.contains("korap/conllu-spacy"), "Should use correct Docker image")
+ }
+
+ /** `-T spacy:<model>` must pass the custom model via `-m` and still disable parsing via `-d`. */
+ @Test
+ fun testSpacyCustomModel() {
+     val tool = KorapXmlTool()
+
+     // Test spaCy with custom model
+     tool.setTagWith("spacy:de_core_news_sm")
+
+     // Read the generated command from the annotateWith field via reflection.
+     val field = KorapXmlTool::class.java.getDeclaredField("annotateWith").apply { isAccessible = true }
+     val command = field.get(tool) as String
+     // Compare whole words so that "-d" cannot be matched inside another argument.
+     val words = command.split(" ")
+
+     assertTrue(command.contains("-m de_core_news_sm"), "Should contain custom model. Output: $command")
+     assertTrue("-d" in words, "Should still contain -d flag for tagging")
+ }
@Test
fun testTreeTaggerArgumentAppending() {