Add option --tag-with marmot:<path/to/model>
Change-Id: I04db45f1ba2ebb44a938be6dd34131b448b19c1f
diff --git a/Readme.md b/Readme.md
index fa9d396..9641821 100644
--- a/Readme.md
+++ b/Readme.md
@@ -61,6 +61,20 @@
```shell script
java -jar app/build/libs/korapxml2conllu.jar -T 10 -A "docker run --rm -i korap/conllu2treetagger -l french" app/src/test/resources/wdf19.zip | conllu2korapxml wdf19.tree_tagger.zip
```
+### Tag with integrated MarMoT POS tagger
+
+```shell script
+$ java -jar ./app/build/libs/korapxml2conllu.jar -t marmot:models/de.marmot app/src/test/resources/goe.zip
+
+# foundry = base
+# filename = GOE/AGA/00000/base/tokens.xml
+# text_id = GOE_AGA.00000
+# start_offsets = 0 0 9 12
+# end_offsets = 22 8 11 22
+1 Campagne _ _ NN case=nom|number=sg|gender=fem _ _ _ _
+2 in _ _ APPR _ _ _ _ _
+3 Frankreich _ _ NE case=dat|number=sg|gender=neut _ _ _ _
+```
## Development and License
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
index 6c83e08..3354211 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
@@ -10,6 +10,7 @@
import kotlin.jvm.Throws
abstract class AnnotationToolBridge {
+ abstract val model: String
abstract val logger: Logger
@Throws(java.lang.ArrayIndexOutOfBoundsException::class, java.lang.Exception::class)
@@ -50,18 +51,22 @@
class AnnotationToolBridgeFactory {
companion object {
- fun getAnnotationToolBridge(annotateWith: String, LOGGER: Logger): AnnotationToolBridge? {
- return MarmotBridge(LOGGER)
+        /**
+         * Creates the bridge for the requested tagger.
+         *
+         * @param taggerName name of the tagger; only "marmot" is recognised here
+         * @param taggerModel model path handed to the bridge constructor
+         * @param LOGGER logger passed on to the bridge and used for the warning below
+         * @return a [MarmotBridge] for "marmot"; `null` (after logging a warning) for any other name
+         */
+        fun getAnnotationToolBridge(taggerName: String, taggerModel: String, LOGGER: Logger): AnnotationToolBridge? {
+            return when (taggerName) {
+                "marmot" -> MarmotBridge(taggerModel, LOGGER)
+                else -> {
+                    LOGGER.warning("Unknown tagger $taggerName")
+                    null
+                }
+            }
}
}
}
-class MarmotBridge(override val logger: Logger) : AnnotationToolBridge() {
+class MarmotBridge(override val model: String, override val logger: Logger) : AnnotationToolBridge() {
val tagger: MorphTagger
init {
- val model = "/home/kupietz/KorAP/korapxml2conllu/libs/de.marmot"
logger.info("Initializing MarMoT with model $model")
tagger = FileUtils.loadFromFile(model)
//tagger.setMaxLevel(100)
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
index 33448a7..74e74f8 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
@@ -19,6 +19,8 @@
import java.util.logging.Level
import java.util.logging.LogManager
import java.util.logging.Logger
+import java.util.regex.Matcher
+import java.util.regex.Pattern
import java.util.stream.IntStream
import java.util.zip.ZipFile
import javax.xml.parsers.DocumentBuilder
@@ -39,6 +41,8 @@
class KorapXml2Conllu : Callable<Int> {
val COMPATIBILITY_MODE = System.getenv("COMPATIBILITY_MODE") != null
+ @Spec lateinit var spec : Model.CommandSpec
+
@Parameters(arity = "1..*", description = ["At least one zip file name"])
var zipFileNames: Array<String>? = null
@@ -117,6 +121,34 @@
)
var threads: Int = Runtime.getRuntime().availableProcessors() / 2
+    /** Tagger name parsed from `--tag-with`; remains `null` until [setTagWith] accepts a value. */
+    private var taggerName: String? = null
+
+    /** Model path parsed from `--tag-with`; remains `null` until [setTagWith] accepts a value. */
+    private var taggerModel: String? = null
+
+    /**
+     * Parses and validates the value of `--tag-with` / `-t`.
+     *
+     * The value must have the form `marmot:<path/to/model>` and the path must exist.
+     * [taggerName] and [taggerModel] are assigned only after both checks have passed,
+     * so a rejected value never leaves partially updated state behind.
+     *
+     * @param tagWith raw option value as given on the command line
+     * @throws ParameterException if the value does not match `marmot:<path>` or the
+     *   model path does not exist
+     */
+    @Option(
+        names = ["--tag-with", "-t"],
+        paramLabel = "TAGGER:MODEL",
+        description = ["Specify a tagger and a model: marmot:<path/to/model>."]
+    )
+    fun setTagWith(tagWith: String) {
+        // java.util.regex is kept on purpose: this change adds the Pattern/Matcher imports.
+        val matcher: Matcher = Pattern.compile("(marmot):(.+)").matcher(tagWith)
+        if (!matcher.matches()) {
+            throw ParameterException(
+                spec.commandLine(),
+                "Invalid value '$tagWith' for option '--tag-with': " +
+                    "value does not match the expected pattern marmot:<path/to/model>"
+            )
+        }
+
+        val name = matcher.group(1)
+        val modelPath = matcher.group(2)
+        if (!File(modelPath).exists()) {
+            throw ParameterException(
+                spec.commandLine(),
+                "Invalid value for option '--tag-with': model file '$modelPath' does not exist"
+            )
+        }
+
+        taggerName = name
+        taggerModel = modelPath
+    }
+
+
override fun call(): Int {
val handler = ConsoleHandler()
LogManager.getLogManager().reset()
@@ -156,9 +188,7 @@
val executor: ExecutorService = Executors.newFixedThreadPool(threads)
if (annotateWith.isNotEmpty()) {
- if (!annotateWith.contains(".jar")) {
- annotationWorkerPool = AnnotationWorkerPool(annotateWith, threads, LOGGER)
- }
+ annotationWorkerPool = AnnotationWorkerPool(annotateWith, threads, LOGGER)
}
var zips: Array<String> = args
@@ -238,9 +268,9 @@
.parallel()
.forEach { zipEntry ->
LOGGER.info("Processing ${zipEntry.name} in thread ${Thread.currentThread().id}")
- if (annotateWith.contains(".jar") && !annotationToolBridges.containsKey(Thread.currentThread().id)) {
+ if (taggerName != null && !annotationToolBridges.containsKey(Thread.currentThread().id)) {
annotationToolBridges[Thread.currentThread().id] =
- AnnotationToolBridgeFactory.getAnnotationToolBridge(annotateWith, LOGGER)
+ AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER)
}
try {