Add --parse-with malt:</path/to/model> option
Change-Id: I3d0691241b3b4df0abd3c9f02a1f29d23bdd030f
diff --git a/app/build.gradle b/app/build.gradle
index c42b4ba..7c2b5e1 100644
--- a/app/build.gradle
+++ b/app/build.gradle
@@ -45,6 +45,7 @@
testImplementation "org.jetbrains.kotlin:kotlin-test:2.0.0-Beta5"
implementation 'com.github.kupietz:cistern:v1.0.2'
+ implementation 'org.maltparser:maltparser:1.9.2'
}
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
index 3354211..cfa19e7 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
@@ -1,24 +1,31 @@
package de.ids_mannheim.korapxmltools
-import kotlinx.coroutines.CoroutineScope
-import kotlinx.coroutines.Dispatchers
import marmot.morph.MorphTagger
import marmot.morph.Sentence
import marmot.morph.Word
import marmot.util.FileUtils
+import org.maltparser.MaltParserService
+import org.maltparser.core.exception.MaltChainedException
+import org.maltparser.core.syntaxgraph.DependencyStructure
import java.util.logging.Logger
-import kotlin.jvm.Throws
-abstract class AnnotationToolBridge {
- abstract val model: String
- abstract val logger: Logger
+interface AnnotationToolBridge {
+ val model: String
+ val logger: Logger
@Throws(java.lang.ArrayIndexOutOfBoundsException::class, java.lang.Exception::class)
- abstract fun tagSentence(sentenceTokens: MutableList<String>, sentenceTokenOffsets: MutableList<String>, morphoMap: MutableMap<String, KorapXml2Conllu.MorphoSpan>?)
+ fun tagSentence(
+ sentenceTokens: MutableList<String>,
+ sentenceTokenOffsets: MutableList<String>,
+ morphoMap: MutableMap<String, KorapXml2Conllu.MorphoSpan>?
+ )
+}
- fun tagText(tokens: Array<KorapXml2Conllu.Span>, sentenceSpans: Array<KorapXml2Conllu.Span>?, text: String): MutableMap<String, KorapXml2Conllu.MorphoSpan> {
- val coroutineScope = CoroutineScope(Dispatchers.Default)
+abstract class TaggerToolBridge : AnnotationToolBridge {
+ fun tagText(
+ tokens: Array<KorapXml2Conllu.Span>, sentenceSpans: Array<KorapXml2Conllu.Span>?, text: String
+ ): MutableMap<String, KorapXml2Conllu.MorphoSpan> {
val sentence_tokens = mutableListOf<String>()
val sentence_token_offsets = mutableListOf<String>()
val morphoMap = mutableMapOf<String, KorapXml2Conllu.MorphoSpan>()
@@ -48,22 +55,118 @@
}
}
+/**
+ * Base class for dependency-parser bridges.
+ *
+ * [parseText] cuts a tokenized text into sentences, renders each sentence as
+ * tab-separated token rows and hands them to [tagSentence], which is expected
+ * to record its results in the map it receives.
+ */
+abstract class ParserToolBridge : AnnotationToolBridge {
+    /**
+     * Parses all sentences of one text.
+     *
+     * @param tokens token spans of the text (assumed to be in text order)
+     * @param morpho existing annotations keyed by "from-to" offsets; their xpos and feats
+     *   values are copied into the rows handed to the parser. When `null`, a fresh map is used.
+     * @param sentenceSpans sentence spans; when `null` or exhausted, the remaining tokens
+     *   are treated as one sentence
+     * @param text the raw text the spans point into
+     * @return the map that was handed to [tagSentence] for every sentence
+     */
+    fun parseText(
+        tokens: Array<KorapXml2Conllu.Span>,
+        morpho: MutableMap<String, KorapXml2Conllu.MorphoSpan>?,
+        sentenceSpans: Array<KorapXml2Conllu.Span>?,
+        text: String
+    ): MutableMap<String, KorapXml2Conllu.MorphoSpan> {
+        // Fall back to an empty map so a missing morpho layer cannot end in a NullPointerException.
+        val morphoMap = morpho ?: mutableMapOf()
+        val sentenceTokens = mutableListOf<String>()
+        val sentenceTokenOffsets = mutableListOf<String>()
+        var tokenIndex = 1
+        var sentenceIndex = 0
+        tokens.forEach { span ->
+            // getOrNull: tokens behind the last sentence span must not raise an index error.
+            val sentenceEnd = sentenceSpans?.getOrNull(sentenceIndex)?.to ?: Int.MAX_VALUE
+            if (span.from >= sentenceEnd) {
+                parseSentence(sentenceTokens, sentenceTokenOffsets, morphoMap)
+                sentenceTokens.clear()
+                sentenceTokenOffsets.clear()
+                sentenceIndex++
+                tokenIndex = 1
+            }
+            val offsets = "${span.from}-${span.to}"
+            val annotation = morphoMap[offsets]
+            val form = text.substring(span.from, span.to)
+            val xpos = annotation?.xpos ?: "_"
+            val feats = annotation?.feats ?: "_"
+            // Ten tab-separated columns; xpos fills both part-of-speech columns.
+            sentenceTokens.add("$tokenIndex\t$form\t_\t$xpos\t$xpos\t$feats\t_\t_\t_\t_")
+            sentenceTokenOffsets.add(offsets)
+            tokenIndex++
+        }
+        parseSentence(sentenceTokens, sentenceTokenOffsets, morphoMap)
+        return morphoMap
+    }
+
+    /**
+     * Runs [tagSentence] on one sentence. Empty sentences are skipped, and an
+     * [ArrayIndexOutOfBoundsException] is logged instead of aborting the whole text.
+     */
+    private fun parseSentence(
+        sentenceTokens: MutableList<String>,
+        sentenceTokenOffsets: MutableList<String>,
+        morphoMap: MutableMap<String, KorapXml2Conllu.MorphoSpan>
+    ) {
+        if (sentenceTokens.isEmpty()) return
+        try {
+            tagSentence(sentenceTokens, sentenceTokenOffsets, morphoMap)
+        } catch (e: ArrayIndexOutOfBoundsException) {
+            logger.warning(
+                "Tagging failed: ${e.message} ${e.stackTraceToString()} ${sentenceTokens.joinToString(" ")}"
+            )
+        }
+    }
+}
+
class AnnotationToolBridgeFactory {
companion object {
- fun getAnnotationToolBridge(taggerName: String, taggerModel: String, LOGGER: Logger): AnnotationToolBridge? {
- if (taggerName == "marmot") {
- return MarmotBridge(taggerModel, LOGGER)
- } else {
- LOGGER.warning("Unknown tagger $taggerName")
- return null
+        /** Name(s) of the foundries that are handled as taggers. */
+        const val taggerFoundries = "marmot"
+
+        /** Name(s) of the foundries that are handled as parsers. */
+        const val parserFoundries = "malt"
+
+        /**
+         * Creates the bridge registered for [foundry], loading [model].
+         *
+         * @return a [MarmotBridge] for "marmot", a [MaltParserBridge] for "malt"; for any
+         *   other name a severe log entry is written and `null` is returned
+         */
+        fun getAnnotationToolBridge(foundry: String, model: String, LOGGER: Logger): AnnotationToolBridge? =
+            when (foundry) {
+                "marmot" -> MarmotBridge(model, LOGGER)
+                "malt" -> MaltParserBridge(model, LOGGER)
+                else -> {
+                    LOGGER.severe("Unknown tagger $foundry")
+                    null
+                }
+            }
+    }
+}
+
+/**
+ * [ParserToolBridge] backed by a [MaltParserService].
+ *
+ * @property model model name, optionally preceded by a directory ("dir/name"); the part
+ *   before the last "/" is passed as `-w`, the part after it as `-c`
+ * @property logger receives progress messages while the model is loaded
+ */
+class MaltParserBridge(override val model: String, override val logger: Logger) : ParserToolBridge() {
+    companion object {
+        /** Foundry name under which this bridge is registered. */
+        fun getFoundry(): String = "malt"
+    }
+
+    val tagger: MaltParserService
+
+    init {
+        logger.info("Initializing MaltParser with model $model")
+        tagger = MaltParserService()
+        val options = if ("/" in model) {
+            val dirName = model.substringBeforeLast("/")
+            val modelName = model.substringAfterLast("/")
+            logger.info("Loading model $modelName from $dirName")
+            "-w $dirName -c $modelName -m parse"
+        } else {
+            "-c $model -m parse"
+        }
+        tagger.initializeParserModel(options)
+        logger.info("Model $model loaded")
+    }
+
+    /**
+     * Parses one sentence and stores head and relation for every labelled edge.
+     *
+     * For each edge whose string form contains "DEPREL:", the entry of [morpho] that belongs
+     * to the edge's target token is replaced by a copy carrying the source index as head and
+     * the text after "DEPREL:" as relation; lemma, xpos and feats are taken over.
+     *
+     * @param sentenceTokens one row per token, as produced by [parseText]
+     * @param sentenceTokenOffsets "from-to" keys of the tokens, parallel to [sentenceTokens]
+     * @param morpho annotation map to update; nothing is stored when it is `null`
+     */
+    @Throws(MaltChainedException::class)
+    override fun tagSentence(
+        sentenceTokens: MutableList<String>,
+        sentenceTokenOffsets: MutableList<String>,
+        morpho: MutableMap<String, KorapXml2Conllu.MorphoSpan>?
+    ) {
+        val graph = tagger.parse(sentenceTokens.toTypedArray()) as DependencyStructure
+        for (edge in graph.edges) {
+            // NOTE(review): source is presumably the governing token and target the dependent - confirm.
+            val sourceIndex = edge.source.index
+            val targetIndex = edge.target.index
+            val edgeText = edge.toString()
+            if (edgeText.contains("DEPREL:")) {
+                val relation = edgeText.substringAfter("DEPREL:")
+                // Token indices are 1-based, the offsets list is 0-based.
+                val previous = morpho?.get(sentenceTokenOffsets[targetIndex - 1])
+                morpho?.set(
+                    sentenceTokenOffsets[targetIndex - 1], KorapXml2Conllu.MorphoSpan(
+                        lemma = previous?.lemma,
+                        xpos = previous?.xpos,
+                        feats = previous?.feats,
+                        head = sourceIndex.toString(),
+                        deprel = relation
+                    )
+                )
}
}
}
}
-class MarmotBridge(override val model: String, override val logger: Logger) : AnnotationToolBridge() {
-
+class MarmotBridge(override val model: String, override val logger: Logger) : TaggerToolBridge() {
val tagger: MorphTagger
init {
@@ -84,8 +187,7 @@
result = tagger.tag(sentence) // LOGGER.info("Marmot tagger finished")// return
for (i in 0 until result.size) {
val taggedWord = KorapXml2Conllu.MorphoSpan(
- xpos = result[i][0].split("|")[0],
- feats = result[i][1]
+ xpos = result[i][0].split("|")[0], feats = result[i][1]
)
morphoMap?.set(sentenceTokenOffsets[i], taggedWord)
}
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
index 5bf3fc5..a7ad96a 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
@@ -1,5 +1,7 @@
package de.ids_mannheim.korapxmltools
+import de.ids_mannheim.korapxmltools.AnnotationToolBridgeFactory.Companion.parserFoundries
+import de.ids_mannheim.korapxmltools.AnnotationToolBridgeFactory.Companion.taggerFoundries
import org.w3c.dom.Document
import org.w3c.dom.Element
import org.w3c.dom.NodeList
@@ -23,9 +25,9 @@
import java.util.regex.Pattern
import java.util.stream.IntStream
import java.util.zip.ZipFile
+import javax.swing.text.html.parser.Parser
import javax.xml.parsers.DocumentBuilder
import javax.xml.parsers.DocumentBuilderFactory
-import kotlin.math.max
import kotlin.math.min
import kotlin.system.exitProcess
@@ -134,16 +136,16 @@
@Option(
names = ["--tag-with", "-t"],
paramLabel = "TAGGER:MODEL",
- description = ["Specify a tagger and a model: marmot:<path/to/model>."]
+ description = ["Specify a tagger and a model: ${taggerFoundries}:<path/to/model>."]
)
fun setTagWith(tagWith: String) {
if (tagWith != null) {
- val pattern: Pattern = Pattern.compile("(marmot):(.+)")
+ val pattern: Pattern = Pattern.compile("(${taggerFoundries}):(.+)")
val matcher: Matcher = pattern.matcher(tagWith)
if (!matcher.matches()) {
throw ParameterException(spec.commandLine(),
String.format("Invalid value `%s' for option '--tag-with': "+
- "value does not match the expected pattern marmot:<path/to/model>", tagWith))
+ "value does not match the expected pattern ${taggerFoundries}:<path/to/model>", tagWith))
} else {
taggerName = matcher.group(1)
taggerModel = matcher.group(2)
@@ -156,6 +158,33 @@
}
}
+    private var parserName: String? = null
+    private var parserModel: String? = null
+
+    /**
+     * Handles `--parse-with <parser>:<path/to/model>`.
+     *
+     * Stores the parser name in [parserName] and the model path in [parserModel].
+     *
+     * @param parseWith option value as given on the command line
+     * @throws ParameterException if the value does not match `<parser>:<path/to/model>`
+     *   with one of the [parserFoundries], or if the model file does not exist
+     */
+    @Option(
+        names = ["--parse-with", "-P"],
+        paramLabel = "PARSER:MODEL",
+        description = ["Specify a parser and a model: ${parserFoundries}:<path/to/model>."]
+    )
+    fun setParseWith(parseWith: String) {
+        val matcher: Matcher = Pattern.compile("(${parserFoundries}):(.+)").matcher(parseWith)
+        if (!matcher.matches()) {
+            throw ParameterException(
+                spec.commandLine(),
+                "Invalid value `$parseWith' for option '--parse-with': " +
+                    "value does not match the expected pattern ${parserFoundries}:<path/to/model>"
+            )
+        }
+        // Both groups are present once the pattern has matched.
+        parserName = matcher.group(1)
+        val modelPath: String = matcher.group(2)
+        parserModel = modelPath
+        if (!File(modelPath).exists()) {
+            throw ParameterException(
+                spec.commandLine(),
+                "Invalid value for option '--parse-with': model file '$modelPath' does not exist"
+            )
+        }
+    }
+
override fun call(): Int {
val handler = ConsoleHandler()
@@ -191,9 +220,10 @@
val metadata: ConcurrentHashMap<String, Array<String>> = ConcurrentHashMap()
val extraFeatures: ConcurrentHashMap<String, MutableMap<String, String>> = ConcurrentHashMap()
var waitForMorpho: Boolean = false
- var annotationToolBridges: ConcurrentHashMap<Long, AnnotationToolBridge?> = ConcurrentHashMap()
+ var taggerToolBridges: ConcurrentHashMap<Long, TaggerToolBridge?> = ConcurrentHashMap()
+ var parserToolBridges: ConcurrentHashMap<Long, ParserToolBridge?> = ConcurrentHashMap()
fun korapxml2conllu(args: Array<String>) {
- val executor: ExecutorService = Executors.newFixedThreadPool(maxThreads)
+ Executors.newFixedThreadPool(maxThreads)
if (annotateWith.isNotEmpty()) {
annotationWorkerPool = AnnotationWorkerPool(annotateWith, maxThreads, LOGGER)
@@ -274,9 +304,13 @@
fun processZipEntry(zipFile: ZipFile, foundry: String, zipEntry: java.util.zip.ZipEntry) {
LOGGER.info("Processing ${zipEntry.name} in thread ${Thread.currentThread().id}")
- if (taggerName != null && !annotationToolBridges.containsKey(Thread.currentThread().id)) {
- annotationToolBridges[Thread.currentThread().id] =
- AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER)
+ if (taggerName != null && !taggerToolBridges.containsKey(Thread.currentThread().id)) {
+ taggerToolBridges[Thread.currentThread().id] =
+ AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER) as TaggerToolBridge?
+ }
+ if (parserName != null && !parserToolBridges.containsKey(Thread.currentThread().id)) {
+ parserToolBridges[Thread.currentThread().id] =
+ AnnotationToolBridgeFactory.getAnnotationToolBridge(parserName!!, parserModel!!, LOGGER) as ParserToolBridge?
}
try {
@@ -405,8 +439,11 @@
output.append(metadata[docId]?.joinToString("\t", prefix = "# metadata=", postfix = "\n") ?: "")
}
var previousSpanStart = 0
- if (annotationToolBridges[Thread.currentThread().id] != null) {
- morpho[docId] = annotationToolBridges[Thread.currentThread().id]!!.tagText(tokens[docId]!!, sentences[docId], texts[docId]!!)
+ if (taggerToolBridges[Thread.currentThread().id] != null) {
+ morpho[docId] = taggerToolBridges[Thread.currentThread().id]!!.tagText(tokens[docId]!!, sentences[docId], texts[docId]!!)
+ if (parserToolBridges[Thread.currentThread().id] != null) {
+ morpho[docId] = parserToolBridges[Thread.currentThread().id]!!.parseText(tokens[docId]!!, morpho[docId], sentences[docId], texts[docId]!!)
+ }
}
tokens[docId]?.forEach { span ->
token_index++