Extract annotation classes
Change-Id: I7e3d140fa0942e084c3da5be210ef3e44b74e798
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
index cfa19e7..4e81986 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationToolBridge.kt
@@ -1,15 +1,9 @@
package de.ids_mannheim.korapxmltools
-import marmot.morph.MorphTagger
-import marmot.morph.Sentence
-import marmot.morph.Word
-import marmot.util.FileUtils
-import org.maltparser.MaltParserService
-import org.maltparser.core.exception.MaltChainedException
-import org.maltparser.core.syntaxgraph.DependencyStructure
import java.util.logging.Logger
interface AnnotationToolBridge {
+ val foundry: String
val model: String
val logger: Logger
@@ -21,85 +15,6 @@
)
}
-abstract class TaggerToolBridge : AnnotationToolBridge {
-
- fun tagText(
- tokens: Array<KorapXml2Conllu.Span>, sentenceSpans: Array<KorapXml2Conllu.Span>?, text: String
- ): MutableMap<String, KorapXml2Conllu.MorphoSpan> {
- val sentence_tokens = mutableListOf<String>()
- val sentence_token_offsets = mutableListOf<String>()
- val morphoMap = mutableMapOf<String, KorapXml2Conllu.MorphoSpan>()
- var token_index = 0
- var sentence_index = 0
- tokens.forEach { span ->
- if (span.from >= (sentenceSpans?.get(sentence_index)?.to ?: 11111110)) {
- tagSentence(sentence_tokens, sentence_token_offsets, morphoMap)
- sentence_tokens.clear()
- sentence_token_offsets.clear()
- sentence_index++
- token_index = 1
-
- }
- sentence_tokens.add(text.substring(span.from, span.to))
- sentence_token_offsets.add("${span.from}-${span.to}")
- token_index++
- }
- if (sentence_tokens.size > 0) {
- try {
- tagSentence(sentence_tokens, sentence_token_offsets, morphoMap)
- } catch (e: ArrayIndexOutOfBoundsException) {
- logger.warning("Tagging failed: ${e.message} ${e.stackTrace} ${sentence_tokens.joinToString { " " }}")
- }
- }
- return morphoMap
- }
-}
-
-abstract class ParserToolBridge : AnnotationToolBridge {
- fun parseText(
- tokens: Array<KorapXml2Conllu.Span>,
- morpho: MutableMap<String, KorapXml2Conllu.MorphoSpan>?,
- sentenceSpans: Array<KorapXml2Conllu.Span>?,
- text: String
- ): MutableMap<String, KorapXml2Conllu.MorphoSpan> {
- val sentence_tokens = mutableListOf<String>()
- val sentence_token_offsets = mutableListOf<String>()
- var token_index = 1
- var sentence_index = 0
- tokens.forEach { span ->
- if (span.from >= (sentenceSpans?.get(sentence_index)?.to ?: 11111110)) {
- tagSentence(sentence_tokens, sentence_token_offsets, morpho)
- sentence_tokens.clear()
- sentence_token_offsets.clear()
- sentence_index++
- token_index = 1
-
- }
- sentence_tokens.add(
- "$token_index\t${
- text.substring(
- span.from, span.to
- )
- }\t_\t${morpho?.get("${span.from}-${span.to}")?.xpos ?: "_"}\t${morpho?.get("${span.from}-${span.to}")?.xpos ?: "_"}\t${
- morpho?.get(
- "${span.from}-${span.to}"
- )?.feats ?: "_"
- }\t_\t_\t_\t_"
- )
- sentence_token_offsets.add("${span.from}-${span.to}")
- token_index++
- }
- if (sentence_tokens.size > 0) {
- try {
- tagSentence(sentence_tokens, sentence_token_offsets, morpho)
- } catch (e: ArrayIndexOutOfBoundsException) {
- logger.warning("Tagging failed: ${e.message} ${e.stackTrace} ${sentence_tokens.joinToString { " " }}")
- }
- }
- return morpho!!
- }
-}
-
class AnnotationToolBridgeFactory {
companion object {
@@ -117,80 +32,3 @@
}
}
-class MaltParserBridge(override val model: String, override val logger: Logger) : ParserToolBridge() {
- companion object {
- fun getFoundry(): String {
- return "malt"
- }
- }
-
- val tagger: MaltParserService
-
- init {
- logger.info("Initializing MaltParser with model $model")
- tagger = MaltParserService()
- if (model.contains("/")) {
- val dirName = model.substringBeforeLast("/")
- val modelName = model.substringAfterLast("/")
- logger.info("Loading model $modelName from $dirName")
- tagger.initializeParserModel("-w $dirName -c $modelName -m parse")
- } else {
- tagger.initializeParserModel("-c $model -m parse")
- }
- logger.info("Model $model loaded")
- }
-
-
- @Throws(MaltChainedException::class)
- override fun tagSentence(
- sentenceTokens: MutableList<String>,
- sentenceTokenOffsets: MutableList<String>,
- morpho: MutableMap<String, KorapXml2Conllu.MorphoSpan>?
- ) {
- val result = tagger.parse(sentenceTokens.toTypedArray())
-
- (result as DependencyStructure).edges.forEach { edge ->
- val from = edge.source.index
- val head = edge.target.index
- val label = edge.toString()
- if (label.contains("DEPREL:")) {
- val rel = edge.toString().substringAfter("DEPREL:")
- val old = morpho?.get(sentenceTokenOffsets[head - 1])
- morpho?.set(
- sentenceTokenOffsets[head - 1], KorapXml2Conllu.MorphoSpan(
- lemma = old?.lemma, xpos = old?.xpos, feats = old?.feats, head = from.toString(), deprel = rel
- )
- )
- }
- }
- }
-}
-
-class MarmotBridge(override val model: String, override val logger: Logger) : TaggerToolBridge() {
- val tagger: MorphTagger
-
- init {
- logger.info("Initializing MarMoT with model $model")
- tagger = FileUtils.loadFromFile(model)
- //tagger.setMaxLevel(100)
- logger.info("Model $model loaded")
- }
-
- @Throws(java.lang.ArrayIndexOutOfBoundsException::class, java.lang.Exception::class)
- override fun tagSentence(
- sentenceTokens: MutableList<String>,
- sentenceTokenOffsets: MutableList<String>,
- morphoMap: MutableMap<String, KorapXml2Conllu.MorphoSpan>?
- ) {
- val sentence = Sentence(sentenceTokens.map { Word(it) })
- var result: List<List<String>>
- result = tagger.tag(sentence) // LOGGER.info("Marmot tagger finished")// return
- for (i in 0 until result.size) {
- val taggedWord = KorapXml2Conllu.MorphoSpan(
- xpos = result[i][0].split("|")[0], feats = result[i][1]
- )
- morphoMap?.set(sentenceTokenOffsets[i], taggedWord)
- }
- }
-
-}
\ No newline at end of file
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
index a7ad96a..022923a 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
@@ -15,7 +15,6 @@
import java.util.*
import java.util.concurrent.Callable
import java.util.concurrent.ConcurrentHashMap
-import java.util.concurrent.ExecutorService
import java.util.concurrent.Executors
import java.util.logging.ConsoleHandler
import java.util.logging.Level
@@ -25,7 +24,6 @@
import java.util.regex.Pattern
import java.util.stream.IntStream
import java.util.zip.ZipFile
-import javax.swing.text.html.parser.Parser
import javax.xml.parsers.DocumentBuilder
import javax.xml.parsers.DocumentBuilderFactory
import kotlin.math.min
@@ -139,21 +137,19 @@
description = ["Specify a tagger and a model: ${taggerFoundries}:<path/to/model>."]
)
fun setTagWith(tagWith: String) {
- if (tagWith != null) {
- val pattern: Pattern = Pattern.compile("(${taggerFoundries}):(.+)")
- val matcher: Matcher = pattern.matcher(tagWith)
- if (!matcher.matches()) {
+ val pattern: Pattern = Pattern.compile("(${taggerFoundries}):(.+)")
+ val matcher: Matcher = pattern.matcher(tagWith)
+ if (!matcher.matches()) {
+ throw ParameterException(spec.commandLine(),
+ String.format("Invalid value `%s' for option '--tag-with': "+
+ "value does not match the expected pattern ${taggerFoundries}:<path/to/model>", tagWith))
+ } else {
+ taggerName = matcher.group(1)
+ taggerModel = matcher.group(2)
+ if (!File(taggerModel).exists()) {
throw ParameterException(spec.commandLine(),
- String.format("Invalid value `%s' for option '--tag-with': "+
- "value does not match the expected pattern ${taggerFoundries}:<path/to/model>", tagWith))
- } else {
- taggerName = matcher.group(1)
- taggerModel = matcher.group(2)
- if (!File(taggerModel).exists()) {
- throw ParameterException(spec.commandLine(),
- String.format("Invalid value for option '--tag-with':"+
- "model file '%s' does not exist", taggerModel, taggerModel))
- }
+ String.format("Invalid value for option '--tag-with':"+
+ "model file '%s' does not exist", taggerModel, taggerModel))
}
}
}
@@ -166,21 +162,19 @@
description = ["Specify a parser and a model: ${parserFoundries}:<path/to/model>."]
)
fun setParseWith(parseWith: String) {
- if (parseWith != null) {
- val pattern: Pattern = Pattern.compile("(${parserFoundries}):(.+)")
- val matcher: Matcher = pattern.matcher(parseWith)
- if (!matcher.matches()) {
+ val pattern: Pattern = Pattern.compile("(${parserFoundries}):(.+)")
+ val matcher: Matcher = pattern.matcher(parseWith)
+ if (!matcher.matches()) {
+ throw ParameterException(spec.commandLine(),
+ String.format("Invalid value `%s' for option '--parse-with': "+
+ "value does not match the expected pattern (${parserFoundries}):<path/to/model>", parseWith))
+ } else {
+ parserName = matcher.group(1)
+ parserModel = matcher.group(2)
+ if (!File(parserModel).exists()) {
throw ParameterException(spec.commandLine(),
- String.format("Invalid value `%s' for option '--parse-with': "+
- "value does not match the expected pattern (${parserFoundries}):<path/to/model>", parseWith))
- } else {
- parserName = matcher.group(1)
- parserModel = matcher.group(2)
- if (!File(parserModel).exists()) {
- throw ParameterException(spec.commandLine(),
- String.format("Invalid value for option '--parse-with':"+
- "model file '%s' does not exist", parserModel, parserModel))
- }
+ String.format("Invalid value for option '--parse-with':"+
+ "model file '%s' does not exist", parserModel, parserModel))
}
}
}
@@ -302,15 +296,23 @@
}
}
- fun processZipEntry(zipFile: ZipFile, foundry: String, zipEntry: java.util.zip.ZipEntry) {
+ fun processZipEntry(zipFile: ZipFile, _foundry: String, zipEntry: java.util.zip.ZipEntry) {
+ var foundry = _foundry
LOGGER.info("Processing ${zipEntry.name} in thread ${Thread.currentThread().id}")
if (taggerName != null && !taggerToolBridges.containsKey(Thread.currentThread().id)) {
- taggerToolBridges[Thread.currentThread().id] =
- AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER) as TaggerToolBridge?
+ val tagger = AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER) as TaggerToolBridge?
+ taggerToolBridges[Thread.currentThread().id] = tagger
+ if (tagger != null) {
+ foundry = tagger.foundry
+ }
+
}
if (parserName != null && !parserToolBridges.containsKey(Thread.currentThread().id)) {
- parserToolBridges[Thread.currentThread().id] =
- AnnotationToolBridgeFactory.getAnnotationToolBridge(parserName!!, parserModel!!, LOGGER) as ParserToolBridge?
+ val parser = AnnotationToolBridgeFactory.getAnnotationToolBridge(parserName!!, parserModel!!, LOGGER) as ParserToolBridge?
+ parserToolBridges[Thread.currentThread().id] = parser
+ if (parser != null) {
+ foundry = "$foundry dependency:${parser.foundry}"
+ }
}
try {
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/MaltParserBridge.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/MaltParserBridge.kt
new file mode 100644
index 0000000..da8fb05
--- /dev/null
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/MaltParserBridge.kt
@@ -0,0 +1,73 @@
+package de.ids_mannheim.korapxmltools
+
+import org.maltparser.MaltParserService
+import org.maltparser.core.exception.MaltChainedException
+import org.maltparser.core.syntaxgraph.DependencyStructure
+import java.util.logging.Logger
+
+/**
+ * [ParserToolBridge] implementation backed by a [MaltParserService] instance.
+ *
+ * @property model the model reference handed to MaltParser; when it contains a `/`, the part before the
+ *   last `/` is passed as `-w` option and the remainder as `-c` option, otherwise it is passed as `-c` only.
+ * @property logger receives progress messages while the model is being loaded.
+ */
+class MaltParserBridge(override val model: String, override val logger: Logger) : ParserToolBridge() {
+    override val foundry = "malt"
+
+    val tagger: MaltParserService
+
+    init {
+        logger.info("Initializing MaltParser with model $model")
+        // Service creation and model loading run under a lock on the MaltParserService class object.
+        synchronized(MaltParserService::class.java) {
+            tagger = MaltParserService()
+            val options = if ('/' in model) {
+                val workingDir = model.substringBeforeLast("/")
+                val configName = model.substringAfterLast("/")
+                logger.info("Loading model $configName from $workingDir")
+                "-w $workingDir -c $configName -m parse"
+            } else {
+                "-c $model -m parse"
+            }
+            tagger.initializeParserModel(options)
+            logger.info("Model $model loaded")
+        }
+    }
+
+    /**
+     * Parses one sentence and stores head and relation label for each labelled edge in [morpho].
+     *
+     * @param sentenceTokens the token lines of one sentence, passed to the parser as an array.
+     * @param sentenceTokenOffsets the [morpho] key of each token, in the same order as [sentenceTokens].
+     * @param morpho annotation map that is updated in place; when `null`, nothing is stored.
+     */
+    @Throws(MaltChainedException::class)
+    override fun tagSentence(
+        sentenceTokens: MutableList<String>,
+        sentenceTokenOffsets: MutableList<String>,
+        morpho: MutableMap<String, KorapXml2Conllu.MorphoSpan>?
+    ) {
+        val graph = tagger.parse(sentenceTokens.toTypedArray()) as DependencyStructure
+        for (edge in graph.edges) {
+            val sourceIndex = edge.source.index
+            val targetIndex = edge.target.index
+            val description = edge.toString()
+            // Only edges whose textual form carries a DEPREL label are recorded.
+            if (!description.contains("DEPREL:")) continue
+            val relation = description.substringAfter("DEPREL:")
+            morpho?.let { annotations ->
+                // The target index is shifted by one to address the offsets list
+                // (presumably index 0 is the artificial root node; confirm against MaltParser).
+                val key = sentenceTokenOffsets[targetIndex - 1]
+                val previous = annotations[key]
+                annotations[key] = KorapXml2Conllu.MorphoSpan(
+                    lemma = previous?.lemma,
+                    xpos = previous?.xpos,
+                    feats = previous?.feats,
+                    head = sourceIndex.toString(),
+                    deprel = relation
+                )
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/MarmotBridge.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/MarmotBridge.kt
new file mode 100644
index 0000000..b9caa5e
--- /dev/null
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/MarmotBridge.kt
@@ -0,0 +1,50 @@
+package de.ids_mannheim.korapxmltools
+
+import marmot.morph.MorphTagger
+import marmot.morph.Sentence
+import marmot.morph.Word
+import marmot.util.FileUtils
+import java.util.logging.Logger
+
+/**
+ * [TaggerToolBridge] implementation that tags sentences with a MarMoT [MorphTagger].
+ *
+ * @property model file from which the tagger is loaded when the bridge is constructed.
+ * @property logger receives progress messages while the model is being loaded.
+ */
+class MarmotBridge(override val model: String, override val logger: Logger) : TaggerToolBridge() {
+    override val foundry = "marmot"
+    val tagger: MorphTagger
+
+    init {
+        logger.info("Initializing MarMoT with model $model")
+        tagger = FileUtils.loadFromFile(model)
+        // Disabled call kept for reference: tagger.setMaxLevel(100)
+        logger.info("Model $model loaded")
+    }
+
+    /**
+     * Tags one sentence and writes a [KorapXml2Conllu.MorphoSpan] per token into [morphoMap].
+     *
+     * @param sentenceTokens surface forms of the tokens of one sentence.
+     * @param sentenceTokenOffsets the [morphoMap] key of each token, in the same order as [sentenceTokens].
+     * @param morphoMap annotation map that is updated in place; when `null`, the results are discarded.
+     */
+    @Throws(java.lang.ArrayIndexOutOfBoundsException::class, java.lang.Exception::class)
+    override fun tagSentence(
+        sentenceTokens: MutableList<String>,
+        sentenceTokenOffsets: MutableList<String>,
+        morphoMap: MutableMap<String, KorapXml2Conllu.MorphoSpan>?
+    ) {
+        val words = sentenceTokens.map { token -> Word(token) }
+        val analyses: List<List<String>> = tagger.tag(Sentence(words))
+        analyses.forEachIndexed { position, analysis ->
+            // Only the part of the first element before the first '|' is kept as xpos.
+            val annotation = KorapXml2Conllu.MorphoSpan(
+                xpos = analysis[0].substringBefore("|"),
+                feats = analysis[1]
+            )
+            morphoMap?.set(sentenceTokenOffsets[position], annotation)
+        }
+    }
+}
\ No newline at end of file
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/ParserToolBridge.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/ParserToolBridge.kt
new file mode 100644
index 0000000..8b1fa8f
--- /dev/null
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/ParserToolBridge.kt
@@ -0,0 +1,65 @@
+package de.ids_mannheim.korapxmltools
+
+/**
+ * Base class for parser bridges: splits a token stream into sentences, renders each
+ * token as a ten-column, tab-separated line and hands every sentence to [tagSentence].
+ */
+abstract class ParserToolBridge : AnnotationToolBridge {
+    /**
+     * Parses [text] sentence by sentence and returns the annotation map handed to [tagSentence].
+     *
+     * @param tokens token spans; expected in document order (assumed, not checked).
+     * @param morpho existing annotations keyed by `"from-to"`; passed on for in-place updates.
+     *   When `null`, a new map is created instead of failing on return.
+     * @param sentenceSpans sentence spans; when `null` or exhausted, the remaining tokens are
+     *   treated as a single sentence.
+     * @param text the text the spans refer to.
+     * @return [morpho] itself when it was not `null`, otherwise the newly created map.
+     */
+    fun parseText(
+        tokens: Array<KorapXml2Conllu.Span>,
+        morpho: MutableMap<String, KorapXml2Conllu.MorphoSpan>?,
+        sentenceSpans: Array<KorapXml2Conllu.Span>?,
+        text: String
+    ): MutableMap<String, KorapXml2Conllu.MorphoSpan> {
+        // Fall back to a fresh map so the function can always honour its non-null return type.
+        val annotations = morpho ?: mutableMapOf()
+        val sentenceTokens = mutableListOf<String>()
+        val sentenceTokenOffsets = mutableListOf<String>()
+        var tokenIndex = 1
+        var sentenceIndex = 0
+        tokens.forEach { span ->
+            // getOrNull: tokens located after the last sentence span must not raise an index error.
+            val sentenceEnd = sentenceSpans?.getOrNull(sentenceIndex)?.to ?: Int.MAX_VALUE
+            if (span.from >= sentenceEnd) {
+                // Never hand an empty sentence to the parser.
+                if (sentenceTokens.isNotEmpty()) {
+                    tagSentence(sentenceTokens, sentenceTokenOffsets, annotations)
+                }
+                sentenceTokens.clear()
+                sentenceTokenOffsets.clear()
+                sentenceIndex++
+                tokenIndex = 1
+            }
+            val offsets = "${span.from}-${span.to}"
+            val known = annotations[offsets]
+            val form = text.substring(span.from, span.to)
+            val xpos = known?.xpos ?: "_"
+            val feats = known?.feats ?: "_"
+            // NOTE(review): xpos fills both the fourth and the fifth column; confirm this is intended.
+            sentenceTokens.add("$tokenIndex\t$form\t_\t$xpos\t$xpos\t$feats\t_\t_\t_\t_")
+            sentenceTokenOffsets.add(offsets)
+            tokenIndex++
+        }
+        if (sentenceTokens.isNotEmpty()) {
+            try {
+                tagSentence(sentenceTokens, sentenceTokenOffsets, annotations)
+            } catch (e: ArrayIndexOutOfBoundsException) {
+                logger.warning(
+                    "Tagging failed: ${e.message} ${e.stackTrace.joinToString("\n")} ${sentenceTokens.joinToString(" ")}"
+                )
+            }
+        }
+        return annotations
+    }
+}
\ No newline at end of file
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/TaggerToolBridge.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/TaggerToolBridge.kt
new file mode 100644
index 0000000..03e04e8
--- /dev/null
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/TaggerToolBridge.kt
@@ -0,0 +1,51 @@
+package de.ids_mannheim.korapxmltools
+
+/**
+ * Base class for tagger bridges: splits a token stream into sentences and hands the surface
+ * forms of every sentence to [tagSentence].
+ */
+abstract class TaggerToolBridge : AnnotationToolBridge {
+    /**
+     * Tags [text] sentence by sentence.
+     *
+     * @param tokens token spans; expected in document order (assumed, not checked).
+     * @param sentenceSpans sentence spans; when `null` or exhausted, the remaining tokens are
+     *   treated as a single sentence.
+     * @param text the text the spans refer to.
+     * @return the newly created map that was handed to [tagSentence] together with the
+     *   `"from-to"` offset string of every token.
+     */
+    fun tagText(
+        tokens: Array<KorapXml2Conllu.Span>, sentenceSpans: Array<KorapXml2Conllu.Span>?, text: String
+    ): MutableMap<String, KorapXml2Conllu.MorphoSpan> {
+        val sentenceTokens = mutableListOf<String>()
+        val sentenceTokenOffsets = mutableListOf<String>()
+        val morphoMap = mutableMapOf<String, KorapXml2Conllu.MorphoSpan>()
+        var sentenceIndex = 0
+        tokens.forEach { span ->
+            // getOrNull: tokens located after the last sentence span must not raise an index error.
+            val sentenceEnd = sentenceSpans?.getOrNull(sentenceIndex)?.to ?: Int.MAX_VALUE
+            if (span.from >= sentenceEnd) {
+                // Never hand an empty sentence to the tagger.
+                if (sentenceTokens.isNotEmpty()) {
+                    tagSentence(sentenceTokens, sentenceTokenOffsets, morphoMap)
+                }
+                sentenceTokens.clear()
+                sentenceTokenOffsets.clear()
+                sentenceIndex++
+            }
+            sentenceTokens.add(text.substring(span.from, span.to))
+            sentenceTokenOffsets.add("${span.from}-${span.to}")
+        }
+        if (sentenceTokens.isNotEmpty()) {
+            try {
+                tagSentence(sentenceTokens, sentenceTokenOffsets, morphoMap)
+            } catch (e: ArrayIndexOutOfBoundsException) {
+                logger.warning(
+                    "Tagging failed: ${e.message} ${e.stackTrace.joinToString("\n")} ${sentenceTokens.joinToString(" ")}"
+                )
+            }
+        }
+        return morphoMap
+    }
+}
\ No newline at end of file