Add KorAP XML zip output for dependency parses
Change-Id: I6ff101c8145c25cc318a82c6ad8dc311825a7ad1
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
index 43a67f0..3e6738c 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
@@ -9,7 +9,6 @@
import org.xml.sax.SAXParseException
import picocli.CommandLine
import picocli.CommandLine.*
-import java.io.ByteArrayOutputStream
import java.io.File
import java.io.FileOutputStream
import java.io.InputStream
@@ -335,6 +334,7 @@
private fun processZipFile(zipFilePath: String, foundry: String = "base") {
LOGGER.info("Processing ${zipFilePath} in thread ${Thread.currentThread().id}")
+ LOGGER.info("Foundry: $foundry $dbFactory")
if (outputFormat == OutputFormat.KORAPXML && dbFactory == null) {
var targetFoundry = "base"
if (taggerName != null) {
@@ -343,13 +343,15 @@
targetFoundry = tagger.foundry
}
} else {
- LOGGER.severe("KorAP-XML output currently only supports morphosyntactic annotations. Use CoNLL-U (default) output format instead, and pipe through conllu2korapxml.")
- exitProcess(1)
+ targetFoundry = parserName!!
}
dbFactory = DocumentBuilderFactory.newInstance()
dBuilder = dbFactory!!.newDocumentBuilder()
val outputMorphoZipFileName =
- zipFilePath.replace(Regex("\\.zip$"), ".".plus(targetFoundry).plus(".zip"))
+ if (parserName != null)
+ zipFilePath.replace(Regex("(\\.(opennlp|marmot|tree_tagger|corenlp|spacy))?\\.zip$"), ".".plus(parserName).plus(".zip"))
+ else
+ zipFilePath.replace(Regex("\\.zip$"), ".".plus(targetFoundry).plus(".zip"))
if (File(outputMorphoZipFileName).exists() && !overwrite) {
LOGGER.severe("Output file $outputMorphoZipFileName already exists. Use --overwrite to overwrite.")
exitProcess(1)
@@ -420,6 +422,7 @@
parserToolBridges[Thread.currentThread().id] = parser
if (parser != null) {
foundry = "$foundry dependency:${parser.foundry}"
+ LOGGER.fine("Initialized parser ${parserName} with foundry $foundry in thread ${Thread.currentThread().id}")
}
}
@@ -480,6 +483,7 @@
&& (!waitForMorpho || morpho[docId] != null)
&& (extractMetadataRegex.isEmpty() || metadata[docId] != null)
) {
+ LOGGER.info("Processing text: $docId in thread ${Thread.currentThread().id}")
processText(docId, foundry)
}
} else if (extractMetadataRegex.isNotEmpty() && zipEntry.name.matches(Regex(".*/header\\.xml$"))) {
@@ -526,14 +530,21 @@
sentences[docId],
texts[docId]!!
)
- if (parserToolBridges[Thread.currentThread().id] != null) {
- morpho[docId] = parserToolBridges[Thread.currentThread().id]!!.parseText(
- tokens[docId]!!,
- morpho[docId],
- sentences[docId],
- texts[docId]!!
- )
+
+ }
+ if (parserToolBridges[Thread.currentThread().id] != null) {
+ if (morpho[docId] == null) {
+ LOGGER.severe("No morpho data for $docId")
+ //exitProcess(1)
}
+ LOGGER.finer("Parsing text: $docId in thread ${Thread.currentThread().id}")
+ morpho[docId] = parserToolBridges[Thread.currentThread().id]!!.parseText(
+ tokens[docId]!!,
+ morpho[docId],
+ sentences[docId],
+ texts[docId]!!
+ )
+ LOGGER.finer("Parsed text: $docId in thread ${Thread.currentThread().id}")
}
if (outputFormat == OutputFormat.KORAPXML && annotationWorkerPool == null) {
korapXmlOutput(getMorphoFoundry(), docId)
@@ -558,7 +569,9 @@
}
if (outputFormat == OutputFormat.KORAPXML) {
- val entryPath = docId.replace(Regex("[_.]"), "/").plus("/$morphoFoundry/").plus("morpho.xml")
+ val entryPath = if (parserName != null) docId.replace(Regex("[_.]"), "/").plus("/$parserName/").plus("dependency.xml")
+ else
+ docId.replace(Regex("[_.]"), "/").plus("/$morphoFoundry/").plus("morpho.xml")
val zipEntry = ZipEntry(entryPath)
// val zipEntry = org.apache.tools.zip.ZipEntry(entryPath)
// zipEntry.unixMode = 65535
@@ -573,7 +586,7 @@
private fun getMorphoFoundry() = taggerToolBridges[Thread.currentThread().id]?.foundry ?: "base"
- private fun korapXmlOutput(foundry: String, docId: String): StringBuilder {
+ private fun korapXmlDependencyOutput(foundry: String, docId: String): StringBuilder {
val doc: Document = dBuilder!!.newDocument()
// Root element
@@ -587,6 +600,86 @@
layer.appendChild(spanList)
var i = 0
+ var s = 0
+ var n = 0
+ val sortedKeys = morpho[docId]?.keys?.sortedBy { it.split("-")[0].toInt() }
+
+ sortedKeys?.forEach { spanString ->
+ val mfs = morpho[docId]?.get(spanString)
+ val offsets = spanString.split("-")
+ if (offsets[0].toInt() > sentences[docId]!!.elementAt(s).to) {
+ s++
+ n = i
+ }
+ i++
+ if (mfs!!.deprel == "_") {
+ return@forEach
+ }
+
+ val spanNode = doc.createElement("span")
+ spanNode.setAttribute("id", "s${s + 1}_n${i - n}")
+ spanNode.setAttribute("from", offsets[0])
+ spanNode.setAttribute("to", offsets[1])
+
+ // rel element
+ val rel = doc.createElement("rel")
+ rel.setAttribute("label", mfs.deprel)
+
+ // inner span element
+ val innerSpan = doc.createElement("span")
+ val headInt = if(mfs.head == "_") 0 else parseInt(mfs.head) - 1
+ if (headInt < 0) {
+ innerSpan.setAttribute("from", sentences[docId]!!.elementAt(s).from.toString())
+ innerSpan.setAttribute("to", sentences[docId]!!.elementAt(s).to.toString())
+ } else {
+ if (headInt + n >= morpho[docId]!!.size) {
+ LOGGER.warning("Head index out of bounds: ${headInt+n} >= ${morpho[docId]!!.size} in $docId")
+ return@forEach
+ } else {
+ val destSpanString = sortedKeys.elementAt(headInt + n)
+ val destOffsets = destSpanString.split("-")
+ innerSpan.setAttribute("from", destOffsets[0])
+ innerSpan.setAttribute("to", destOffsets[1])
+ }
+ }
+ rel.appendChild(innerSpan)
+ spanNode.appendChild(rel)
+ spanList.appendChild(spanNode)
+ }
+ val transformerFactory = TransformerFactory.newInstance()
+ val transformer = transformerFactory.newTransformer()
+ transformer.setOutputProperty(OutputKeys.INDENT, "yes")
+ transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no")
+ transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "1")
+ val domSource = DOMSource(doc)
+ val streamResult = StreamResult(StringWriter())
+ transformer.transform(domSource, streamResult)
+
+ return StringBuilder(streamResult.writer.toString())
+ }
+
+ private fun korapXmlOutput(foundry: String, docId: String): StringBuilder {
+ return if (parserName != null) {
+ korapXmlDependencyOutput(foundry, docId)
+ } else {
+ korapXmlMorphoOutput(foundry, docId)
+ }
+ }
+
+ private fun korapXmlMorphoOutput(foundry: String, docId: String): StringBuilder {
+ val doc: Document = dBuilder!!.newDocument()
+
+ // Root element
+ val layer = doc.createElement("layer")
+ layer.setAttribute("xmlns", "http://ids-mannheim.de/ns/KorAP")
+ layer.setAttribute("version", "KorAP-0.4")
+ layer.setAttribute("docid", docId)
+ doc.appendChild(layer)
+
+ val spanList = doc.createElement("spanList")
+ layer.appendChild(spanList)
+
+ var i = 0
morpho[docId]?.forEach { (spanString, mfs) ->
i++
val offsets = spanString.split("-")