Extract KorAP XML output to own module
Change-Id: I394cbd1b707b2760058f9b2eba91e327ba7b707a
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index ec770fa..a7edc9a 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -2,6 +2,7 @@
import de.ids_mannheim.korapxmltools.AnnotationToolBridgeFactory.Companion.parserFoundries
import de.ids_mannheim.korapxmltools.AnnotationToolBridgeFactory.Companion.taggerFoundries
+import de.ids_mannheim.korapxmltools.formatters.KorapXmlFormatter
import org.apache.commons.compress.archivers.tar.TarArchiveEntry
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream
import org.apache.commons.compress.archivers.zip.Zip64Mode
@@ -1944,7 +1945,7 @@
LOGGER.finer("Constituency parsed text: $docId, generated ${trees.size} trees in thread ${Thread.currentThread().threadId()}")
}
if (outputFormat == OutputFormat.KORAPXML && annotationWorkerPool == null) {
- korapXmlOutput(getMorphoFoundry(), docId)
+ formatKorapXmlOutput(getMorphoFoundry(), docId)
} else {
formatConlluOutput(foundry, docId)
}
@@ -1985,7 +1986,26 @@
var wroteOne = false
// Always write morpho.xml if we have morpho annotations (tagger or from input)
if (morpho[docId] != null && morpho[docId]!!.isNotEmpty()) {
- val morphoXml = korapXmlMorphoOutput(morphoDir, docId).toString()
+ val context = de.ids_mannheim.korapxmltools.formatters.OutputContext(
+ docId = docId,
+ foundry = morphoDir,
+ tokens = tokens[docId],
+ sentences = sentences[docId],
+ text = texts[docId],
+ morpho = morpho[docId],
+ metadata = metadata[docId],
+ extraFeatures = extraFeatures[docId],
+ fileName = fnames[docId],
+ useLemma = useLemma,
+ extractMetadataRegex = extractMetadataRegex,
+ extractAttributesRegex = extractAttributesRegex,
+ columns = columns,
+ constituencyTrees = constituencyTrees[docId],
+ includeOffsetsInMisc = false,
+ compatibilityMode = COMPATIBILITY_MODE,
+ tokenSeparator = tokenSeparator
+ )
+ val morphoXml = KorapXmlFormatter.formatMorpho(context, dBuilder!!).toString()
val morphoPath = docId.replace(Regex("[_.]"), "/") + "/$morphoDir/morpho.xml"
val morphoEntry = ZipArchiveEntry(morphoPath)
morphoEntry.unixMode = ZIP_ENTRY_UNIX_MODE
@@ -1998,7 +2018,26 @@
}
// Write dependency.xml if a parser is active and dependency info present
if (parserToolBridges[Thread.currentThread().threadId()] != null) {
- val depXml = korapXmlDependencyOutput(depDir, docId).toString()
+ val context = de.ids_mannheim.korapxmltools.formatters.OutputContext(
+ docId = docId,
+ foundry = depDir,
+ tokens = tokens[docId],
+ sentences = sentences[docId],
+ text = texts[docId],
+ morpho = morpho[docId],
+ metadata = metadata[docId],
+ extraFeatures = extraFeatures[docId],
+ fileName = fnames[docId],
+ useLemma = useLemma,
+ extractMetadataRegex = extractMetadataRegex,
+ extractAttributesRegex = extractAttributesRegex,
+ columns = columns,
+ constituencyTrees = constituencyTrees[docId],
+ includeOffsetsInMisc = false,
+ compatibilityMode = COMPATIBILITY_MODE,
+ tokenSeparator = tokenSeparator
+ )
+ val depXml = KorapXmlFormatter.formatDependency(context, dBuilder!!).toString()
val depPath = docId.replace(Regex("[_.]"), "/") + "/$depDir/dependency.xml"
val depEntry = ZipArchiveEntry(depPath)
depEntry.unixMode = ZIP_ENTRY_UNIX_MODE
@@ -2012,7 +2051,26 @@
// Write constituency.xml if a constituency parser is active
if (constituencyParserBridges[Thread.currentThread().threadId()] != null && constituencyTrees[docId] != null) {
val constDir = constituencyParserBridges[Thread.currentThread().threadId()]!!.foundry
- val constXml = korapXmlConstituencyOutput(constDir, docId).toString()
+ val context = de.ids_mannheim.korapxmltools.formatters.OutputContext(
+ docId = docId,
+ foundry = constDir,
+ tokens = tokens[docId],
+ sentences = sentences[docId],
+ text = texts[docId],
+ morpho = morpho[docId],
+ metadata = metadata[docId],
+ extraFeatures = extraFeatures[docId],
+ fileName = fnames[docId],
+ useLemma = useLemma,
+ extractMetadataRegex = extractMetadataRegex,
+ extractAttributesRegex = extractAttributesRegex,
+ columns = columns,
+ constituencyTrees = constituencyTrees[docId],
+ includeOffsetsInMisc = false,
+ compatibilityMode = COMPATIBILITY_MODE,
+ tokenSeparator = tokenSeparator
+ )
+ val constXml = KorapXmlFormatter.formatConstituency(context, dBuilder!!).toString()
val constPath = docId.replace(Regex("[_.]"), "/") + "/$constDir/constituency.xml"
val constEntry = ZipArchiveEntry(constPath)
constEntry.unixMode = ZIP_ENTRY_UNIX_MODE
@@ -2086,244 +2144,6 @@
}
}
- private fun korapXmlDependencyOutput(foundry: String, docId: String): StringBuilder {
- val doc: Document = dBuilder!!.newDocument()
-
- // Root element
- val layer = doc.createElement("layer")
- layer.setAttribute("xmlns", "http://ids-mannheim.de/ns/KorAP")
- layer.setAttribute("version", "KorAP-0.4")
- layer.setAttribute("docid", docId)
- doc.appendChild(layer)
-
- val spanList = doc.createElement("spanList")
- layer.appendChild(spanList)
-
- var i = 0
- var s = 0
- var n = 0
- val sortedKeys = morpho[docId]?.keys?.sortedBy { it.split("-")[0].toInt() }
-
- sortedKeys?.forEach { spanString ->
- val mfs = morpho[docId]?.get(spanString)
- val offsets = spanString.split("-")
- if(offsets.size != 2) {
- LOGGER.warning("Invalid span: $spanString in $docId")
- return@forEach
- }
- if (offsets[0].toInt() > sentences[docId]!!.elementAt(s).to) {
- s++
- n = i
- }
- i++
- if (mfs!!.deprel == "_") {
- return@forEach
- }
-
- val spanNode = doc.createElement("span")
- spanNode.setAttribute("id", "s${s + 1}_n${i - n}")
- spanNode.setAttribute("from", offsets[0])
- spanNode.setAttribute("to", offsets[1])
-
- // rel element
- val rel = doc.createElement("rel")
- rel.setAttribute("label", mfs.deprel)
-
- // inner span element
- val innerSpan = doc.createElement("span")
- val headInt = if(mfs.head == "_") 0 else parseInt(mfs.head) - 1
- if (headInt < 0) {
- innerSpan.setAttribute("from", sentences[docId]!!.elementAt(s).from.toString())
- innerSpan.setAttribute("to", sentences[docId]!!.elementAt(s).to.toString())
- } else {
- if (headInt + n >= morpho[docId]!!.size) {
- LOGGER.warning("Head index out of bounds: ${headInt+n} >= ${morpho[docId]!!.size} in $docId")
- return@forEach
- } else {
- val destSpanString = sortedKeys.elementAt(headInt + n)
- val destOffsets = destSpanString.split("-")
- innerSpan.setAttribute("from", destOffsets[0])
- innerSpan.setAttribute("to", destOffsets[1])
- }
- }
- rel.appendChild(innerSpan)
- spanNode.appendChild(rel)
- spanList.appendChild(spanNode)
- }
- val transformerFactory = TransformerFactory.newInstance()
- val transformer = transformerFactory.newTransformer()
- transformer.setOutputProperty(OutputKeys.INDENT, "yes")
- transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no")
- transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "1")
- val domSource = DOMSource(doc)
- val streamResult = StreamResult(StringWriter())
- transformer.transform(domSource, streamResult)
-
- return StringBuilder(streamResult.writer.toString())
- }
-
- private fun korapXmlConstituencyOutput(foundry: String, docId: String): StringBuilder {
- val doc: Document = dBuilder!!.newDocument()
-
- // Root element
- val layer = doc.createElement("layer")
- layer.setAttribute("xmlns", "http://ids-mannheim.de/ns/KorAP")
- layer.setAttribute("version", "KorAP-0.4")
- layer.setAttribute("docid", docId)
- doc.appendChild(layer)
-
- val spanList = doc.createElement("spanList")
- layer.appendChild(spanList)
-
- val trees = constituencyTrees[docId]
- if (trees == null || trees.isEmpty()) {
- LOGGER.warning("No constituency trees found for $docId")
- return StringBuilder("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
- }
-
- // Process each tree
- trees.forEach { tree ->
- tree.nodes.forEach { node ->
- // Create span element
- val spanNode = doc.createElement("span")
- spanNode.setAttribute("id", node.id)
- spanNode.setAttribute("from", node.from.toString())
- spanNode.setAttribute("to", node.to.toString())
-
- // Create fs element for the constituency label
- val fs = doc.createElement("fs")
- fs.setAttribute("xmlns", "http://www.tei-c.org/ns/1.0")
- fs.setAttribute("type", "node")
-
- val f = doc.createElement("f")
- f.setAttribute("name", "const")
- f.textContent = node.label
- fs.appendChild(f)
-
- spanNode.appendChild(fs)
-
- // Add rel elements for children
- node.children.forEach { child ->
- val rel = doc.createElement("rel")
- rel.setAttribute("label", "dominates")
-
- when (child) {
- is ConstituencyParserBridge.ConstituencyChild.NodeRef -> {
- rel.setAttribute("target", child.targetId)
- }
- is ConstituencyParserBridge.ConstituencyChild.MorphoRef -> {
- rel.setAttribute("uri", "morpho.xml#${child.morphoId}")
- }
- }
-
- spanNode.appendChild(rel)
- }
-
- spanList.appendChild(spanNode)
- }
- }
-
- val transformerFactory = TransformerFactory.newInstance()
- val transformer = transformerFactory.newTransformer()
- transformer.setOutputProperty(OutputKeys.INDENT, "yes")
- transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no")
- transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3")
- val domSource = DOMSource(doc)
- val streamResult = StreamResult(StringWriter())
- transformer.transform(domSource, streamResult)
-
- return StringBuilder(streamResult.writer.toString())
- }
-
- private fun korapXmlOutput(foundry: String, docId: String): StringBuilder {
- return if (parserName != null) {
- korapXmlDependencyOutput(foundry, docId)
- } else {
- korapXmlMorphoOutput(foundry, docId)
- }
- }
-
- private fun korapXmlMorphoOutput(foundry: String, docId: String): StringBuilder {
- val doc: Document = dBuilder!!.newDocument()
-
- // Root element
- val layer = doc.createElement("layer")
- layer.setAttribute("xmlns", "http://ids-mannheim.de/ns/KorAP")
- layer.setAttribute("version", "KorAP-0.4")
- layer.setAttribute("docid", docId)
- doc.appendChild(layer)
-
- val spanList = doc.createElement("spanList")
- layer.appendChild(spanList)
-
- var i = 0
- morpho[docId]?.forEach { (spanString, mfs) ->
- i++
- val offsets = spanString.split("-")
- val spanNode = doc.createElement("span")
- spanNode.setAttribute("id", "t_$i")
- spanNode.setAttribute("from", offsets[0])
- spanNode.setAttribute("to", offsets[1])
-
- // fs element
- val fs = doc.createElement("fs")
- fs.setAttribute("type", "lex")
- fs.setAttribute("xmlns", "http://www.tei-c.org/ns/1.0")
- spanNode.appendChild(fs)
- val f = doc.createElement("f")
- f.setAttribute("name", "lex")
- fs.appendChild(f)
-
- // Inner fs element
- val innerFs = doc.createElement("fs")
- f.appendChild(innerFs)
-
- if (mfs.lemma != "_") {
- val innerF = doc.createElement("f")
- innerF.setAttribute("name", "lemma")
- innerF.textContent = mfs.lemma
- innerFs.appendChild(innerF)
- }
- if (mfs.upos != "_") {
- val innerF = doc.createElement("f")
- innerF.setAttribute("name", "upos")
- innerF.textContent = mfs.upos
- innerFs.appendChild(innerF)
- }
- if (mfs.xpos != "_") {
- val innerF = doc.createElement("f")
- innerF.setAttribute("name", "pos")
- innerF.textContent = mfs.xpos
- innerFs.appendChild(innerF)
- }
- if (mfs.feats != "_") {
- val innerF = doc.createElement("f")
- innerF.setAttribute("name", "msd")
- innerF.textContent = mfs.feats
- innerFs.appendChild(innerF)
- }
- if (mfs.misc != "_" && mfs.misc!!.matches(Regex("^[0-9.]+$"))) {
- val innerF = doc.createElement("f")
- innerF.setAttribute("name", "certainty")
- innerF.textContent = mfs.misc
- innerFs.appendChild(innerF)
- }
-
- spanList.appendChild(spanNode)
- }
- val transformerFactory = TransformerFactory.newInstance()
- val transformer = transformerFactory.newTransformer()
- transformer.setOutputProperty(OutputKeys.INDENT, "yes")
- transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no")
- transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "1")
- val domSource = DOMSource(doc)
- val streamResult = StreamResult(StringWriter())
- transformer.transform(domSource, streamResult)
-
- return StringBuilder(streamResult.writer.toString())
-
- }
-
private fun conlluOutput(foundry: String, docId: String): StringBuilder {
var token_index = 0
var real_token_index = 0
@@ -2621,6 +2441,33 @@
return de.ids_mannheim.korapxmltools.formatters.NowFormatter.format(context)
}
+ private fun formatKorapXmlOutput(foundry: String, docId: String): StringBuilder {
+ val hasConstituencyParser = constituencyParserBridges[Thread.currentThread().threadId()] != null
+ val context = de.ids_mannheim.korapxmltools.formatters.OutputContext(
+ docId = docId,
+ foundry = foundry,
+ tokens = tokens[docId],
+ sentences = sentences[docId],
+ text = texts[docId],
+ morpho = morpho[docId],
+ metadata = metadata[docId],
+ extraFeatures = extraFeatures[docId],
+ fileName = fnames[docId],
+ useLemma = useLemma,
+ extractMetadataRegex = extractMetadataRegex,
+ extractAttributesRegex = extractAttributesRegex,
+ columns = columns,
+ constituencyTrees = constituencyTrees[docId],
+ includeOffsetsInMisc = false,
+ compatibilityMode = COMPATIBILITY_MODE,
+ tokenSeparator = tokenSeparator,
+ documentBuilder = dBuilder,
+ parserName = parserName,
+ constituencyParserName = if (hasConstituencyParser) "constituency" else null
+ )
+ return de.ids_mannheim.korapxmltools.formatters.KorapXmlFormatter.format(context)
+ }
+
private fun printConlluToken(
token_index: Int,
token: String,
@@ -2935,7 +2782,26 @@
}
try {
- val morphoXmlOutput = korapXmlMorphoOutput(foundry, tempDocId)
+ val context = de.ids_mannheim.korapxmltools.formatters.OutputContext(
+ docId = tempDocId,
+ foundry = foundry,
+ tokens = tokens[tempDocId],
+ sentences = sentences[tempDocId],
+ text = texts[tempDocId],
+ morpho = morpho[tempDocId],
+ metadata = metadata[tempDocId],
+ extraFeatures = extraFeatures[tempDocId],
+ fileName = fnames[tempDocId],
+ useLemma = useLemma,
+ extractMetadataRegex = extractMetadataRegex,
+ extractAttributesRegex = extractAttributesRegex,
+ columns = columns,
+ constituencyTrees = constituencyTrees[tempDocId],
+ includeOffsetsInMisc = false,
+ compatibilityMode = COMPATIBILITY_MODE,
+ tokenSeparator = tokenSeparator
+ )
+ val morphoXmlOutput = KorapXmlFormatter.formatMorpho(context, dBuilder!!)
val fixedMorphoXml = morphoXmlOutput.toString().replace(
"docid=\"$tempDocId\"",
"docid=\"$docId\""
@@ -2959,7 +2825,26 @@
if (morpho[tempDocId]?.values?.any { it.head != null && it.head != "_" && it.deprel != null && it.deprel != "_" } == true) {
try {
- val dependencyXmlOutput = korapXmlDependencyOutput(foundry, tempDocId)
+ val context = de.ids_mannheim.korapxmltools.formatters.OutputContext(
+ docId = tempDocId,
+ foundry = foundry,
+ tokens = tokens[tempDocId],
+ sentences = sentences[tempDocId],
+ text = texts[tempDocId],
+ morpho = morpho[tempDocId],
+ metadata = metadata[tempDocId],
+ extraFeatures = extraFeatures[tempDocId],
+ fileName = fnames[tempDocId],
+ useLemma = useLemma,
+ extractMetadataRegex = extractMetadataRegex,
+ extractAttributesRegex = extractAttributesRegex,
+ columns = columns,
+ constituencyTrees = constituencyTrees[tempDocId],
+ includeOffsetsInMisc = false,
+ compatibilityMode = COMPATIBILITY_MODE,
+ tokenSeparator = tokenSeparator
+ )
+ val dependencyXmlOutput = KorapXmlFormatter.formatDependency(context, dBuilder!!)
val fixedDependencyXml = dependencyXmlOutput.toString().replace(
"docid=\"$tempDocId\"",
"docid=\"$docId\""
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KorapXmlFormatter.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KorapXmlFormatter.kt
new file mode 100644
index 0000000..e7f8631
--- /dev/null
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KorapXmlFormatter.kt
@@ -0,0 +1,265 @@
+package de.ids_mannheim.korapxmltools.formatters
+
+import de.ids_mannheim.korapxmltools.ConstituencyParserBridge
+import org.w3c.dom.Document
+import java.io.StringWriter
+import java.util.logging.Logger
+import javax.xml.parsers.DocumentBuilder
+import javax.xml.transform.OutputKeys
+import javax.xml.transform.TransformerFactory
+import javax.xml.transform.dom.DOMSource
+import javax.xml.transform.stream.StreamResult
+
+/**
+ * Formatter for KorAP-XML output format.
+ * Generates XML layers with morphological, dependency, or constituency annotations.
+ */
+object KorapXmlFormatter : OutputFormatter {
+ private val LOGGER = Logger.getLogger(KorapXmlFormatter::class.java.name)
+
+ override val formatName: String = "korapxml"
+
+ override fun format(context: OutputContext): StringBuilder {
+ // Requires a DocumentBuilder to be passed
+ val dBuilder = context.documentBuilder
+ ?: throw IllegalArgumentException("DocumentBuilder required for KorAP-XML output")
+
+ val parserName = context.parserName
+ val constituencyParserName = context.constituencyParserName
+
+ return when {
+ constituencyParserName != null -> formatConstituency(context, dBuilder)
+ parserName != null -> formatDependency(context, dBuilder)
+ else -> formatMorpho(context, dBuilder)
+ }
+ }
+
+ /**
+ * Format morphological annotations as KorAP-XML.
+ */
+ fun formatMorpho(context: OutputContext, dBuilder: DocumentBuilder): StringBuilder {
+ val doc: Document = dBuilder.newDocument()
+
+ // Root element
+ val layer = doc.createElement("layer")
+ layer.setAttribute("xmlns", "http://ids-mannheim.de/ns/KorAP")
+ layer.setAttribute("version", "KorAP-0.4")
+ layer.setAttribute("docid", context.docId)
+ doc.appendChild(layer)
+
+ val spanList = doc.createElement("spanList")
+ layer.appendChild(spanList)
+
+ var i = 0
+ context.morpho?.forEach { (spanString, mfs) ->
+ i++
+ val offsets = spanString.split("-")
+ val spanNode = doc.createElement("span")
+ spanNode.setAttribute("id", "t_$i")
+ spanNode.setAttribute("from", offsets[0])
+ spanNode.setAttribute("to", offsets[1])
+
+ // fs element
+ val fs = doc.createElement("fs")
+ fs.setAttribute("type", "lex")
+ fs.setAttribute("xmlns", "http://www.tei-c.org/ns/1.0")
+ spanNode.appendChild(fs)
+ val f = doc.createElement("f")
+ f.setAttribute("name", "lex")
+ fs.appendChild(f)
+
+ // Inner fs element
+ val innerFs = doc.createElement("fs")
+ f.appendChild(innerFs)
+
+ if (mfs.lemma != "_") {
+ val innerF = doc.createElement("f")
+ innerF.setAttribute("name", "lemma")
+ innerF.textContent = mfs.lemma
+ innerFs.appendChild(innerF)
+ }
+ if (mfs.upos != "_") {
+ val innerF = doc.createElement("f")
+ innerF.setAttribute("name", "upos")
+ innerF.textContent = mfs.upos
+ innerFs.appendChild(innerF)
+ }
+ if (mfs.xpos != "_") {
+ val innerF = doc.createElement("f")
+ innerF.setAttribute("name", "pos")
+ innerF.textContent = mfs.xpos
+ innerFs.appendChild(innerF)
+ }
+ if (mfs.feats != "_") {
+ val innerF = doc.createElement("f")
+ innerF.setAttribute("name", "msd")
+ innerF.textContent = mfs.feats
+ innerFs.appendChild(innerF)
+ }
+ if (mfs.misc != "_" && mfs.misc!!.matches(Regex("^[0-9.]+$"))) {
+ val innerF = doc.createElement("f")
+ innerF.setAttribute("name", "certainty")
+ innerF.textContent = mfs.misc
+ innerFs.appendChild(innerF)
+ }
+
+ spanList.appendChild(spanNode)
+ }
+
+ return transformToString(doc)
+ }
+
+ /**
+ * Format dependency annotations as KorAP-XML.
+ */
+ fun formatDependency(context: OutputContext, dBuilder: DocumentBuilder): StringBuilder {
+ val doc: Document = dBuilder.newDocument()
+
+ // Root element
+ val layer = doc.createElement("layer")
+ layer.setAttribute("xmlns", "http://ids-mannheim.de/ns/KorAP")
+ layer.setAttribute("version", "KorAP-0.4")
+ layer.setAttribute("docid", context.docId)
+ doc.appendChild(layer)
+
+ val spanList = doc.createElement("spanList")
+ layer.appendChild(spanList)
+
+ var i = 0
+ var s = 0
+ var n = 0
+ val sortedKeys = context.morpho?.keys?.sortedBy { it.split("-")[0].toInt() }
+
+ sortedKeys?.forEach { spanString ->
+ val mfs = context.morpho?.get(spanString)
+ val offsets = spanString.split("-")
+ if(offsets.size != 2) {
+ LOGGER.warning("Invalid span: $spanString in ${context.docId}")
+ return@forEach
+ }
+ if (offsets[0].toInt() > context.sentences!!.elementAt(s).to) {
+ s++
+ n = i
+ }
+ i++
+ if (mfs!!.deprel == "_") {
+ return@forEach
+ }
+
+ val spanNode = doc.createElement("span")
+ spanNode.setAttribute("id", "s${s + 1}_n${i - n}")
+ spanNode.setAttribute("from", offsets[0])
+ spanNode.setAttribute("to", offsets[1])
+
+ // rel element
+ val rel = doc.createElement("rel")
+ rel.setAttribute("label", mfs.deprel)
+
+ // inner span element
+ val innerSpan = doc.createElement("span")
+ val headInt = if(mfs.head == "_") 0 else Integer.parseInt(mfs.head) - 1
+ if (headInt < 0) {
+ innerSpan.setAttribute("from", context.sentences.elementAt(s).from.toString())
+ innerSpan.setAttribute("to", context.sentences.elementAt(s).to.toString())
+ } else {
+ if (headInt + n >= context.morpho.size) {
+ LOGGER.warning("Head index out of bounds: ${headInt+n} >= ${context.morpho.size} in ${context.docId}")
+ return@forEach
+ } else {
+ val destSpanString = sortedKeys.elementAt(headInt + n)
+ val destOffsets = destSpanString.split("-")
+ innerSpan.setAttribute("from", destOffsets[0])
+ innerSpan.setAttribute("to", destOffsets[1])
+ }
+ }
+ rel.appendChild(innerSpan)
+ spanNode.appendChild(rel)
+ spanList.appendChild(spanNode)
+ }
+
+ return transformToString(doc)
+ }
+
+ /**
+ * Format constituency annotations as KorAP-XML.
+ */
+ fun formatConstituency(context: OutputContext, dBuilder: DocumentBuilder): StringBuilder {
+ val doc: Document = dBuilder.newDocument()
+
+ // Root element
+ val layer = doc.createElement("layer")
+ layer.setAttribute("xmlns", "http://ids-mannheim.de/ns/KorAP")
+ layer.setAttribute("version", "KorAP-0.4")
+ layer.setAttribute("docid", context.docId)
+ doc.appendChild(layer)
+
+ val spanList = doc.createElement("spanList")
+ layer.appendChild(spanList)
+
+ val trees = context.constituencyTrees
+ if (trees == null || trees.isEmpty()) {
+ LOGGER.warning("No constituency trees found for ${context.docId}")
+ return StringBuilder("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
+ }
+
+ // Process each tree
+ trees.forEach { tree ->
+ tree.nodes.forEach { node ->
+ // Create span element
+ val spanNode = doc.createElement("span")
+ spanNode.setAttribute("id", node.id)
+ spanNode.setAttribute("from", node.from.toString())
+ spanNode.setAttribute("to", node.to.toString())
+
+ // Create fs element for the constituency label
+ val fs = doc.createElement("fs")
+ fs.setAttribute("xmlns", "http://www.tei-c.org/ns/1.0")
+ fs.setAttribute("type", "node")
+
+ val f = doc.createElement("f")
+ f.setAttribute("name", "const")
+ f.textContent = node.label
+ fs.appendChild(f)
+
+ spanNode.appendChild(fs)
+
+ // Add rel elements for children
+ node.children.forEach { child ->
+ val rel = doc.createElement("rel")
+ rel.setAttribute("label", "dominates")
+
+ when (child) {
+ is ConstituencyParserBridge.ConstituencyChild.NodeRef -> {
+ rel.setAttribute("target", child.targetId)
+ }
+ is ConstituencyParserBridge.ConstituencyChild.MorphoRef -> {
+ rel.setAttribute("uri", "morpho.xml#${child.morphoId}")
+ }
+ }
+
+ spanNode.appendChild(rel)
+ }
+
+ spanList.appendChild(spanNode)
+ }
+ }
+
+ return transformToString(doc, indentAmount = "3")
+ }
+
+ /**
+ * Transform DOM document to formatted XML string.
+ */
+ private fun transformToString(doc: Document, indentAmount: String = "1"): StringBuilder {
+ val transformerFactory = TransformerFactory.newInstance()
+ val transformer = transformerFactory.newTransformer()
+ transformer.setOutputProperty(OutputKeys.INDENT, "yes")
+ transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no")
+ transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", indentAmount)
+ val domSource = DOMSource(doc)
+ val streamResult = StreamResult(StringWriter())
+ transformer.transform(domSource, streamResult)
+
+ return StringBuilder(streamResult.writer.toString())
+ }
+}
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/OutputFormatter.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/OutputFormatter.kt
index 7a28e73..9d6fc14 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/OutputFormatter.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/OutputFormatter.kt
@@ -26,7 +26,11 @@
val constituencyTrees: List<ConstituencyParserBridge.ConstituencyTree>? = null,
val includeOffsetsInMisc: Boolean = false,
val compatibilityMode: Boolean = false,
- val tokenSeparator: String = "\n"
+ val tokenSeparator: String = "\n",
+ // KorAP-XML specific fields
+ val documentBuilder: javax.xml.parsers.DocumentBuilder? = null,
+ val parserName: String? = null,
+ val constituencyParserName: String? = null
)
/**
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/Word2VecFormatter.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/Word2VecFormatter.kt
index c45883e..3d1f536 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/Word2VecFormatter.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/Word2VecFormatter.kt
@@ -1,9 +1,9 @@
package de.ids_mannheim.korapxmltools.formatters
/**
- * Formatter for Word2Vec / language model training output format.
- * Outputs tokens separated by spaces, sentences separated by newlines.
- * Can use lemmas instead of surface forms when available.
+ * Formatter for Word2Vec / language model training output.
+ * Outputs tokens in lemmatized form (or word form if no lemma), space-separated,
+ * with sentences separated by newlines.
*/
object Word2VecFormatter : OutputFormatter {
override val formatName: String = "word2vec"