Process data, structure and constituency with DOM
Much faster for large texts
Change-Id: I9520b33ccc95d3b473c7b06eba06d3456845f9b6
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 82a27fd..6c06c7f 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -1677,8 +1677,11 @@
val isStructure = zipEntry.name.endsWith("structure.xml")
val needsDom = isStructure && (extractAttributesRegex.isNotEmpty() || outputFormat == OutputFormat.KRILL)
val isConstituency = zipEntry.name.endsWith("constituency.xml")
+ val isData = zipEntry.name.endsWith("data.xml")
- if (!needsDom && !isConstituency) {
+ // Use DOM for data.xml (large text content) and structure/constituency (complex parsing)
+ // Use StAX for annotation files (morpho, dependency, tokens, sentences) for better performance
+ if (!needsDom && !isConstituency && !isData) {
processXmlEntryStax(zipFile, zipPath, zipEntry, foundry, waitForMorpho)
return
}
@@ -2792,6 +2795,7 @@
var currentSpan: MorphoSpan? = null
var currentFromTo: String? = null
var currentFName: String? = null
+ val textAccumulator = StringBuilder()
while (reader.hasNext()) {
val event = reader.next()
@@ -2814,6 +2818,7 @@
} catch (_: NumberFormatException) {}
}
} else if (localName == "f" && currentSpan != null) {
+ textAccumulator.clear()
currentFName = reader.getAttributeValue(null, "name")
} else if (localName == "symbol" && currentSpan != null && currentFName == "type") {
val value = reader.getAttributeValue(null, "value")?.trim()
@@ -2824,9 +2829,15 @@
}
XMLStreamConstants.CHARACTERS -> {
if (currentSpan != null && currentFName != null && !reader.isWhiteSpace) {
- val value = reader.text.trim()
+ textAccumulator.append(reader.text)
+ }
+ }
+ XMLStreamConstants.END_ELEMENT -> {
+ val localName = reader.localName
+ if (localName == "f" && currentSpan != null && currentFName != null) {
+ val value = textAccumulator.toString().trim()
if (value.isNotEmpty()) {
- when (currentFName) {
+ when (currentFName) {
"lemma" -> if(currentSpan.lemma == "_") currentSpan.lemma = value.replace(UNKNOWN, "--")
"upos" -> currentSpan.upos = value
"xpos", "ctag", "pos" -> if(currentSpan.xpos == "_") currentSpan.xpos = value.replace(UNKNOWN, "--")
@@ -2834,18 +2845,14 @@
"certainty" -> if(currentSpan.misc == "_") currentSpan.misc = value
}
}
- }
- }
- XMLStreamConstants.END_ELEMENT -> {
- val localName = reader.localName
- if (localName == "span") {
+ textAccumulator.clear()
+ currentFName = null
+ } else if (localName == "span") {
if (currentSpan != null && currentFromTo != null) {
res[currentFromTo] = currentSpan
}
currentSpan = null
currentFromTo = null
- } else if (localName == "f") {
- currentFName = null
}
}
}