Switch to StAX XML parser
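
Process data, tokens, morpho, dependency, sentences and, in most cases,
structure entries with a streaming StAX parser instead of building a DOM
document per entry. structure.xml entries that need attribute extraction
or Krill output, as well as constituency.xml, keep using the DOM path.
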
Change-Id: I1eebc6d678f38c24fd04e9b88bf572eb6a3e0c42
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 01cdff5..82a27fd 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -51,6 +51,9 @@
import javax.xml.transform.stream.StreamResult
import kotlin.math.min
import kotlin.system.exitProcess
+import javax.xml.stream.XMLInputFactory
+import javax.xml.stream.XMLStreamConstants
+import javax.xml.stream.XMLStreamReader
val ZIP_ENTRY_UNIX_MODE = parseInt("644", 8)
@@ -688,6 +691,17 @@
} catch (_: Exception) {}
}
+ // Thread-local XMLInputFactory for StAX parsing
+ private val xmlInputFactory: ThreadLocal<XMLInputFactory> = ThreadLocal.withInitial {
+ XMLInputFactory.newInstance().apply {
+ setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false)
+ setProperty(XMLInputFactory.IS_VALIDATING, false)
+ setProperty(XMLInputFactory.SUPPORT_DTD, false)
+ setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false)
+ setProperty(XMLInputFactory.IS_COALESCING, false)
+ }
+ }
+
// Data class to hold compressed Krill JSON ready for TAR writing
data class CompressedKrillData(
val textId: String,
@@ -1660,7 +1674,16 @@
try {
if (zipEntry.name.matches(Regex(".*(data|tokens|structure|morpho|dependency|sentences|constituency)\\.xml$"))) {
- LOGGER.finer("Processing entry: ${zipEntry.name}, foundry=$foundry")
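+ // Route entries through the streaming StAX path; keep DOM for structure.xml
+ // when attributes are extracted or Krill output is requested, and for constituency.xml.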
+ val isStructure = zipEntry.name.endsWith("structure.xml")
+ val needsDom = isStructure && (extractAttributesRegex.isNotEmpty() || outputFormat == OutputFormat.KRILL)
+ val isConstituency = zipEntry.name.endsWith("constituency.xml")
+
+ if (!needsDom && !isConstituency) {
+ processXmlEntryStax(zipFile, zipPath, zipEntry, foundry, waitForMorpho)
+ return
+ }
+
+ LOGGER.finer("Processing entry (DOM): ${zipEntry.name}, foundry=$foundry")
// Use thread-local DocumentBuilder (reused, much faster than creating new ones)
val dBuilder: DocumentBuilder = threadLocalBuilder.get()
// Reset the builder state to avoid memory leaks
@@ -1919,6 +1942,145 @@
}
}
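+ // StAX-based counterpart to the DOM entry processing; handles data, tokens, morpho,
+ // dependency, sentences and structure entries that do not need DOM-based attribute extraction.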
+ private fun processXmlEntryStax(zipFile: ApacheZipFile, zipPath: String, zipEntry: ZipArchiveEntry, foundry: String, waitForMorpho: Boolean) {
+ LOGGER.finer("Processing entry (StAX): ${zipEntry.name}, foundry=$foundry")
+ val factory = xmlInputFactory.get()
+ val inputStream = zipFile.getInputStream(zipEntry)
+ val filterReader = XMLCommentFilterReader(inputStream, "UTF-8")
+ val reader = try {
+ factory.createXMLStreamReader(filterReader)
+ } catch (e: Exception) {
+ LOGGER.warning("Error creating StAX reader: " + zipEntry.name + " " + e.message)
+ filterReader.close()
+ return
+ }
+
+ try {
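+ // Advance to the root element and read its docid attribute.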
+ var docId: String? = null
+ while (reader.hasNext()) {
+ val event = reader.next()
+ if (event == XMLStreamConstants.START_ELEMENT) {
+ docId = reader.getAttributeValue(null, "docid")
+ break
+ }
+ }
+
+ if (docId == null) return
+ if (siglePattern != null && !Regex(siglePattern!!).containsMatchIn(docId)) return
+
+ val fileName = zipEntry.name.replace(Regex(".*?/([^/]+\\.xml)$"), "$1")
+
+ when (fileName) {
+ "data.xml" -> {
+ if (!lemmaOnly) {
+ val text = extractTextStax(reader)
+ if (text != null) texts[docId] = NonBmpString(text)
+ }
+ }
+ "tokens.xml" -> {
+ if (!fnames.contains(docId)) fnames[docId] = zipEntry.name
+ tokens[docId] = extractSpansStax(reader, docId)
+ if (outputFormat == OutputFormat.KRILL && foundry == "base") {
+ collectKrillBaseData(docId)
+ }
+ }
+ "morpho.xml" -> {
+ fnames[docId] = zipEntry.name
+ val (morphoSpans, allSpans) = extractMorphoSpansStax(reader)
+
+ if (outputFormat == OutputFormat.KRILL) {
+ val morphoFoundry = getFoundryForLayer(foundry, "morpho")
+ collectKrillMorphoDataDirect(docId, morphoFoundry, morphoSpans, "morpho")
+ tokens[docId] = allSpans
+ } else {
+ val morphoMap = synchronized(morpho) {
+ morpho.getOrPut(docId) { morphoSpans }
+ }
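+ // If another layer already stored annotations for this text, merge in the new
+ // morpho features while keeping existing head/deprel values.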
+ if (morphoMap !== morphoSpans) {
+ synchronized(morphoMap) {
+ morphoSpans.forEach { (key, mfs) ->
+ val existing = morphoMap[key]
+ if (existing != null) {
+ mfs.head = existing.head
+ mfs.deprel = existing.deprel
+ }
+ morphoMap[key] = mfs
+ }
+ }
+ }
+ tokens[docId] = allSpans
+ }
+ }
+ "dependency.xml" -> {
+ val depMap = extractDependencySpansStax(reader)
+ if (outputFormat == OutputFormat.KRILL) {
+ val depFoundry = getFoundryForLayer(foundry, "dependency")
+ collectKrillMorphoDataDirect(docId, depFoundry, depMap, "dependency")
+ } else {
+ val morphoMap = synchronized(morpho) {
+ morpho.getOrPut(docId) { mutableMapOf() }
+ }
+ synchronized(morphoMap) {
+ depMap.forEach { (key, depSpan) ->
+ val existing = morphoMap[key]
+ if (existing != null) {
+ existing.head = depSpan.head
+ existing.deprel = depSpan.deprel
+ } else {
+ morphoMap[key] = depSpan
+ }
+ }
+ }
+ }
+ }
+ "sentences.xml" -> {
+ if (outputFormat == OutputFormat.KRILL) {
+ val spans = extractSpansStax(reader, docId)
+ collectSentencesFromSpans(docId, foundry, spans)
+ }
+ }
+ "structure.xml" -> {
+ sentences[docId] = extractSentenceSpansStax(reader)
+ }
+ }
+
+ if (outputFormat == OutputFormat.KRILL) {
+ processedTextsPerZip.getOrPut(zipPath) { mutableSetOf() }.add(docId)
+ }
+
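+ // A morpho.xml entry means morphological annotations exist for this text,
+ // so treat them as required below regardless of the waitForMorpho setting.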
+ val effectiveWaitForMorpho = if (fileName == "morpho.xml") true else waitForMorpho
+
+ val finalMorphoRequired = when {
+ taggerName != null || parserName != null -> false
+ useLemma -> true
+ effectiveWaitForMorpho -> true
+ outputFormat == OutputFormat.KORAP_XML && annotationWorkerPool == null -> true
+ else -> false // KRILL collects morpho data via collectKrillMorphoDataDirect instead
+ }
+
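+ // Lemma-based word2vec/NOW output can be generated without the primary text.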
+ val textRequired = when (outputFormat) {
+ OutputFormat.WORD2VEC, OutputFormat.NOW -> !(useLemma || lemmaOnly)
+ else -> true // all other formats, including KRILL, need the primary text
+ }
+
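+ // Emit the text once all layers required for the selected output format have been collected.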
+ if ((texts[docId] != null || !textRequired) && sentences[docId] != null && tokens[docId] != null
+ && (!finalMorphoRequired || morpho[docId] != null)
+ && (extractMetadataRegex.isEmpty() || metadata[docId] != null)
+ ) {
+ processText(docId, foundry)
+ }
+
+ } catch (e: Exception) {
+ LOGGER.warning("Error processing StAX entry ${zipEntry.name}: ${e.message}")
+ e.printStackTrace()
+ } finally {
+ try { reader.close() } catch (_: Exception) {}
+ try { filterReader.close() } catch (_: Exception) {}
+ }
+ }
+
private fun detectFoundryFromAnnotateCmd(cmd: String): String {
val lower = cmd.lowercase(Locale.getDefault())
return when {
@@ -2583,6 +2745,114 @@
return list.toTypedArray()
}
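+ // Collect from/to offsets of all <span> elements; spans with missing or
+ // non-numeric offsets are logged and skipped.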
+ private fun extractSpansStax(reader: XMLStreamReader, docId: String): Array<Span> {
+ val list = ArrayList<Span>()
+ try {
+ while (reader.hasNext()) {
+ val event = reader.next()
+ if (event == XMLStreamConstants.START_ELEMENT && reader.localName == "span") {
+ val fromAttr = reader.getAttributeValue(null, "from")
+ val toAttr = reader.getAttributeValue(null, "to")
+ if (fromAttr.isNullOrEmpty() || toAttr.isNullOrEmpty()) {
+ LOGGER.warning("[$docId] Skipping span with empty from/to attribute: from='$fromAttr' to='$toAttr'")
+ } else {
+ try {
+ val from = Integer.parseInt(fromAttr)
+ val to = Integer.parseInt(toAttr)
+ list.add(Span(from, to))
+ } catch (e: NumberFormatException) {
+ LOGGER.warning("[$docId] Skipping span with invalid numeric offsets: from='$fromAttr' to='$toAttr' : ${e.message}")
+ }
+ }
+ }
+ }
+ } catch (e: Exception) {
+ LOGGER.warning("Error parsing spans for $docId: ${e.message}")
+ }
+ return list.toTypedArray()
+ }
+
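+ // data.xml: extract the primary text for offset-based processing.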
+ private fun extractTextStax(reader: XMLStreamReader): String? {
+ val textBuilder = StringBuilder()
+ // Only character data inside the <text> element belongs to the primary text;
+ // skipping inter-element whitespace keeps character offsets aligned.
+ var inText = false
+ while (reader.hasNext()) {
+ val event = reader.next()
+ if (event == XMLStreamConstants.START_ELEMENT && reader.localName == "text") {
+ inText = true
+ } else if (event == XMLStreamConstants.CHARACTERS && inText) {
+ textBuilder.append(reader.text)
+ } else if (event == XMLStreamConstants.END_ELEMENT &&
+ (reader.localName == "text" || reader.localName == "raw_text")) {
+ break
+ }
+ }
+ return if (textBuilder.isNotEmpty()) textBuilder.toString() else null
+ }
+
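+ // Build one MorphoSpan per token span (keyed by "from-to") from <span>/<f>/<symbol>
+ // elements; spans with type="alt" contribute offsets only. Also returns all token spans.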
+ private fun extractMorphoSpansStax(reader: XMLStreamReader): Pair<MutableMap<String, MorphoSpan>, Array<Span>> {
+ val UNKNOWN = Regex("(UNKNOWN|<unknown>)")
+ val res: MutableMap<String, MorphoSpan> = HashMap()
+ val allSpans = ArrayList<Span>()
+ var currentSpan: MorphoSpan? = null
+ var currentFromTo: String? = null
+ var currentFName: String? = null
+
+ while (reader.hasNext()) {
+ val event = reader.next()
+ when (event) {
+ XMLStreamConstants.START_ELEMENT -> {
+ val localName = reader.localName
+ if (localName == "span") {
+ val fromAttr = reader.getAttributeValue(null, "from")
+ val toAttr = reader.getAttributeValue(null, "to")
+ if (!fromAttr.isNullOrEmpty() && !toAttr.isNullOrEmpty()) {
+ try {
+ val from = fromAttr.toInt()
+ val to = toAttr.toInt()
+ allSpans.add(Span(from, to))
+
+ if (reader.getAttributeValue(null, "type") != "alt") {
+ currentSpan = MorphoSpan()
+ currentFromTo = "$from-$to"
+ }
+ } catch (_: NumberFormatException) {}
+ }
+ } else if (localName == "f" && currentSpan != null) {
+ currentFName = reader.getAttributeValue(null, "name")
+ } else if (localName == "symbol" && currentSpan != null && currentFName == "type") {
+ val value = reader.getAttributeValue(null, "value")?.trim()
+ if (!value.isNullOrEmpty() && currentSpan.feats == "_") {
+ currentSpan.feats = value
+ }
+ }
+ }
+ XMLStreamConstants.CHARACTERS -> {
+ if (currentSpan != null && currentFName != null && !reader.isWhiteSpace) {
+ val value = reader.text.trim()
+ if (value.isNotEmpty()) {
+ when (currentFName) {
+ "lemma" -> if(currentSpan.lemma == "_") currentSpan.lemma = value.replace(UNKNOWN, "--")
+ "upos" -> currentSpan.upos = value
+ "xpos", "ctag", "pos" -> if(currentSpan.xpos == "_") currentSpan.xpos = value.replace(UNKNOWN, "--")
+ "feats", "msd" -> if(currentSpan.feats == "_") currentSpan.feats = value
+ "certainty" -> if(currentSpan.misc == "_") currentSpan.misc = value
+ }
+ }
+ }
+ }
+ XMLStreamConstants.END_ELEMENT -> {
+ val localName = reader.localName
+ if (localName == "span") {
+ if (currentSpan != null && currentFromTo != null) {
+ res[currentFromTo] = currentSpan
+ }
+ currentSpan = null
+ currentFromTo = null
+ } else if (localName == "f") {
+ currentFName = null
+ }
+ }
+ }
+ }
+ return Pair(res, allSpans.toTypedArray())
+ }
+
private fun extractMorphoSpans(
fsSpans: NodeList
): MutableMap<String, MorphoSpan> {
@@ -2610,6 +2880,48 @@
return res
}
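+ // dependency.xml: the outer <span> carries the dependent's offsets and a
+ // <rel label="...">; a nested <span> carries the head's offsets.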
+ private fun extractDependencySpansStax(reader: XMLStreamReader): MutableMap<String, MorphoSpan> {
+ val res: MutableMap<String, MorphoSpan> = HashMap()
+ var currentFromTo: String? = null
+ var currentDeprel: String? = null
+ var currentHead: String? = null
+ var spanDepth = 0
+
+ while (reader.hasNext()) {
+ val event = reader.next()
+ when (event) {
+ XMLStreamConstants.START_ELEMENT -> {
+ if (reader.localName == "span") {
+ spanDepth++
+ if (spanDepth == 1) {
+ currentFromTo = "${reader.getAttributeValue(null, "from")}-${reader.getAttributeValue(null, "to")}"
+ currentDeprel = null
+ currentHead = null
+ } else if (spanDepth == 2 && currentDeprel != null) {
+ val headFrom = reader.getAttributeValue(null, "from")
+ val headTo = reader.getAttributeValue(null, "to")
+ currentHead = "$headFrom-$headTo"
+ }
+ } else if (reader.localName == "rel" && spanDepth == 1) {
+ currentDeprel = reader.getAttributeValue(null, "label")
+ }
+ }
+ XMLStreamConstants.END_ELEMENT -> {
+ if (reader.localName == "span") {
+ if (spanDepth == 1 && currentFromTo != null && (currentHead != null || currentDeprel != null)) {
+ res[currentFromTo] = MorphoSpan(
+ head = currentHead ?: "_",
+ deprel = currentDeprel ?: "_"
+ )
+ }
+ spanDepth--
+ }
+ }
+ }
+ }
+ return res
+ }
+
private fun extractDependencySpans(
depSpans: NodeList
): MutableMap<String, MorphoSpan> {
@@ -2655,6 +2967,47 @@
}.toArray { size -> arrayOfNulls(size) }
}
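+ // structure.xml: a span counts as a sentence when one of its <f> elements has the text "s".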
+ private fun extractSentenceSpansStax(reader: XMLStreamReader): Array<Span> {
+ val list = ArrayList<Span>()
+ var currentFrom: Int? = null
+ var currentTo: Int? = null
+ var isSentence = false
+ var inF = false
+
+ while (reader.hasNext()) {
+ val event = reader.next()
+ when (event) {
+ XMLStreamConstants.START_ELEMENT -> {
+ if (reader.localName == "span") {
+ currentFrom = reader.getAttributeValue(null, "from")?.toIntOrNull()
+ currentTo = reader.getAttributeValue(null, "to")?.toIntOrNull()
+ isSentence = false
+ } else if (reader.localName == "f") {
+ inF = true
+ }
+ }
+ XMLStreamConstants.CHARACTERS -> {
+ if (inF && reader.text.trim() == "s") {
+ isSentence = true
+ }
+ }
+ XMLStreamConstants.END_ELEMENT -> {
+ if (reader.localName == "span") {
+ if (isSentence && currentFrom != null && currentTo != null) {
+ list.add(Span(currentFrom!!, currentTo!!))
+ }
+ currentFrom = null
+ currentTo = null
+ isSentence = false
+ } else if (reader.localName == "f") {
+ inF = false
+ }
+ }
+ }
+ }
+ return list.toTypedArray()
+ }
+
private fun extractMiscSpans(spans: NodeList): MutableMap<String, String> {
val miscLocal: MutableMap<String, String> = HashMap()
@@ -3018,6 +3371,28 @@
}
}
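+ // Record sentence boundaries once per foundry as <foundry>/s:s structure spans
+ // for Krill output; token indices are left at -1 here.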
+ private fun collectSentencesFromSpans(docId: String, foundry: String, spans: Array<Span>) {
+ if (outputTexts.contains(docId)) return
+ val textData = krillData.getOrPut(docId) { KrillJsonGenerator.KrillTextData(textId = docId) }
+ synchronized(textData) {
+ if (textData.sentencesCollectedByFoundry.contains(foundry)) return
+ for (span in spans) {
+ textData.structureSpans.add(
+ KrillJsonGenerator.StructureSpan(
+ layer = "$foundry/s:s",
+ from = span.from,
+ to = span.to,
+ tokenFrom = -1,
+ tokenTo = -1,
+ depth = 0,
+ attributes = emptyMap()
+ )
+ )
+ }
+ textData.sentencesCollectedByFoundry.add(foundry)
+ }
+ }
+
private fun collectConstituency(docId: String, foundry: String, spans: NodeList) {
if (outputTexts.contains(docId)) return