Add option --stax-text
Change-Id: I67f02b7207804f96ba075ae3638b3aa195d8236f
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 7b70a6a..f402e00 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -316,6 +316,15 @@
)
var lemmaOnly: Boolean = false
+ /**
+  * Command-line switch `--stax-text`. It is `false` unless the flag is passed, and the option
+  * description states that DOM is the default parser for data.xml text content.
+  * Setting it is not sufficient on its own: [shouldParseDataXmlWithStax] additionally requires
+  * an output format accepted by [canUseStaxTextParsing].
+  */
+ @Option(
+ names = ["--stax-text"],
+ description = [
+ "Parse data.xml text content with StAX instead of DOM.",
+ "Default is DOM; this is mainly useful for benchmarking or corpora with many short texts."
+ ]
+ )
+ var useStaxTextParser: Boolean = false
+
private var taggerName: String? = null
private var taggerModel: String? = null
private var dockerLogMessage: String? = null
@@ -696,6 +705,7 @@
if (overwrite) sb.appendLine(" --force")
if (useLemma) sb.appendLine(" --lemma")
if (lemmaOnly) sb.appendLine(" --lemma-only")
+ if (useStaxTextParser) sb.appendLine(" --stax-text")
if (useLz4) sb.appendLine(" --lz4")
if (includeNonWordTokens) sb.appendLine(" --non-word-tokens")
if (sequentialInZip) sb.appendLine(" --sequential")
@@ -968,6 +978,14 @@
taggerName == null &&
parserName == null
+ /**
+  * Tells whether the currently selected output format permits StAX parsing of data.xml text.
+  *
+  * @return `true` for CONLLU, WORD2VEC and NOW output; `false` for every other format.
+  */
+ internal fun canUseStaxTextParsing(): Boolean = when (outputFormat) {
+     OutputFormat.CONLLU, OutputFormat.WORD2VEC, OutputFormat.NOW -> true
+     else -> false
+ }
+
+ /**
+  * Decides whether data.xml entries are handed to the StAX reader.
+  *
+  * @return `true` only when `--stax-text` was requested and [canUseStaxTextParsing]
+  *         accepts the current output format.
+  */
+ internal fun shouldParseDataXmlWithStax(): Boolean {
+     // The flag is strictly opt-in: without it the format check is never consulted.
+     if (!useStaxTextParser) return false
+     return canUseStaxTextParsing()
+ }
+
internal fun registerZipProgress(zipPath: String, size: Long) {
zipSizes[zipPath] = size
zipProgressBytes[zipPath] = AtomicLong(0)
@@ -1334,7 +1352,8 @@
LOGGER.info("Initialized work-stealing scheduler with $maxThreads worker threads for Krill output")
} else if (canStreamNowEntriesImmediately()) {
entryExecutor = null
- LOGGER.info("Initialized NOW streaming mode: archive-order entries, no text-ID scheduling")
+ val textParserMode = if (shouldParseDataXmlWithStax()) "StAX" else "DOM"
+ LOGGER.info("Initialized NOW streaming mode: archive-order entries, no text-ID scheduling, data.xml via $textParserMode")
} else {
// For other formats, use priority-based executor
entryExecutor = java.util.concurrent.ThreadPoolExecutor(
@@ -2825,10 +2844,11 @@
val needsDom = isStructure && (extractAttributesRegex.isNotEmpty() || outputFormat == OutputFormat.KRILL)
val isConstituency = zipEntry.name.endsWith("constituency.xml")
val isData = zipEntry.name.endsWith("data.xml")
+ val useStaxForData = isData && shouldParseDataXmlWithStax()
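- // Use DOM for data.xml (large text content) and structure/constituency (complex parsing)
- // Use StAX for annotation files (morpho, dependency, tokens, sentences) for better performance
+ // Use DOM for structure/constituency (complex parsing) and, by default, for data.xml (large text content)
+ // Use StAX for annotation files (morpho, dependency, tokens, sentences) for better performance;
+ // data.xml is also routed to StAX when --stax-text is set and the output format allows it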
- if (!needsDom && !isConstituency && !isData) {
+ if (!needsDom && !isConstituency && (!isData || useStaxForData)) {
processXmlEntryStax(zipFile, zipPath, zipEntry, foundry, waitForMorpho)
return
}
@@ -4075,12 +4095,15 @@
private fun extractTextStax(reader: XMLStreamReader): String? {
val textBuilder = StringBuilder()
+ var insideText = false
while (reader.hasNext()) {
val event = reader.next()
- if (event == XMLStreamConstants.CHARACTERS) {
+ if ((event == XMLStreamConstants.CHARACTERS || event == XMLStreamConstants.CDATA) && insideText) {
textBuilder.append(reader.text)
- } else if (event == XMLStreamConstants.END_ELEMENT && reader.localName == "raw_text") {
- break
+ } else if (event == XMLStreamConstants.START_ELEMENT && reader.localName == "text") {
+ insideText = true
+ } else if (event == XMLStreamConstants.END_ELEMENT && reader.localName == "text") {
+ return if (textBuilder.isNotEmpty()) textBuilder.toString() else ""
}
}
return if (textBuilder.isNotEmpty()) textBuilder.toString() else null
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/GeneralFeaturesTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/GeneralFeaturesTest.kt
index 7e7a2bc..6668220 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/GeneralFeaturesTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/GeneralFeaturesTest.kt
@@ -142,6 +142,10 @@
tool.outputFormat = OutputFormat.NOW
assertTrue(tool.canStreamNowEntriesImmediately())
+ assertTrue(tool.canUseStaxTextParsing())
+ assertTrue(!tool.shouldParseDataXmlWithStax())
+ tool.useStaxTextParser = true
+ assertTrue(tool.shouldParseDataXmlWithStax())
}
@Test
@@ -150,6 +154,17 @@
tool.outputFormat = OutputFormat.CONLLU
assertTrue(!tool.canStreamNowEntriesImmediately())
+ assertTrue(tool.canUseStaxTextParsing())
+ }
+
+ // KORAP_XML is not one of the formats accepted by canUseStaxTextParsing(), so requesting
+ // --stax-text must leave both predicates false.
+ @Test
+ fun zipOutputKeepsDomParsingForDataXml() {
+     val tool = KorapXmlTool().apply {
+         outputFormat = OutputFormat.KORAP_XML
+         useStaxTextParser = true
+     }
+
+     assertTrue(!tool.canUseStaxTextParsing())
+     assertTrue(!tool.shouldParseDataXmlWithStax())
}
@Test