Add opt-in --stax-text option

Change-Id: I67f02b7207804f96ba075ae3638b3aa195d8236f
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 7b70a6a..f402e00 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -316,6 +316,15 @@
     )
     var lemmaOnly: Boolean = false
 
+    @Option(
+        names = ["--stax-text"],
+        description = [
+            "Parse data.xml text content with StAX instead of DOM.",
+            "Default is DOM; this is mainly useful for benchmarking or corpora with many short texts."
+        ]
+    )
+    var useStaxTextParser: Boolean = false // opt-in via --stax-text; false leaves DOM as the default
+
     private var taggerName: String? = null
     private var taggerModel: String? = null
     private var dockerLogMessage: String? = null
@@ -696,6 +705,7 @@
         if (overwrite)            sb.appendLine("  --force")
         if (useLemma)             sb.appendLine("  --lemma")
         if (lemmaOnly)            sb.appendLine("  --lemma-only")
+        if (useStaxTextParser)    sb.appendLine("  --stax-text")
         if (useLz4)               sb.appendLine("  --lz4")
         if (includeNonWordTokens) sb.appendLine("  --non-word-tokens")
         if (sequentialInZip)      sb.appendLine("  --sequential")
@@ -968,6 +978,14 @@
             taggerName == null &&
             parserName == null
 
+    internal fun canUseStaxTextParsing(): Boolean = when (outputFormat) {
+        OutputFormat.CONLLU, OutputFormat.WORD2VEC, OutputFormat.NOW -> true // StAX-eligible formats
+        else -> false // any other output format is reported as not eligible
+    }
+
+    /** True only when [useStaxTextParser] is set and [canUseStaxTextParsing] accepts the output format. */
+    internal fun shouldParseDataXmlWithStax(): Boolean = if (useStaxTextParser) canUseStaxTextParsing() else false
+
     internal fun registerZipProgress(zipPath: String, size: Long) {
         zipSizes[zipPath] = size
         zipProgressBytes[zipPath] = AtomicLong(0)
@@ -1334,7 +1352,8 @@
             LOGGER.info("Initialized work-stealing scheduler with $maxThreads worker threads for Krill output")
         } else if (canStreamNowEntriesImmediately()) {
             entryExecutor = null
-            LOGGER.info("Initialized NOW streaming mode: archive-order entries, no text-ID scheduling")
+            val textParserMode = if (shouldParseDataXmlWithStax()) "StAX" else "DOM"
+            LOGGER.info("Initialized NOW streaming mode: archive-order entries, no text-ID scheduling, data.xml via $textParserMode")
         } else {
             // For other formats, use priority-based executor
             entryExecutor = java.util.concurrent.ThreadPoolExecutor(
@@ -2825,10 +2844,11 @@
                  val needsDom = isStructure && (extractAttributesRegex.isNotEmpty() || outputFormat == OutputFormat.KRILL)
                  val isConstituency = zipEntry.name.endsWith("constituency.xml")
                  val isData = zipEntry.name.endsWith("data.xml")
+                 val useStaxForData = isData && shouldParseDataXmlWithStax()
                  
                  // Use DOM for data.xml (large text content) and structure/constituency (complex parsing)
                  // Use StAX for annotation files (morpho, dependency, tokens, sentences) for better performance
-                 if (!needsDom && !isConstituency && !isData) {
+                 if (!needsDom && !isConstituency && (!isData || useStaxForData)) {
                      processXmlEntryStax(zipFile, zipPath, zipEntry, foundry, waitForMorpho)
                      return
                  }
@@ -4075,12 +4095,15 @@
 
     private fun extractTextStax(reader: XMLStreamReader): String? {
         val textBuilder = StringBuilder()
+        var insideText = false
         while (reader.hasNext()) {
             val event = reader.next()
-            if (event == XMLStreamConstants.CHARACTERS) {
+            if ((event == XMLStreamConstants.CHARACTERS || event == XMLStreamConstants.CDATA) && insideText) {
                 textBuilder.append(reader.text)
-            } else if (event == XMLStreamConstants.END_ELEMENT && reader.localName == "raw_text") {
-                break
+            } else if (event == XMLStreamConstants.START_ELEMENT && reader.localName == "text") {
+                insideText = true
+            } else if (event == XMLStreamConstants.END_ELEMENT && reader.localName == "text") {
+                return if (textBuilder.isNotEmpty()) textBuilder.toString() else ""
             }
         }
         return if (textBuilder.isNotEmpty()) textBuilder.toString() else null
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/GeneralFeaturesTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/GeneralFeaturesTest.kt
index 7e7a2bc..6668220 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/GeneralFeaturesTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/GeneralFeaturesTest.kt
@@ -142,6 +142,10 @@
         tool.outputFormat = OutputFormat.NOW
 
         assertTrue(tool.canStreamNowEntriesImmediately())
+        assertTrue(tool.canUseStaxTextParsing())
+        assertTrue(!tool.shouldParseDataXmlWithStax())
+        tool.useStaxTextParser = true
+        assertTrue(tool.shouldParseDataXmlWithStax())
     }
 
     @Test
@@ -150,6 +154,17 @@
         tool.outputFormat = OutputFormat.CONLLU
 
         assertTrue(!tool.canStreamNowEntriesImmediately())
+        assertTrue(tool.canUseStaxTextParsing())
+    }
+
+    @Test
+    fun zipOutputKeepsDomParsingForDataXml() {
+        val tool = KorapXmlTool().apply {
+            outputFormat = OutputFormat.KORAP_XML
+            useStaxTextParser = true // requested, yet KORAP_XML output must not honour it
+        }
+        assertTrue(!tool.canUseStaxTextParsing())
+        assertTrue(!tool.shouldParseDataXmlWithStax())
     }
 
     @Test