Fixed OutOfMemoryError when annotating very long texts

Very long texts (e.g. multi-volume novels) no longer exhaust the heap: the XML is
now streamed directly to the output instead of being materialized as a String.

Change-Id: I78ae5de59443203310744aa6af5cdc2f95acfcd3
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 62eb1c3..06d9697 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## [v3.2.1] - unreleased
+
+### Fixed
+
+- Fixed OutOfMemoryError when annotating very long texts (novels) with ZIP output by streaming XML directly to output instead of materializing as String
+
 ## [v3.2.0] - 2026-03-15
 
 ### Fixed
@@ -7,6 +13,7 @@
 - Fixed heap issues with krill conversion
 - For krill output and morpho.xml files, bypass XMLCommentFilterReader
 - Fixed progress bar showing the full path of the file instead of the file name ([#17](https://github.com/KorAP/korapxmltool/issues/17))
+- Fixed OutOfMemoryError when annotating very long texts (novels) with ZIP output by streaming XML directly to output instead of materializing as String
 
 ### Added
 
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 2afaef7..eda59cb 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -3144,13 +3144,12 @@
                     compatibilityMode = COMPATIBILITY_MODE,
                     tokenSeparator = tokenSeparator
                 )
-                val morphoXml = KorapXmlFormatter.formatMorpho(context, dBuilder!!).toString()
                 val morphoPath = docId.replace(Regex("[_.]"), "/") + "/$morphoDir/morpho.xml"
                  val morphoEntry = ZipArchiveEntry(morphoPath)
                  morphoEntry.unixMode = ZIP_ENTRY_UNIX_MODE
                  synchronized(morphoZipOutputStream!!) {
                      morphoZipOutputStream!!.putArchiveEntry(morphoEntry)
-                     morphoZipOutputStream!!.write(morphoXml.toByteArray())
+                     KorapXmlFormatter.formatMorphoToStream(context, dBuilder!!, morphoZipOutputStream!!)
                      morphoZipOutputStream!!.closeArchiveEntry()
                  }
                  wroteOne = true
@@ -3176,13 +3175,12 @@
                     compatibilityMode = COMPATIBILITY_MODE,
                     tokenSeparator = tokenSeparator
                 )
-                val depXml = KorapXmlFormatter.formatDependency(context, dBuilder!!).toString()
                 val depPath = docId.replace(Regex("[_.]"), "/") + "/$depDir/dependency.xml"
                  val depEntry = ZipArchiveEntry(depPath)
                  depEntry.unixMode = ZIP_ENTRY_UNIX_MODE
                  synchronized(morphoZipOutputStream!!) {
                      morphoZipOutputStream!!.putArchiveEntry(depEntry)
-                     morphoZipOutputStream!!.write(depXml.toByteArray())
+                     KorapXmlFormatter.formatDependencyToStream(context, dBuilder!!, morphoZipOutputStream!!)
                      morphoZipOutputStream!!.closeArchiveEntry()
                  }
                  wroteOne = true
@@ -3209,13 +3207,12 @@
                     compatibilityMode = COMPATIBILITY_MODE,
                     tokenSeparator = tokenSeparator
                 )
-                val constXml = KorapXmlFormatter.formatConstituency(context, dBuilder!!).toString()
                 val constPath = docId.replace(Regex("[_.]"), "/") + "/$constDir/constituency.xml"
                  val constEntry = ZipArchiveEntry(constPath)
                  constEntry.unixMode = ZIP_ENTRY_UNIX_MODE
                  synchronized(morphoZipOutputStream!!) {
                      morphoZipOutputStream!!.putArchiveEntry(constEntry)
-                     morphoZipOutputStream!!.write(constXml.toByteArray())
+                     KorapXmlFormatter.formatConstituencyToStream(context, dBuilder!!, morphoZipOutputStream!!)
                      morphoZipOutputStream!!.closeArchiveEntry()
                  }
                  wroteOne = true
@@ -4161,7 +4158,7 @@
 
         try {
             val context = de.ids_mannheim.korapxmltools.formatters.OutputContext(
-                docId = tempDocId,
+                docId = docId,
                 foundry = actualFoundry,
                 tokens = tokens[tempDocId],
                 sentences = sentences[tempDocId],
@@ -4179,11 +4176,6 @@
                 compatibilityMode = COMPATIBILITY_MODE,
                 tokenSeparator = tokenSeparator
             )
-            val morphoXmlOutput = KorapXmlFormatter.formatMorpho(context, dBuilder!!)
-            val fixedMorphoXml = morphoXmlOutput.toString().replace(
-                "docid=\"$tempDocId\"",
-                "docid=\"$docId\""
-            )
 
             val morphoEntryPath = docId.replace(Regex("[_.]"), "/") + "/$actualFoundry/morpho.xml"
 
@@ -4191,7 +4183,7 @@
             morphoZipEntry.unixMode = ZIP_ENTRY_UNIX_MODE
             synchronized(morphoZipOutputStream!!) {
                 morphoZipOutputStream!!.putArchiveEntry(morphoZipEntry)
-                morphoZipOutputStream!!.write(fixedMorphoXml.toByteArray())
+                KorapXmlFormatter.formatMorphoToStream(context, dBuilder!!, morphoZipOutputStream!!)
                 morphoZipOutputStream!!.closeArchiveEntry()
             }
             val written = docsWrittenToZip.incrementAndGet()
@@ -4204,7 +4196,7 @@
         if (morpho[tempDocId]?.values?.any { it.head != null && it.head != "_" && it.deprel != null && it.deprel != "_" } == true) {
             try {
                 val context = de.ids_mannheim.korapxmltools.formatters.OutputContext(
-                    docId = tempDocId,
+                    docId = docId,
                     foundry = actualFoundry,
                     tokens = tokens[tempDocId],
                     sentences = sentences[tempDocId],
@@ -4222,11 +4214,6 @@
                     compatibilityMode = COMPATIBILITY_MODE,
                     tokenSeparator = tokenSeparator
                 )
-                val dependencyXmlOutput = KorapXmlFormatter.formatDependency(context, dBuilder!!)
-                val fixedDependencyXml = dependencyXmlOutput.toString().replace(
-                    "docid=\"$tempDocId\"",
-                    "docid=\"$docId\""
-                )
 
                 val dependencyEntryPath = docId.replace(Regex("[_.]"), "/") + "/$actualFoundry/dependency.xml"
 
@@ -4234,7 +4221,7 @@
                 dependencyZipEntry.unixMode = ZIP_ENTRY_UNIX_MODE
                 synchronized(morphoZipOutputStream!!) {
                     morphoZipOutputStream!!.putArchiveEntry(dependencyZipEntry)
-                    morphoZipOutputStream!!.write(fixedDependencyXml.toByteArray())
+                    KorapXmlFormatter.formatDependencyToStream(context, dBuilder!!, morphoZipOutputStream!!)
                     morphoZipOutputStream!!.closeArchiveEntry()
                 }
             } catch (e: Exception) {
@@ -4462,7 +4449,7 @@
                     val morphoPath = "$basePath/$morphoFoundry/morpho.xml"
 
                     val context = de.ids_mannheim.korapxmltools.formatters.OutputContext(
-                        docId = tempDocId,
+                        docId = doc.textId,
                         foundry = morphoFoundry,
                         tokens = getTokenSpansFromMorho(morphoSpans),
                         sentences = sentences[tempDocId],
@@ -4481,19 +4468,13 @@
                         tokenSeparator = tokenSeparator
                     )
 
-                    val morphoXmlOutput = KorapXmlFormatter.formatMorpho(context, dBuilder!!)
-                    val fixedMorphoXml = morphoXmlOutput.toString().replace(
-                        "docid=\"$tempDocId\"",
-                        "docid=\"${doc.textId}\""
-                    )
-
                     val morphoZipEntry = ZipArchiveEntry(morphoPath)
                     morphoZipEntry.unixMode = ZIP_ENTRY_UNIX_MODE
                     zipOutputStream.putArchiveEntry(morphoZipEntry)
-                    zipOutputStream.write(fixedMorphoXml.toByteArray())
+                    KorapXmlFormatter.formatMorphoToStream(context, dBuilder!!, zipOutputStream)
                     zipOutputStream.closeArchiveEntry()
 
-                    LOGGER.fine("Wrote $morphoPath (${fixedMorphoXml.length} bytes)")
+                    LOGGER.fine("Wrote $morphoPath")
                 } catch (e: Exception) {
                     LOGGER.severe("ERROR generating morpho.xml for ${doc.textId}: ${e.message}")
                     throw e
@@ -4506,7 +4487,7 @@
                         val dependencyPath = "$basePath/$dependencyFoundry/dependency.xml"
 
                         val context = de.ids_mannheim.korapxmltools.formatters.OutputContext(
-                            docId = tempDocId,
+                            docId = doc.textId,
                             foundry = dependencyFoundry,
                             tokens = getTokenSpansFromMorho(morphoSpans),
                             sentences = sentences[tempDocId],
@@ -4525,19 +4506,13 @@
                             tokenSeparator = tokenSeparator
                         )
 
-                        val dependencyXmlOutput = KorapXmlFormatter.formatDependency(context, dBuilder!!)
-                        val fixedDependencyXml = dependencyXmlOutput.toString().replace(
-                            "docid=\"$tempDocId\"",
-                            "docid=\"${doc.textId}\""
-                        )
-
                         val dependencyZipEntry = ZipArchiveEntry(dependencyPath)
                         dependencyZipEntry.unixMode = ZIP_ENTRY_UNIX_MODE
                         zipOutputStream.putArchiveEntry(dependencyZipEntry)
-                        zipOutputStream.write(fixedDependencyXml.toByteArray())
+                        KorapXmlFormatter.formatDependencyToStream(context, dBuilder!!, zipOutputStream)
                         zipOutputStream.closeArchiveEntry()
 
-                        LOGGER.fine("Wrote $dependencyPath (${fixedDependencyXml.length} bytes)")
+                        LOGGER.fine("Wrote $dependencyPath")
                     } catch (e: Exception) {
                         LOGGER.severe("ERROR generating dependency.xml for ${doc.textId}: ${e.message}")
                         throw e
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KorapXmlFormatter.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KorapXmlFormatter.kt
index df01b27..ec9a5aa 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KorapXmlFormatter.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KorapXmlFormatter.kt
@@ -2,6 +2,7 @@
 
 import de.ids_mannheim.korapxmltools.ConstituencyParserBridge
 import org.w3c.dom.Document
+import java.io.OutputStream
 import java.io.StringWriter
 import java.util.logging.Logger
 import javax.xml.parsers.DocumentBuilder
@@ -35,9 +36,9 @@
     }
 
     /**
-     * Format morphological annotations as KorAP-XML.
+     * Build the morpho DOM document from the given context.
      */
-    fun formatMorpho(context: OutputContext, dBuilder: DocumentBuilder): StringBuilder {
+    private fun buildMorphoDocument(context: OutputContext, dBuilder: DocumentBuilder): Document {
         val doc: Document = dBuilder.newDocument()
 
         // Root element
@@ -117,13 +118,29 @@
             spanList.appendChild(spanNode)
         }
         
-        return transformToString(doc)
+        return doc
     }
 
     /**
-     * Format dependency annotations as KorAP-XML.
+     * Format morphological annotations as KorAP-XML.
      */
-    fun formatDependency(context: OutputContext, dBuilder: DocumentBuilder): StringBuilder {
+    fun formatMorpho(context: OutputContext, dBuilder: DocumentBuilder): StringBuilder =
+        // Build the morpho DOM, then serialize it into an in-memory StringBuilder.
+        transformToString(buildMorphoDocument(context, dBuilder))
+
+    /**
+     * Serialize the morphological annotations as KorAP-XML straight into [out]. No String copy of the
+     * document is created, which for very long texts (e.g. novels) could exceed the JVM's 2 GB per-array limit.
+     */
+    fun formatMorphoToStream(context: OutputContext, dBuilder: DocumentBuilder, out: OutputStream) {
+        val morphoDoc = buildMorphoDocument(context, dBuilder)
+        transformToStream(morphoDoc, out)
+    }
+
+    /**
+     * Build the dependency DOM document from the given context.
+     */
+    private fun buildDependencyDocument(context: OutputContext, dBuilder: DocumentBuilder): Document {
         val doc: Document = dBuilder.newDocument()
 
         // Root element
@@ -188,13 +205,27 @@
             spanList.appendChild(spanNode)
         }
         
-        return transformToString(doc)
+        return doc
     }
 
     /**
-     * Format constituency annotations as KorAP-XML.
+     * Format dependency annotations as KorAP-XML.
      */
-    fun formatConstituency(context: OutputContext, dBuilder: DocumentBuilder): StringBuilder {
+    fun formatDependency(context: OutputContext, dBuilder: DocumentBuilder): StringBuilder =
+        // Build the dependency DOM, then serialize it into an in-memory StringBuilder.
+        transformToString(buildDependencyDocument(context, dBuilder))
+
+    /** Serialize the dependency annotations as KorAP-XML straight into [out]. */
+    fun formatDependencyToStream(context: OutputContext, dBuilder: DocumentBuilder, out: OutputStream) {
+        val dependencyDoc = buildDependencyDocument(context, dBuilder)
+        // Written directly to the caller's stream; no intermediate String is produced.
+        transformToStream(dependencyDoc, out)
+    }
+
+    /**
+     * Build the constituency DOM document from the given context.
+     */
+    private fun buildConstituencyDocument(context: OutputContext, dBuilder: DocumentBuilder): Document {
         val doc: Document = dBuilder.newDocument()
 
         // Root element
@@ -210,7 +241,7 @@
         val trees = context.constituencyTrees
         if (trees == null || trees.isEmpty()) {
             LOGGER.warning("No constituency trees found for ${context.docId}")
-            return StringBuilder("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
+            return doc
         }
 
         // Process each tree
@@ -255,10 +286,35 @@
             }
         }
 
+        return doc
+    }
+
+    /**
+     * Format constituency annotations as KorAP-XML; without any trees only the XML declaration is returned.
+     */
+    fun formatConstituency(context: OutputContext, dBuilder: DocumentBuilder): StringBuilder {
+        val doc = buildConstituencyDocument(context, dBuilder)
+        // Built above even for the empty case: the builder is what logs the "no trees" warning.
+        val trees = context.constituencyTrees
+        if (trees == null || trees.isEmpty()) {
+            return StringBuilder("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
+        }
         return transformToString(doc, indentAmount = "3")
     }
 
     /**
+     * Stream constituency annotations as KorAP-XML to [out]; with no trees only the XML declaration is written.
+     */
+    fun formatConstituencyToStream(context: OutputContext, dBuilder: DocumentBuilder, out: OutputStream) {
+        val doc = buildConstituencyDocument(context, dBuilder) // built first so its no-trees warning is logged
+        val trees = context.constituencyTrees
+        when {
+            trees == null || trees.isEmpty() -> out.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n".toByteArray())
+            else -> transformToStream(doc, out, indentAmount = "3")
+        }
+    }
+
+    /**
      * Transform DOM document to formatted XML string.
      */
     private fun transformToString(doc: Document, indentAmount: String = "1"): StringBuilder {
@@ -273,4 +329,19 @@
 
         return StringBuilder(streamResult.writer.toString())
     }
+
+    /**
+     * Serialize a DOM document straight into an OutputStream; no String copy is built.
+     * That sidesteps the java.lang.OutOfMemoryError raised when a very large document,
+     * serialized as a String, would exceed the JVM's ~2 GB per-array limit.
+     */
+    private fun transformToStream(doc: Document, out: OutputStream, indentAmount: String = "1") {
+        val serializer = TransformerFactory.newInstance().newTransformer().apply {
+            setOutputProperty(OutputKeys.INDENT, "yes")
+            setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no")
+            setOutputProperty("{http://xml.apache.org/xslt}indent-amount", indentAmount)
+        }
+        // indentAmount is passed through as the "indent-amount" output property set above.
+        serializer.transform(DOMSource(doc), StreamResult(out))
+    }
 }