Fixed OutOfMemoryError when annotating very long texts
(multi-volume novels) by streaming XML directly to the output instead of
materializing it as a String
Change-Id: I78ae5de59443203310744aa6af5cdc2f95acfcd3
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 62eb1c3..06d9697 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
# Changelog
+## [v3.2.1] - unreleased
+
+### Fixed
+
+- Fixed OutOfMemoryError when annotating very long texts (novels) with ZIP output by streaming XML directly to the output instead of materializing it as a String
+
## [v3.2.0] - 2026-03-15
### Fixed
@@ -7,6 +13,7 @@
- Fixed heap issues with krill conversion
- For krill output and morpho.xml files, bypass XMLCommentFilterReader
- Fixed progress bar showing the full path of the file instead of the file name ([#17](https://github.com/KorAP/korapxmltool/issues/17))
+- Fixed OutOfMemoryError when annotating very long texts (novels) with ZIP output by streaming XML directly to the output instead of materializing it as a String
### Added
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 2afaef7..eda59cb 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -3144,13 +3144,12 @@
compatibilityMode = COMPATIBILITY_MODE,
tokenSeparator = tokenSeparator
)
- val morphoXml = KorapXmlFormatter.formatMorpho(context, dBuilder!!).toString()
val morphoPath = docId.replace(Regex("[_.]"), "/") + "/$morphoDir/morpho.xml"
val morphoEntry = ZipArchiveEntry(morphoPath)
morphoEntry.unixMode = ZIP_ENTRY_UNIX_MODE
synchronized(morphoZipOutputStream!!) {
morphoZipOutputStream!!.putArchiveEntry(morphoEntry)
- morphoZipOutputStream!!.write(morphoXml.toByteArray())
+ KorapXmlFormatter.formatMorphoToStream(context, dBuilder!!, morphoZipOutputStream!!)
morphoZipOutputStream!!.closeArchiveEntry()
}
wroteOne = true
@@ -3176,13 +3175,12 @@
compatibilityMode = COMPATIBILITY_MODE,
tokenSeparator = tokenSeparator
)
- val depXml = KorapXmlFormatter.formatDependency(context, dBuilder!!).toString()
val depPath = docId.replace(Regex("[_.]"), "/") + "/$depDir/dependency.xml"
val depEntry = ZipArchiveEntry(depPath)
depEntry.unixMode = ZIP_ENTRY_UNIX_MODE
synchronized(morphoZipOutputStream!!) {
morphoZipOutputStream!!.putArchiveEntry(depEntry)
- morphoZipOutputStream!!.write(depXml.toByteArray())
+ KorapXmlFormatter.formatDependencyToStream(context, dBuilder!!, morphoZipOutputStream!!)
morphoZipOutputStream!!.closeArchiveEntry()
}
wroteOne = true
@@ -3209,13 +3207,12 @@
compatibilityMode = COMPATIBILITY_MODE,
tokenSeparator = tokenSeparator
)
- val constXml = KorapXmlFormatter.formatConstituency(context, dBuilder!!).toString()
val constPath = docId.replace(Regex("[_.]"), "/") + "/$constDir/constituency.xml"
val constEntry = ZipArchiveEntry(constPath)
constEntry.unixMode = ZIP_ENTRY_UNIX_MODE
synchronized(morphoZipOutputStream!!) {
morphoZipOutputStream!!.putArchiveEntry(constEntry)
- morphoZipOutputStream!!.write(constXml.toByteArray())
+ KorapXmlFormatter.formatConstituencyToStream(context, dBuilder!!, morphoZipOutputStream!!)
morphoZipOutputStream!!.closeArchiveEntry()
}
wroteOne = true
@@ -4161,7 +4158,7 @@
try {
val context = de.ids_mannheim.korapxmltools.formatters.OutputContext(
- docId = tempDocId,
+ docId = docId,
foundry = actualFoundry,
tokens = tokens[tempDocId],
sentences = sentences[tempDocId],
@@ -4179,11 +4176,6 @@
compatibilityMode = COMPATIBILITY_MODE,
tokenSeparator = tokenSeparator
)
- val morphoXmlOutput = KorapXmlFormatter.formatMorpho(context, dBuilder!!)
- val fixedMorphoXml = morphoXmlOutput.toString().replace(
- "docid=\"$tempDocId\"",
- "docid=\"$docId\""
- )
val morphoEntryPath = docId.replace(Regex("[_.]"), "/") + "/$actualFoundry/morpho.xml"
@@ -4191,7 +4183,7 @@
morphoZipEntry.unixMode = ZIP_ENTRY_UNIX_MODE
synchronized(morphoZipOutputStream!!) {
morphoZipOutputStream!!.putArchiveEntry(morphoZipEntry)
- morphoZipOutputStream!!.write(fixedMorphoXml.toByteArray())
+ KorapXmlFormatter.formatMorphoToStream(context, dBuilder!!, morphoZipOutputStream!!)
morphoZipOutputStream!!.closeArchiveEntry()
}
val written = docsWrittenToZip.incrementAndGet()
@@ -4204,7 +4196,7 @@
if (morpho[tempDocId]?.values?.any { it.head != null && it.head != "_" && it.deprel != null && it.deprel != "_" } == true) {
try {
val context = de.ids_mannheim.korapxmltools.formatters.OutputContext(
- docId = tempDocId,
+ docId = docId,
foundry = actualFoundry,
tokens = tokens[tempDocId],
sentences = sentences[tempDocId],
@@ -4222,11 +4214,6 @@
compatibilityMode = COMPATIBILITY_MODE,
tokenSeparator = tokenSeparator
)
- val dependencyXmlOutput = KorapXmlFormatter.formatDependency(context, dBuilder!!)
- val fixedDependencyXml = dependencyXmlOutput.toString().replace(
- "docid=\"$tempDocId\"",
- "docid=\"$docId\""
- )
val dependencyEntryPath = docId.replace(Regex("[_.]"), "/") + "/$actualFoundry/dependency.xml"
@@ -4234,7 +4221,7 @@
dependencyZipEntry.unixMode = ZIP_ENTRY_UNIX_MODE
synchronized(morphoZipOutputStream!!) {
morphoZipOutputStream!!.putArchiveEntry(dependencyZipEntry)
- morphoZipOutputStream!!.write(fixedDependencyXml.toByteArray())
+ KorapXmlFormatter.formatDependencyToStream(context, dBuilder!!, morphoZipOutputStream!!)
morphoZipOutputStream!!.closeArchiveEntry()
}
} catch (e: Exception) {
@@ -4462,7 +4449,7 @@
val morphoPath = "$basePath/$morphoFoundry/morpho.xml"
val context = de.ids_mannheim.korapxmltools.formatters.OutputContext(
- docId = tempDocId,
+ docId = doc.textId,
foundry = morphoFoundry,
tokens = getTokenSpansFromMorho(morphoSpans),
sentences = sentences[tempDocId],
@@ -4481,19 +4468,13 @@
tokenSeparator = tokenSeparator
)
- val morphoXmlOutput = KorapXmlFormatter.formatMorpho(context, dBuilder!!)
- val fixedMorphoXml = morphoXmlOutput.toString().replace(
- "docid=\"$tempDocId\"",
- "docid=\"${doc.textId}\""
- )
-
val morphoZipEntry = ZipArchiveEntry(morphoPath)
morphoZipEntry.unixMode = ZIP_ENTRY_UNIX_MODE
zipOutputStream.putArchiveEntry(morphoZipEntry)
- zipOutputStream.write(fixedMorphoXml.toByteArray())
+ KorapXmlFormatter.formatMorphoToStream(context, dBuilder!!, zipOutputStream)
zipOutputStream.closeArchiveEntry()
- LOGGER.fine("Wrote $morphoPath (${fixedMorphoXml.length} bytes)")
+ LOGGER.fine("Wrote $morphoPath")
} catch (e: Exception) {
LOGGER.severe("ERROR generating morpho.xml for ${doc.textId}: ${e.message}")
throw e
@@ -4506,7 +4487,7 @@
val dependencyPath = "$basePath/$dependencyFoundry/dependency.xml"
val context = de.ids_mannheim.korapxmltools.formatters.OutputContext(
- docId = tempDocId,
+ docId = doc.textId,
foundry = dependencyFoundry,
tokens = getTokenSpansFromMorho(morphoSpans),
sentences = sentences[tempDocId],
@@ -4525,19 +4506,13 @@
tokenSeparator = tokenSeparator
)
- val dependencyXmlOutput = KorapXmlFormatter.formatDependency(context, dBuilder!!)
- val fixedDependencyXml = dependencyXmlOutput.toString().replace(
- "docid=\"$tempDocId\"",
- "docid=\"${doc.textId}\""
- )
-
val dependencyZipEntry = ZipArchiveEntry(dependencyPath)
dependencyZipEntry.unixMode = ZIP_ENTRY_UNIX_MODE
zipOutputStream.putArchiveEntry(dependencyZipEntry)
- zipOutputStream.write(fixedDependencyXml.toByteArray())
+ KorapXmlFormatter.formatDependencyToStream(context, dBuilder!!, zipOutputStream)
zipOutputStream.closeArchiveEntry()
- LOGGER.fine("Wrote $dependencyPath (${fixedDependencyXml.length} bytes)")
+ LOGGER.fine("Wrote $dependencyPath")
} catch (e: Exception) {
LOGGER.severe("ERROR generating dependency.xml for ${doc.textId}: ${e.message}")
throw e
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KorapXmlFormatter.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KorapXmlFormatter.kt
index df01b27..ec9a5aa 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KorapXmlFormatter.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/KorapXmlFormatter.kt
@@ -2,6 +2,7 @@
import de.ids_mannheim.korapxmltools.ConstituencyParserBridge
import org.w3c.dom.Document
+import java.io.OutputStream
import java.io.StringWriter
import java.util.logging.Logger
import javax.xml.parsers.DocumentBuilder
@@ -35,9 +36,9 @@
}
/**
- * Format morphological annotations as KorAP-XML.
+ * Build the morpho DOM document from the given context.
*/
- fun formatMorpho(context: OutputContext, dBuilder: DocumentBuilder): StringBuilder {
+ private fun buildMorphoDocument(context: OutputContext, dBuilder: DocumentBuilder): Document {
val doc: Document = dBuilder.newDocument()
// Root element
@@ -117,13 +118,29 @@
spanList.appendChild(spanNode)
}
- return transformToString(doc)
+ return doc
}
/**
- * Format dependency annotations as KorAP-XML.
+ * Format morphological annotations as KorAP-XML.
*/
- fun formatDependency(context: OutputContext, dBuilder: DocumentBuilder): StringBuilder {
+ fun formatMorpho(context: OutputContext, dBuilder: DocumentBuilder): StringBuilder {
+ return transformToString(buildMorphoDocument(context, dBuilder))
+ }
+
+ /**
+ * Stream morphological annotations as KorAP-XML directly to an OutputStream.
+ * Avoids materializing the entire XML as a String, which for very long texts
+ * (e.g. novels) can exceed the JVM's per-array size limit of about 2 GB.
+ */
+ fun formatMorphoToStream(context: OutputContext, dBuilder: DocumentBuilder, out: OutputStream) {
+ transformToStream(buildMorphoDocument(context, dBuilder), out)
+ }
+
+ /**
+ * Build the dependency DOM document from the given context.
+ */
+ private fun buildDependencyDocument(context: OutputContext, dBuilder: DocumentBuilder): Document {
val doc: Document = dBuilder.newDocument()
// Root element
@@ -188,13 +205,27 @@
spanList.appendChild(spanNode)
}
- return transformToString(doc)
+ return doc
}
/**
- * Format constituency annotations as KorAP-XML.
+ * Format dependency annotations as KorAP-XML.
*/
- fun formatConstituency(context: OutputContext, dBuilder: DocumentBuilder): StringBuilder {
+ fun formatDependency(context: OutputContext, dBuilder: DocumentBuilder): StringBuilder {
+ return transformToString(buildDependencyDocument(context, dBuilder))
+ }
+
+ /**
+ * Stream dependency annotations as KorAP-XML directly to an OutputStream.
+ */
+ fun formatDependencyToStream(context: OutputContext, dBuilder: DocumentBuilder, out: OutputStream) {
+ transformToStream(buildDependencyDocument(context, dBuilder), out)
+ }
+
+ /**
+ * Build the constituency DOM document from the given context.
+ */
+ private fun buildConstituencyDocument(context: OutputContext, dBuilder: DocumentBuilder): Document {
val doc: Document = dBuilder.newDocument()
// Root element
@@ -210,7 +241,7 @@
val trees = context.constituencyTrees
if (trees == null || trees.isEmpty()) {
LOGGER.warning("No constituency trees found for ${context.docId}")
- return StringBuilder("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
+ return doc
}
// Process each tree
@@ -255,10 +286,35 @@
}
}
+ return doc
+ }
+
+ /**
+ * Format constituency annotations as KorAP-XML.
+ */
+ fun formatConstituency(context: OutputContext, dBuilder: DocumentBuilder): StringBuilder {
+ val doc = buildConstituencyDocument(context, dBuilder)
+ // No trees: return only the XML declaration; the document built above is discarded
+ val trees = context.constituencyTrees
+ if (trees == null || trees.isEmpty()) {
+ return StringBuilder("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
+ }
return transformToString(doc, indentAmount = "3")
}
/**
+ * Stream constituency annotations as KorAP-XML directly to an OutputStream.
+ */
+ fun formatConstituencyToStream(context: OutputContext, dBuilder: DocumentBuilder, out: OutputStream) {
+ val trees = context.constituencyTrees
+ if (trees == null || trees.isEmpty()) {
+ out.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n".toByteArray())
+ return
+ }
+ transformToStream(buildConstituencyDocument(context, dBuilder), out, indentAmount = "3")
+ }
+
+ /**
* Transform DOM document to formatted XML string.
*/
private fun transformToString(doc: Document, indentAmount: String = "1"): StringBuilder {
@@ -273,4 +329,19 @@
return StringBuilder(streamResult.writer.toString())
}
+
+ /**
+ * Transform DOM document directly to an OutputStream, avoiding String materialization.
+ * This prevents java.lang.OutOfMemoryError for very large documents that would
+ * exceed the JVM's ~2 GB per-array limit when serialized as a String.
+ */
+ private fun transformToStream(doc: Document, out: OutputStream, indentAmount: String = "1") {
+ val transformerFactory = TransformerFactory.newInstance()
+ val transformer = transformerFactory.newTransformer()
+ transformer.setOutputProperty(OutputKeys.INDENT, "yes")
+ transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no")
+ transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", indentAmount)
+ val domSource = DOMSource(doc)
+ transformer.transform(domSource, StreamResult(out))
+ }
}