Use thread local document builders in krill output
Change-Id: I123d5b7015ae4f6ac361884c158794640518be23
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 5147734..95fd2aa 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -649,6 +649,11 @@
var krillTarOutputStream: TarArchiveOutputStream? = null
var krillOutputFileName: String? = null
+ // Thread-local DocumentBuilder pool for parallel processing
+ private val threadLocalBuilder: ThreadLocal<DocumentBuilder> = ThreadLocal.withInitial {
+ fastDomFactory.newDocumentBuilder()
+ }
+
private val safeDomFactory: DocumentBuilderFactory by lazy {
DocumentBuilderFactory.newInstance().apply {
isNamespaceAware = false
@@ -1570,9 +1575,11 @@
try {
if (zipEntry.name.matches(Regex(".*(data|tokens|structure|morpho|dependency|sentences|constituency)\\.xml$"))) {
LOGGER.finer("Processing entry: ${zipEntry.name}, foundry=$foundry")
- // Ensure the entry stream and reader are closed to avoid native memory buildup
- val dbFactory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance()
- val dBuilder: DocumentBuilder = dbFactory.newDocumentBuilder()
+ // Use thread-local DocumentBuilder (reused, much faster than creating new ones)
+ val dBuilder: DocumentBuilder = threadLocalBuilder.get()
+ // Reset the builder state to avoid memory leaks
+ dBuilder.reset()
+
// In lemma-only mode, skip parsing data.xml entirely to reduce memory pressure
if (lemmaOnly && zipEntry.name.endsWith("data.xml")) {
return
@@ -3647,8 +3654,8 @@
val isAnnotationFoundry = zipName.matches(Regex(".*\\.[^/.]+\\.zip$"))
try {
- val dbFactory = DocumentBuilderFactory.newInstance()
- val dBuilder = dbFactory.newDocumentBuilder()
+ // Use thread-local DocumentBuilder
+ val dBuilder = threadLocalBuilder.get()
openZipFile(zipPath).use { zipFile ->
val entries = zipFile.entries
@@ -3664,6 +3671,7 @@
if (entry.name.matches(pattern)) {
try {
+ dBuilder.reset()
// Parse XML to extract docId attribute
val doc = zipFile.getInputStream(entry).use { inputStream ->
XMLCommentFilterReader(inputStream, "UTF-8").use { reader ->