For krill output and morpho.xml files, bypass XMLCommentFilterReader
Change-Id: I2ee10cb5e798b7e07fcda43d2538cc09a351c416
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 83cf74f..b87f88a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,7 @@
- Fixed `-L` (log directory) option being ignored when using internal taggers (`-T opennlp`, `-T marmot`, etc.)
- Renamed `textExternalLinks` metadata field to `textExternalLink` (singular) ([#26](https://github.com/KorAP/korapxmltool/issues/26))
+- Use `rend` attribute as external link title, if available ([#27](https://github.com/KorAP/korapxmltool/issues/27))
## [v3.1.2] - 2025-12-18
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 4dc5133..12651d1 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -2865,12 +2865,18 @@
LOGGER.finer("Processing entry (StAX): ${zipEntry.name}, foundry=$foundry")
val factory = xmlInputFactory.get()
val inputStream = zipFile.getInputStream(zipEntry)
- val filterReader = XMLCommentFilterReader(inputStream, "UTF-8")
+ val entryFileName = zipEntry.name.replace(Regex(".*?/([^/]+\\.xml)$"), "$1")
+ // For Krill output and morpho.xml entries, bypass XMLCommentFilterReader: on large files (80+ MB)
+ // Xerces falls back to the reader's single-char read(), which throws UnsupportedOperationException.
+ val filterReader = if (outputFormat != OutputFormat.KRILL && entryFileName != "morpho.xml") {
+ XMLCommentFilterReader(inputStream, "UTF-8")
+ } else null
val reader = try {
- factory.createXMLStreamReader(filterReader)
+ if (filterReader != null) factory.createXMLStreamReader(filterReader)
+ else factory.createXMLStreamReader(inputStream, "UTF-8")
} catch (e: Exception) {
LOGGER.warning("Error creating StAX reader: " + zipEntry.name + " " + e.message)
- filterReader.close()
+ filterReader?.close() ?: inputStream.close()
return
}
@@ -2887,7 +2893,7 @@
if (docId == null) return
if (siglePattern != null && !Regex(siglePattern!!).containsMatchIn(docId)) return
- val fileName = zipEntry.name.replace(Regex(".*?/([^/]+\\.xml)$"), "$1")
+ val fileName = entryFileName
when (fileName) {
"data.xml" -> {
@@ -3005,7 +3011,8 @@
e.printStackTrace()
} finally {
try { reader.close() } catch (_: Exception) {}
- try { filterReader.close() } catch (_: Exception) {}
+ // closing filterReader also closes the underlying inputStream; when the filter was bypassed, close inputStream directly
+ try { filterReader?.close() ?: inputStream.close() } catch (_: Exception) {}
}
}