automatically gzip output for the `now`, `w2v` and `conllu` target formats if the `-o` output path ends with `.gz`
Change-Id: I2e9b90a51bd183bf02e6d18df65dcf40b4ef6b05
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 09c55f9..ff60c38 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@
### Added
- integrated support for TreeTagger (`-T treetagger`) and spaCy (`-T spacy`) annotations
+- automatically gzip output for the `now`, `w2v` and `conllu` target formats if the output file name (`-o` option) ends with `.gz`
## [v3.0.0] - 2025-11-27
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index d9113b1..15aaf2b 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -840,6 +840,14 @@
}
// Create parent directories
file.parentFile?.mkdirs()
+
+ // Initialize output writer with optional gzip compression
+ val outputStream = FileOutputStream(file)
+ textOutputWriter = if (finalOutputPath.endsWith(".gz")) {
+ BufferedWriter(OutputStreamWriter(GZIPOutputStream(outputStream), StandardCharsets.UTF_8))
+ } else {
+ BufferedWriter(OutputStreamWriter(outputStream, StandardCharsets.UTF_8))
+ }
}
LOGGER.info("Processing zip files: " + zipFileNames!!.joinToString(", "))
@@ -1012,6 +1020,7 @@
var krillTarOutputStream: TarArchiveOutputStream? = null
var krillOutputFileName: String? = null
private var krillOutputPath: String? = null
+ private var textOutputWriter: BufferedWriter? = null
// Fast DocumentBuilderFactory without security features (safe for trusted input)
private val fastDomFactory: DocumentBuilderFactory by lazy {
@@ -1468,6 +1477,10 @@
// No external worker: ensure progress bar is closed (e.g., internal tagger -t)
progressBar?.close()
}
+
+ // Close text output writer if it was used
+ textOutputWriter?.close()
+
// Shutdown entry executor
entryExecutor?.shutdown()