Add more lemma-only focused options
Change-Id: I1a8098babe397713f5acc7d1de0277cc786420d9
diff --git a/Readme.md b/Readme.md
index 2a3d6d4..a0c5f92 100644
--- a/Readme.md
+++ b/Readme.md
@@ -82,6 +82,27 @@
If a lemma for a token is missing (`_`) the surface form is used as fallback.
+### Lemma-only mode and I/O scheduling
+
+- `--lemma-only`: For `-f w2v` and `-f now`, skip loading `data.xml` and output only lemmas from `morpho.xml`. This reduces memory use and increases throughput (see the minimal example below).
+- `--sequential`: Process entries inside each zip sequentially (zips can still run in parallel). Recommended for `w2v`/`now` to preserve locality and lower memory use.
+- `--zip-parallelism N`: Limit how many zips are processed concurrently (defaults to `--threads`). Helps avoid disk thrashing and pressure on the native inflater.
+- `--exclude-zip-glob GLOB` (repeatable): Skip zip basenames that match the glob (e.g., `--exclude-zip-glob 'w?d24.tree_tagger.zip'`).
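+- `--mem-stats-interval N`: Log memory and cache statistics (MEM-STATS lines) every N processed documents (0 disables; default: 0).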
+
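+A minimal lemma-only export in word2vec format (file names are hypothetical):
+
+```
+java -jar korapxmltool.jar --lemma-only -f w2v corpus.tree_tagger.zip > corpus.lemma.txt
+```
+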
+Example of a large NOW export with progress logging and exclusions:
+
+```
+java -Xmx64G -XX:+UseG1GC -Djdk.util.zip.disableMemoryMapping=true -Djdk.util.zip.reuseInflater=true \
+ -jar korapxmltool.jar -l info --threads 100 --zip-parallelism 8 \
+ --lemma-only --sequential -f now \
+ --exclude-zip-glob 'w?d24.tree_tagger.zip' \
+ /vol/corpora/DeReKo/current/KorAP/zip/*24.tree_tagger.zip | pv > dach2024.lemma.txt
+```
+
+At INFO level the tool logs:
+- The zip processing order with file sizes (largest first in `--lemma-only` mode).
+- For each zip: a start message including its size, and a completion line with cumulative progress, ETA, and average throughput in MB/s (sample below).
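+
+For illustration, the progress lines look roughly like this (paths and figures invented):
+
+```
+Zip processing order (3 file(s), total 4.2 GB):
+1/3: /data/zips/a24.tree_tagger.zip (2.1 GB)
+Processing zip 1/3: /data/zips/a24.tree_tagger.zip (2.1 GB) in thread 42
+Finished zip 1/3: /data/zips/a24.tree_tagger.zip (2.1 GB). Progress: 50.0%, ETA 00:03:12 at 11.18 MB/s
+```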
+
## Annotation
### Tagging with integrated MarMoT POS tagger directly to a new KorAP-XML ZIP file
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index df2c5a1..fc97baf 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -20,6 +20,7 @@
import java.util.concurrent.Callable
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.Executors
+import java.util.concurrent.atomic.AtomicLong
import java.util.logging.ConsoleHandler
import java.util.logging.Level
import java.util.logging.LogManager
@@ -128,6 +129,16 @@
}
@Option(
+ names = ["--exclude-zip-glob"],
+ paramLabel = "GLOB",
+ description = [
+ "Exclude zip files whose basename matches the glob (e.g., 'w?d24.tree_tagger.zip').",
+ "May be repeated. Applied to basenames, not full paths."
+ ]
+ )
+ var excludeZipGlobs: MutableList<String> = mutableListOf()
+
+ @Option(
names = ["--token-separator", "-s"],
paramLabel = "STRING",
defaultValue = "\n",
@@ -170,17 +181,48 @@
}
@Option(
+ names = ["--zip-parallelism"],
+ paramLabel = "N",
+ description = ["Maximum number of zip files to process concurrently. Defaults to --threads."]
+ )
+ var zipParallelism: Int? = null
+
+ @Option(
+ names = ["--sequential"],
+ description = [
+ "Process entries inside each zip sequentially; zips processed in parallel (only for word2vec/now)."
+ ]
+ )
+ var sequentialInZip: Boolean = false
+
+ @Option(
names = ["--overwrite", "-o"],
description = ["Overwrite existing files"]
)
var overwrite: Boolean = false
@Option(
+ names = ["--mem-stats-interval"],
+ paramLabel = "N",
+ description = ["Log memory and cache statistics every N processed documents (0 disables; default: 0)"]
+ )
+ var memStatsInterval: Int = 0
+
+ @Option(
names = ["--lemma"],
description = ["In word2vec/now output modes, output lemmas instead of surface tokens when lemma annotations are available (requires corresponding morpho annotation XML)"]
)
var useLemma: Boolean = false
+ @Option(
+ names = ["--lemma-only"],
+ description = [
+ "Do not load texts from data.xml and output only lemmas (requires morpho.xml).",
+ "Only valid with -f word2vec or -f now; implies --lemma."
+ ]
+ )
+ var lemmaOnly: Boolean = false
+
private var taggerName: String? = null
private var taggerModel: String? = null
@Option(
@@ -248,6 +290,13 @@
Level.WARNING
}
+ if (lemmaOnly) {
+ useLemma = true
+ if (outputFormat != OutputFormat.WORD2VEC && outputFormat != OutputFormat.NOW) {
+ throw ParameterException(spec.commandLine(), "--lemma-only is supported only with -f word2vec or -f now")
+ }
+ }
+
LOGGER.info("Processing zip files: " + zipFileNames!!.joinToString(", "))
korapxml2conllu(zipFileNames!!)
@@ -265,9 +314,18 @@
val fnames: ConcurrentHashMap<String, String> = ConcurrentHashMap()
val metadata: ConcurrentHashMap<String, Array<String>> = ConcurrentHashMap()
val extraFeatures: ConcurrentHashMap<String, MutableMap<String, String>> = ConcurrentHashMap()
+ private val processedDocs = java.util.concurrent.atomic.AtomicInteger(0)
var taggerToolBridges: ConcurrentHashMap<Long, TaggerToolBridge?> = ConcurrentHashMap()
var parserToolBridges: ConcurrentHashMap<Long, ParserToolBridge?> = ConcurrentHashMap()
+ // Zip progress tracking for logging (zipNumber/zipTotal)
+ private val zipOrdinals: ConcurrentHashMap<String, Int> = ConcurrentHashMap()
+ private var totalZips: Int = 0
+ private val zipSizes: ConcurrentHashMap<String, Long> = ConcurrentHashMap()
+ private val processedZipBytes: AtomicLong = AtomicLong(0)
+ private var totalZipBytes: Long = 0
+ private var startTimeMillis: Long = 0
+
var dbFactory: DocumentBuilderFactory? = null
var dBuilder: DocumentBuilder? = null
var morphoZipOutputStream: ZipArchiveOutputStream? = null
@@ -296,12 +354,51 @@
}
var zips: Array<String> = args
+ if (excludeZipGlobs.isNotEmpty()) {
+ val before = zips.size
+ val patterns = excludeZipGlobs.map { globToRegex(it) }
+ zips = zips.filter { zipPath ->
+ val base = File(zipPath).name
+ patterns.none { rx -> rx.matches(base) }
+ }.toTypedArray()
+ val excluded = before - zips.size
+ if (excluded > 0) {
+ LOGGER.info("Excluded $excluded of $before zip(s) by glob(s): ${excludeZipGlobs.joinToString(", ")}")
+ }
+ }
+ // Initialize zip progress tracking and sizes
+ startTimeMillis = System.currentTimeMillis()
+ processedZipBytes.set(0)
+ totalZips = zips.size
+ zipOrdinals.clear()
+ zipSizes.clear()
+ zips.forEach { zip -> zipSizes[zip] = try { File(zip).length() } catch (_: Exception) { 0L } }
+ totalZipBytes = zipSizes.values.sum()
+ // In lemma-only mode, process largest zips first
+ if (lemmaOnly) {
+ zips = zips.sortedByDescending { zipSizes[it] ?: 0L }.toTypedArray()
+ }
+ zips.forEachIndexed { index, zip -> zipOrdinals[zip] = index + 1 }
+
+ // Log zip order with sizes so the user can verify sorting
+ val totalHuman = humanBytes(totalZipBytes)
+ LOGGER.info("Zip processing order (${zips.size} file(s), total ${totalHuman}):")
+ zips.forEachIndexed { idx, zip ->
+ val size = zipSizes[zip] ?: 0L
+ LOGGER.info(String.format(Locale.ROOT, "%d/%d: %s (%s)", idx + 1, zips.size, zip, humanBytes(size)))
+ }
+
+ if (sequentialInZip) {
+ if (outputFormat != OutputFormat.WORD2VEC && outputFormat != OutputFormat.NOW) {
+ throw ParameterException(spec.commandLine(), "--sequential is supported only with -f word2vec or -f now")
+ }
+ }
if (maxThreads > 1) {
- LOGGER.info("Processing zip files in parallel with $maxThreads threads")
- Arrays.stream(zips).parallel().forEach { zipFilePath ->
- processZipFile((zipFilePath ?: "").toString(), getFoundryFromZipFileNames(zips))
- }
+ val foundry = getFoundryFromZipFileNames(zips)
+ val parallelism = (zipParallelism ?: maxThreads).coerceAtLeast(1)
+ LOGGER.info("Processing zips with ordered queue; parallelism=$parallelism; entries ${if (sequentialInZip) "sequential" else "parallel"}")
+ processZipsWithQueue(zips, foundry, parallelism)
} else {
LOGGER.info("Processing zip files sequentially")
Arrays.stream(zips).forEachOrdered { zipFilePath ->
@@ -315,6 +412,54 @@
}
}
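+ // Workers take zips from a shared queue, so at most `parallelism` zips
+ // are open at once; queue order follows the zip order computed above.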
+ private fun processZipsWithQueue(zips: Array<String>, foundry: String, parallelism: Int) {
+ val queue: java.util.concurrent.BlockingQueue<String> = java.util.concurrent.LinkedBlockingQueue()
+ zips.forEach { queue.put(it) }
+ val executor = Executors.newFixedThreadPool(parallelism)
+ val active = java.util.concurrent.atomic.AtomicInteger(0)
+ repeat(parallelism) {
+ executor.submit {
+ active.incrementAndGet()
+ try {
+ while (true) {
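+ // All zips are enqueued before the workers start, so an empty queue
+ // means no work remains; the timed poll avoids blocking forever.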
+ val zipPath = queue.poll(100, java.util.concurrent.TimeUnit.MILLISECONDS)
+ if (zipPath == null) {
+ if (queue.isEmpty()) break else continue
+ }
+ if (sequentialInZip) {
+ processZipFileSequentially(zipPath, foundry)
+ } else {
+ processZipFile(zipPath, foundry)
+ }
+ }
+ } finally {
+ active.decrementAndGet()
+ }
+ }
+ }
+ executor.shutdown()
+ try {
+ executor.awaitTermination(7, java.util.concurrent.TimeUnit.DAYS)
+ } catch (ie: InterruptedException) {
+ Thread.currentThread().interrupt()
+ }
+ }
+
+ // Convert a shell-like glob to a Regex: '*' -> ".*", '?' -> '.', anchored full match
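+ // e.g. globToRegex("w?d24.tree_tagger.zip") matches "wpd24.tree_tagger.zip" but not "wpd24.zip"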
+ private fun globToRegex(glob: String): Regex {
+ val sb = StringBuilder("^")
+ glob.forEach { ch ->
+ when (ch) {
+ '*' -> sb.append(".*")
+ '?' -> sb.append('.')
+ '.', '(', ')', '+', '|', '^', '$', '@', '%', '{', '}', '[', ']', '\\' -> sb.append('\\').append(ch)
+ else -> sb.append(ch)
+ }
+ }
+ sb.append('$')
+ return Regex(sb.toString())
+ }
+
private fun getTokenSpansFromMorho(morpho: MutableMap<String, MorphoSpan>): Array<Span> {
return morpho.keys.map { key ->
@@ -343,7 +488,9 @@
}
private fun processZipFile(zipFilePath: String, foundry: String = "base") {
- LOGGER.info("Processing ${zipFilePath} in thread ${Thread.currentThread().id}")
+ val ord = zipOrdinals[zipFilePath] ?: 0
+ val size = zipSizes[zipFilePath] ?: 0L
+ LOGGER.info("Processing zip ${if (ord>0) ord else "?"}/$totalZips: ${zipFilePath} (${humanBytes(size)}) in thread ${Thread.currentThread().id}")
LOGGER.info("Foundry: $foundry $dbFactory")
if (outputFormat == OutputFormat.KORAPXML && dbFactory == null) {
var targetFoundry = "base"
@@ -392,29 +539,78 @@
if (outputFormat == OutputFormat.KORAPXML) {
morphoZipOutputStream!!.close()
}
+ logZipProgress(zipFilePath)
}
private fun processZipFileSequentially(zipFilePath: String, foundry: String = "base") {
- LOGGER.info("Processing ${zipFilePath} in thread ${Thread.currentThread().id}")
+ val ord = zipOrdinals[zipFilePath] ?: 0
+ val size = zipSizes[zipFilePath] ?: 0L
+ LOGGER.info("Processing zip ${if (ord>0) ord else "?"}/$totalZips: ${zipFilePath} (${humanBytes(size)}) in thread ${Thread.currentThread().id}")
if (zipFilePath.hasCorrespondingBaseZip()) {
+ // Process the two related zips strictly sequentially to limit memory growth
val zips = arrayOf(zipFilePath, zipFilePath.correspondingBaseZip()!!)
- Arrays.stream(zips).parallel().forEach { zip ->
+ zips.forEach { zip ->
ZipFile(zip).use { zipFile ->
- zipFile.stream().filter({ extractMetadataRegex.isNotEmpty() || !it.name.contains("header.xml") })
- .parallel().forEach { zipEntry ->
+ // Iterate entries in a deterministic order to keep related files close together
+ zipFile.stream()
+ .filter { extractMetadataRegex.isNotEmpty() || !it.name.contains("header.xml") }
+ .sorted(Comparator.comparing(ZipEntry::getName))
+ .forEachOrdered { zipEntry ->
processZipEntry(zipFile, foundry, zipEntry, true)
}
}
}
} else {
ZipFile(zipFilePath).use { zipFile ->
- zipFile.stream().filter({ extractMetadataRegex.isNotEmpty() || !it.name.contains("header.xml") })
- //.sorted({ o1, o2 -> o1.name.compareTo(o2.name) })
- .forEachOrdered() { zipEntry ->
+ zipFile.stream()
+ .filter { extractMetadataRegex.isNotEmpty() || !it.name.contains("header.xml") }
+ .sorted(Comparator.comparing(ZipEntry::getName))
+ .forEachOrdered { zipEntry ->
processZipEntry(zipFile, foundry, zipEntry, false)
}
}
}
+ logZipProgress(zipFilePath)
+ }
+
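+ // Logs, after each finished zip, the cumulative share of bytes done,
+ // the average throughput since start, and a simple linear ETA.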
+ private fun logZipProgress(zipFilePath: String) {
+ try {
+ val size = zipSizes[zipFilePath] ?: 0L
+ val done = processedZipBytes.addAndGet(size)
+ val total = if (totalZipBytes > 0) totalZipBytes else 1L
+ val elapsedMs = (System.currentTimeMillis() - startTimeMillis).coerceAtLeast(1)
+ val speedBytesPerSec = (done * 1000.0) / elapsedMs
+ val remaining = (total - done).coerceAtLeast(0)
+ val etaSeconds = if (speedBytesPerSec > 0.0) (remaining / speedBytesPerSec).toLong() else -1L
+ val ord = zipOrdinals[zipFilePath] ?: 0
+ val pct = (done * 100.0 / total).coerceIn(0.0, 100.0)
+ val humanSpeed = String.format(Locale.ROOT, "%.2f MB/s", speedBytesPerSec / (1024.0 * 1024.0))
+ val etaStr = if (etaSeconds >= 0) formatDuration(etaSeconds) else "unknown"
+ LOGGER.info(
+ "Finished zip ${if (ord>0) ord else "?"}/$totalZips: ${zipFilePath} " +
+ "(${humanBytes(size)}). Progress: ${String.format(Locale.ROOT, "%.1f", pct)}%%, " +
+ "ETA ${etaStr} at ${humanSpeed}"
+ )
+ } catch (e: Exception) {
+ LOGGER.fine("Failed to log zip progress for $zipFilePath: ${e.message}")
+ }
+ }
+
+ private fun humanBytes(bytes: Long): String {
+ if (bytes < 1024) return "$bytes B"
+ val kb = bytes / 1024.0
+ if (kb < 1024) return String.format(Locale.ROOT, "%.1f KB", kb)
+ val mb = kb / 1024.0
+ if (mb < 1024) return String.format(Locale.ROOT, "%.1f MB", mb)
+ val gb = mb / 1024.0
+ return String.format(Locale.ROOT, "%.1f GB", gb)
+ }
+
+ private fun formatDuration(seconds: Long): String {
+ var s = seconds
+ val h = s / 3600; s %= 3600
+ val m = s / 60; val sec = s % 60
+ return String.format(Locale.ROOT, "%02d:%02d:%02d", h, m, sec)
}
fun processZipEntry(zipFile: ZipFile, _foundry: String, zipEntry: ZipEntry, passedWaitForMorpho: Boolean) {
@@ -440,12 +636,19 @@
try {
if (zipEntry.name.matches(Regex(".*(data|tokens|structure|morpho)\\.xml$"))) {
- val inputStream: InputStream = zipFile.getInputStream(zipEntry)
+ // Ensure the entry stream and reader are closed to avoid native memory buildup
val dbFactory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance()
val dBuilder: DocumentBuilder = dbFactory.newDocumentBuilder()
-
+ // In lemma-only mode, skip parsing data.xml entirely to reduce memory pressure
+ if (lemmaOnly && zipEntry.name.endsWith("data.xml")) {
+ return
+ }
val doc: Document = try {
- dBuilder.parse(InputSource(XMLCommentFilterReader(inputStream, "UTF-8")))
+ zipFile.getInputStream(zipEntry).use { inputStream ->
+ XMLCommentFilterReader(inputStream, "UTF-8").use { reader ->
+ dBuilder.parse(InputSource(reader))
+ }
+ }
} catch (e: SAXParseException) {
LOGGER.warning("Error parsing file: " + zipEntry.name + " " + e.message)
return
@@ -460,9 +663,11 @@
val fileName = zipEntry.name.replace(Regex(".*?/([^/]+\\.xml)$"), "$1")
when (fileName) {
"data.xml" -> {
- val textsList: NodeList = doc.getElementsByTagName("text")
- if (textsList.length > 0) {
- texts[docId] = NonBmpString(textsList.item(0).textContent)
+ if (!lemmaOnly) {
+ val textsList: NodeList = doc.getElementsByTagName("text")
+ if (textsList.length > 0) {
+ texts[docId] = NonBmpString(textsList.item(0).textContent)
+ }
}
}
@@ -491,11 +696,18 @@
}
}
- if (texts[docId] != null && sentences[docId] != null && tokens[docId] != null
- && (!waitForMorpho || morpho[docId] != null)
+ val morphoRequired = waitForMorpho || useLemma || taggerName != null || parserName != null || outputFormat == OutputFormat.KORAPXML
+ // For lemma-only/lemma-based word2vec/now, we can proceed without full text
+ val textRequired = when (outputFormat) {
+ OutputFormat.WORD2VEC, OutputFormat.NOW -> !(useLemma || lemmaOnly)
+ else -> true
+ }
+ if ((texts[docId] != null || !textRequired) && sentences[docId] != null && tokens[docId] != null
+ && (!morphoRequired || morpho[docId] != null)
&& (extractMetadataRegex.isEmpty() || metadata[docId] != null)
) {
- LOGGER.info("Processing text: $docId in thread ${Thread.currentThread().id}")
+ // Be quiet on INFO; per-text logs only on FINE and below
+ LOGGER.fine("Processing text: $docId in thread ${Thread.currentThread().id}")
processText(docId, foundry)
}
} else if (extractMetadataRegex.isNotEmpty() && zipEntry.name.matches(Regex(".*/header\\.xml$"))) {
@@ -504,7 +716,7 @@
val docId =
Regex("<textSigle>([^<]+)</textSigle>").find(text)?.destructured?.component1()
?.replace(Regex("/"), "_")
- LOGGER.info("Processing header file: " + zipEntry.name + " docId: " + docId)
+ LOGGER.fine("Processing header file: " + zipEntry.name + " docId: " + docId)
val meta = ArrayList<String>()
extractMetadataRegex.forEach { regex ->
val match = Regex(regex).find(text)
@@ -514,9 +726,16 @@
}
if (meta.isNotEmpty() && docId != null) {
metadata[docId] = meta.toTypedArray()
- if (texts[docId] != null && sentences[docId] != null && tokens[docId] != null
- && (!waitForMorpho || morpho[docId] != null)
- ) {
+ val morphoRequired = waitForMorpho || useLemma || taggerName != null || parserName != null || outputFormat == OutputFormat.KORAPXML
+ val textRequired = when (outputFormat) {
+ OutputFormat.WORD2VEC, OutputFormat.NOW -> !(useLemma || lemmaOnly)
+ else -> true
+ }
+ if ((texts[docId] != null || !textRequired) && sentences[docId] != null && tokens[docId] != null
+ && (!morphoRequired || morpho[docId] != null)
+ ) {
+ // Be quiet on INFO; per-text logs only on FINE and below
+ LOGGER.fine("Processing text (meta-ready): $docId in thread ${Thread.currentThread().id}")
processText(docId, foundry)
}
}
@@ -569,19 +788,40 @@
if (annotationWorkerPool != null) {
annotationWorkerPool?.pushToQueue(output.append("\n# eot\n").toString())
+ // Clear the buffer once its contents have been handed off
+ output.setLength(0)
} else if (outputFormat != OutputFormat.KORAPXML) {
synchronized(System.out) {
println(output.toString())
}
+ // Clear the buffer once its contents have been handed off
+ output.setLength(0)
} else {
korapXmlOutput(foundry, docId)
}
arrayOf(tokens, texts, sentences, morpho, fnames, metadata, extraFeatures).forEach { map ->
+ if (map === morpho) {
+ // Clear inner map to release references early
+ morpho[docId]?.clear()
+ }
map.remove(docId)
}
+ // Periodic GC hint after processing many docs (lightweight safeguard)
+ if ((processedDocs.incrementAndGet() % 2000) == 0) {
+ LOGGER.fine("Processed ${processedDocs.get()} docs – requesting GC hint")
+ System.gc()
+ }
+ // Memory / cache statistics logging
+ if (memStatsInterval > 0) {
+ val count = processedDocs.get()
+ if (count % memStatsInterval == 0) {
+ logMemoryStats(count)
+ }
+ }
+
if (outputFormat == OutputFormat.KORAPXML) {
val entryPath = if (parserName != null) docId.replace(Regex("[_.]"), "/").plus("/$parserName/").plus("dependency.xml")
else
@@ -599,6 +839,21 @@
private fun getMorphoFoundry() = taggerToolBridges[Thread.currentThread().id]?.foundry ?: "base"
+ private fun logMemoryStats(count: Int) {
+ try {
+ val rt = Runtime.getRuntime()
+ val used = (rt.totalMemory() - rt.freeMemory()) / (1024 * 1024)
+ val total = rt.totalMemory() / (1024 * 1024)
+ val max = rt.maxMemory() / (1024 * 1024)
+ LOGGER.info(
+ "MEM-STATS docs=${count} usedMB=${used} totalMB=${total} maxMB=${max} " +
+ "maps{texts=${texts.size},tokens=${tokens.size},sentences=${sentences.size},morpho=${morpho.size}}"
+ )
+ } catch (e: Exception) {
+ LOGGER.warning("Failed to log memory stats: ${e.message}")
+ }
+ }
+
private fun korapXmlDependencyOutput(foundry: String, docId: String): StringBuilder {
val doc: Document = dBuilder!!.newDocument()
@@ -849,7 +1104,15 @@
if (extractMetadataRegex.isNotEmpty()) {
output.append(metadata[docId]?.joinToString("\t", postfix = "\t") ?: "")
}
+ // If no text is available (e.g., lemma-only mode), emit lemmas
if (texts[docId] == null) {
+ tokens[docId]?.forEach { span ->
+ if (span == null) return@forEach
+ val key = "${span.from}-${span.to}"
+ val lemmaVal = morpho[docId]?.get(key)?.lemma
+ output.append(lemmaVal ?: "_", " ")
+ }
+ if (output.isNotEmpty()) output.deleteCharAt(output.length - 1)
return output
}
tokens[docId]?.forEach { span ->
@@ -873,12 +1136,15 @@
val key = "${span.from}-${span.to}"
val lemmaVal = morpho[docId]!![key]?.lemma
if (lemmaVal != null && lemmaVal != "_") {
- output.append(lemmaVal, " ")
+ output.append(lemmaVal)
+ output.append(' ')
} else {
- output.append(texts[docId]!!.substring(safeFrom, safeTo), " ")
+ texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)
+ output.append(' ')
}
} else {
- output.append(texts[docId]!!.substring(safeFrom, safeTo), " ")
+ texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)
+ output.append(' ')
}
real_token_index++
}
@@ -898,6 +1164,22 @@
output.append("@@$docId ")
if (texts[docId] == null) {
+ // Lemma-only fallback when original text is not loaded
+ tokens[docId]?.forEach { span ->
+ if (span == null) return@forEach
+ if (sentences[docId] != null && (sentence_index >= sentences[docId]!!.size || span.from >= sentences[docId]!![sentence_index].to)) {
+ if (output.isNotEmpty() && !output.endsWith("@@$docId ")) {
+ output.append(" <p> ")
+ }
+ sentence_index++
+ }
+ val key = "${span.from}-${span.to}"
+ val lemmaVal = morpho[docId]?.get(key)?.lemma
+ output.append(lemmaVal ?: "_", " ")
+ }
+ if (output.isNotEmpty() && output.endsWith(" ")) {
+ output.deleteCharAt(output.length - 1)
+ }
return output
}
@@ -918,12 +1200,15 @@
val key = "${span.from}-${span.to}"
val lemmaVal = morpho[docId]!![key]?.lemma
if (lemmaVal != null && lemmaVal != "_") {
- output.append(lemmaVal, " ")
+ output.append(lemmaVal)
+ output.append(' ')
} else {
- output.append(texts[docId]!!.substring(safeFrom, safeTo), " ")
+ texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)
+ output.append(' ')
}
} else {
- output.append(texts[docId]!!.substring(safeFrom, safeTo), " ")
+ texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)
+ output.append(' ')
}
real_token_index++
}
@@ -1138,5 +1423,3 @@
object NowOutputFormat {
const val NAME = "now"
}
-
-
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/NonBmpString.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/NonBmpString.kt
index 44c7282..6dd7924 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/NonBmpString.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/NonBmpString.kt
@@ -48,6 +48,23 @@
return stringBuilder.toString()
}
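+ // Appends the code points in [startIndex, endIndex) directly to sb,
+ // avoiding the intermediate String that substring() would allocate.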
+ fun appendRangeTo(sb: StringBuilder, startIndex: Int, endIndex: Int) {
+ if (startIndex < 0 || endIndex > length || startIndex > endIndex) {
+ throw IndexOutOfBoundsException("Invalid range $startIndex..$endIndex for NonBmpString length $length")
+ }
+ var i = startIndex
+ while (i < endIndex) {
+ val cp = utf32Chars[i]
+ if (Character.isBmpCodePoint(cp)) {
+ sb.append(cp.toChar())
+ } else {
+ sb.append(Character.highSurrogate(cp))
+ sb.append(Character.lowSurrogate(cp))
+ }
+ i++
+ }
+ }
+
private fun String.toUtf32Array(): IntArray {
val codePoints = IntArray(Character.codePointCount(this, 0, length))
var index = 0
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
index 55eaf25..4aa98ac 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
@@ -314,4 +314,42 @@
assertContains(out, " <p> ")
assertContains(out, " mein Ankunft ")
}
+
+ @Test
+ fun lemmaOnlyWord2VecWorks() {
+ val args = arrayOf("--lemma-only", "-f", "w2v", loadResource("goe.tree_tagger.zip").path)
+ debug(args)
+ val out = outContent.toString()
+ // Should produce some lemma tokens without requiring data.xml
+ assertTrue(out.contains(" mein ") || out.contains(" Ankunft "))
+ }
+
+ @Test
+ fun lemmaOnlyNowWorks() {
+ val args = arrayOf("--lemma-only", "-f", "now", loadResource("goe.tree_tagger.zip").path)
+ debug(args)
+ val out = outContent.toString()
+ assertContains(out, "@@")
+ assertContains(out, " <p> ")
+ }
+
+ @Test
+ fun excludeZipGlobSkipsFiles() {
+ val args = arrayOf("--exclude-zip-glob", "goe.zip", loadResource("wdf19.zip").path, loadResource("goe.zip").path)
+ debug(args)
+ val out = outContent.toString()
+ // Expect French content, but not the German token from GOE
+ assertContains(out, "automatique")
+ assertFalse(out.contains("Gedanken"))
+ }
+
+ @Test
+ fun sequentialOnlyForNowAndW2V() {
+ val args = arrayOf("--sequential", loadResource("wdf19.zip").path)
+ // Default format is conllu; this should error
+ val rc = debug(args)
+ // Non-zero is expected; and error message should be present
+ assertTrue(rc != 0)
+ assertContains(errContent.toString(), "--sequential is supported only with -f word2vec or -f now")
+ }
}