.idea/copilotDiffState.xml - KorAP/korapxml2conllu - Gitiles

 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
   <component name="CopilotDiffPersistence">
     <option name="pendingDiffs">
       <map>
         <entry key="$PROJECT_DIR$/app/build.gradle">
           <value>
             <PendingDiffInfo>
               <option name="filePath" value="$PROJECT_DIR$/app/build.gradle" />
               <option name="originalContent" value="plugins {&#10;    // Apply the org.jetbrains.kotlin.jvm Plugin to add support for Kotlin.&#10;    id 'org.jetbrains.kotlin.jvm' version '2.2.21'&#10;&#10;    // Apply the application plugin to add support for building a CLI application in Java.&#10;    id 'application'&#10;    id 'com.github.johnrengelman.shadow' version '8.1.1'&#10;}&#10;&#10;repositories {&#10;    mavenCentral()&#10;    maven { url 'https://jitpack.io' }&#10;}&#10;&#10;test {&#10;    minHeapSize = &quot;512m&quot;&#10;    maxHeapSize = &quot;4096m&quot;&#10;    jvmArgs '-XX:MaxMetaspaceSize=1024m'&#10;}&#10;&#10;dependencies {&#10;    // Align versions of all Kotlin components&#10;    implementation platform('org.jetbrains.kotlin:kotlin-bom')&#10;&#10;    // Use the Kotlin JDK 8 standard library.&#10;    implementation 'org.jetbrains.kotlin:kotlin-stdlib'&#10;    implementation 'org.jetbrains.kotlinx:kotlinx-coroutines-core:1.10.2'&#10;&#10;    // This dependency is used by the application.&#10;    implementation 'com.google.guava:guava:33.5.0-jre'&#10;&#10;&#10;    implementation (&quot;info.picocli:picocli:4.7.7&quot;)&#10;&#10;    // Use the Kotlin test library.&#10;    testImplementation 'org.jetbrains.kotlin:kotlin-test'&#10;&#10;    // Use the Kotlin JUnit integration.&#10;    testImplementation 'org.jetbrains.kotlin:kotlin-test-junit'&#10;    testImplementation &quot;org.jetbrains.kotlin:kotlin-test:2.2.21&quot;&#10;&#10;    implementation 'com.github.kupietz:cistern:v1.0.4'&#10;    implementation 'org.maltparser:maltparser:1.9.2'&#10;    implementation 'org.apache.opennlp:opennlp-tools:2.5.6'&#10;    implementation 'org.slf4j:slf4j-simple:2.0.17'&#10;    implementation 'org.apache.ant:ant:1.10.15'&#10;    implementation 'org.apache.commons:commons-compress:1.28.0'&#10;&#10;}&#10;&#10;// Erzwinge JDK 21 Toolchain und Bytecode-Level 21&#10;java {&#10;    toolchain {&#10;        languageVersion = JavaLanguageVersion.of(21)&#10;    }&#10;}&#10;&#10;kotlin {&#10;    jvmToolchain(21)&#10;}&#10;&#10;// Für evtl. vorhandenen Java-Quellcode&#10;tasks.withType(JavaCompile).configureEach {&#10;    options.release = 21&#10;}&#10;&#10;tasks.withType(org.jetbrains.kotlin.gradle.tasks.KotlinCompile).configureEach {&#10;    kotlinOptions {&#10;        jvmTarget = &quot;21&quot;&#10;        // Falls verfügbar, sorgt dies für konsistente API-Targets ähnlich zu Java --release&#10;        // freeCompilerArgs += [&quot;-Xjdk-release=21&quot;]&#10;    }&#10;}&#10;&#10;application {&#10;    // Define the main class for the application.&#10;    mainClass = 'de.ids_mannheim.korapxmltools.KorapXmlToolKt'&#10;}&#10;&#10;jar {&#10;    // Will include every single one of your dependencies, project or not&#10;    // def lowerCasedName = baseName.toLowerCase()&#10;    // def normalizedName = lowerCasedName.substring(0,1).toUpperCase() + lowerCasedName.substring(1)&#10;&#10;    manifest.attributes(&#10;            'Class-Path': configurations.compileClasspath.collect { it.getName() }.join(' '),&#10;            'Main-Class': &quot;de.ids_mannheim.korapxmltools.KorapXmlToolKt&quot;&#10;    )&#10;    shadowJar {&#10;        archiveBaseName.set('korapxmltool')&#10;        archiveClassifier.set('')&#10;        archiveVersion.set('')&#10;    }&#10;}&#10;&#10;&#10;configurations {&#10;    runtimeLib.extendsFrom implementation&#10;}&#10;" />
               <option name="updatedContent" value="plugins {&#10;    // Apply the org.jetbrains.kotlin.jvm Plugin to add support for Kotlin.&#10;    id 'org.jetbrains.kotlin.jvm' version '2.2.21'&#10;&#10;    // Apply the application plugin to add support for building a CLI application in Java.&#10;    id 'application'&#10;    id 'com.github.johnrengelman.shadow' version '8.1.1'&#10;}&#10;&#10;repositories {&#10;    mavenCentral()&#10;    maven { url 'https://jitpack.io' }&#10;}&#10;&#10;test {&#10;    minHeapSize = &quot;512m&quot;&#10;    maxHeapSize = &quot;4096m&quot;&#10;    jvmArgs '-XX:MaxMetaspaceSize=1024m'&#10;}&#10;&#10;dependencies {&#10;    // Align versions of all Kotlin components&#10;    implementation platform('org.jetbrains.kotlin:kotlin-bom')&#10;&#10;    // Use the Kotlin JDK 8 standard library.&#10;    implementation 'org.jetbrains.kotlin:kotlin-stdlib'&#10;    implementation 'org.jetbrains.kotlinx:kotlinx-coroutines-core:1.10.2'&#10;&#10;    // This dependency is used by the application.&#10;    implementation 'com.google.guava:guava:33.5.0-jre'&#10;&#10;&#10;    implementation (&quot;info.picocli:picocli:4.7.7&quot;)&#10;&#10;    // Use the Kotlin test library.&#10;    testImplementation 'org.jetbrains.kotlin:kotlin-test'&#10;&#10;    // Use the Kotlin JUnit integration.&#10;    testImplementation 'org.jetbrains.kotlin:kotlin-test-junit'&#10;    testImplementation &quot;org.jetbrains.kotlin:kotlin-test:2.2.21&quot;&#10;&#10;    implementation 'com.github.kupietz:cistern:v1.0.4'&#10;    implementation 'org.maltparser:maltparser:1.9.2'&#10;    implementation 'org.apache.opennlp:opennlp-tools:2.5.6'&#10;    implementation 'org.slf4j:slf4j-simple:2.0.17'&#10;    implementation 'org.apache.ant:ant:1.10.15'&#10;    implementation 'org.apache.commons:commons-compress:1.28.0'&#10;&#10;}&#10;&#10;// Erzwinge JDK 21 Toolchain und Bytecode-Level 21&#10;java {&#10;    toolchain {&#10;        languageVersion = JavaLanguageVersion.of(21)&#10;    }&#10;}&#10;&#10;kotlin {&#10;    jvmToolchain(21)&#10;}&#10;&#10;// Für evtl. vorhandenen Java-Quellcode&#10;tasks.withType(JavaCompile).configureEach {&#10;    options.release = 21&#10;}&#10;&#10;tasks.withType(org.jetbrains.kotlin.gradle.tasks.KotlinCompile).configureEach {&#10;    kotlinOptions {&#10;        jvmTarget = &quot;21&quot;&#10;        // Falls verfügbar, sorgt dies für konsistente API-Targets ähnlich zu Java --release&#10;        // freeCompilerArgs += [&quot;-Xjdk-release=21&quot;]&#10;    }&#10;}&#10;&#10;application {&#10;    // Define the main class for the application.&#10;    mainClass = 'de.ids_mannheim.korapxmltools.KorapXmlToolKt'&#10;}&#10;&#10;jar {&#10;    // Will include every single one of your dependencies, project or not&#10;    // def lowerCasedName = baseName.toLowerCase()&#10;    // def normalizedName = lowerCasedName.substring(0,1).toUpperCase() + lowerCasedName.substring(1)&#10;&#10;    manifest.attributes(&#10;            'Class-Path': configurations.compileClasspath.collect { it.getName() }.join(' '),&#10;            'Main-Class': &quot;de.ids_mannheim.korapxmltools.KorapXmlToolKt&quot;,&#10;            'Implementation-Title': rootProject.name,&#10;            'Implementation-Version': project.version&#10;    )&#10;    shadowJar {&#10;        archiveBaseName.set('korapxmltool')&#10;        archiveClassifier.set('')&#10;        // Version ins Dateinamen aufnehmen&#10;        archiveVersion.set(project.version.toString())&#10;        manifest.attributes(&#10;            'Main-Class': &quot;de.ids_mannheim.korapxmltools.KorapXmlToolKt&quot;,&#10;            'Implementation-Title': rootProject.name,&#10;            'Implementation-Version': project.version&#10;        )&#10;    }&#10;}&#10;&#10;&#10;configurations {&#10;    runtimeLib.extendsFrom implementation&#10;}" />
             </PendingDiffInfo>
           </value>
         </entry>
         <entry key="$PROJECT_DIR$/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt">
           <value>
             <PendingDiffInfo>
               <option name="filePath" value="$PROJECT_DIR$/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt" />
               <option name="originalContent" value="package de.ids_mannheim.korapxmltools&#10;&#10;import de.ids_mannheim.korapxmltools.AnnotationToolBridgeFactory.Companion.parserFoundries&#10;import de.ids_mannheim.korapxmltools.AnnotationToolBridgeFactory.Companion.taggerFoundries&#10;import org.apache.commons.compress.archivers.zip.Zip64Mode&#10;import org.apache.commons.compress.archivers.zip.ZipArchiveEntry&#10;import org.w3c.dom.Document&#10;import org.w3c.dom.Element&#10;import org.w3c.dom.NodeList&#10;import org.xml.sax.InputSource&#10;import org.xml.sax.SAXParseException&#10;import picocli.CommandLine&#10;import picocli.CommandLine.*&#10;import java.io.File&#10;import java.io.FileOutputStream&#10;import java.io.InputStream&#10;import java.io.StringWriter&#10;import java.lang.Integer.parseInt&#10;import java.util.*&#10;import java.util.concurrent.Callable&#10;import java.util.concurrent.ConcurrentHashMap&#10;import java.util.concurrent.Executors&#10;import java.util.concurrent.atomic.AtomicLong&#10;import java.util.logging.ConsoleHandler&#10;import java.util.logging.Level&#10;import java.util.logging.LogManager&#10;import java.util.logging.Logger&#10;import java.util.regex.Matcher&#10;import java.util.regex.Pattern&#10;import java.util.stream.IntStream&#10;import java.util.zip.ZipEntry&#10;&#10;import java.util.zip.ZipFile&#10;import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream&#10;import javax.xml.parsers.DocumentBuilder&#10;import javax.xml.parsers.DocumentBuilderFactory&#10;import javax.xml.transform.OutputKeys&#10;import javax.xml.transform.TransformerFactory&#10;import javax.xml.transform.dom.DOMSource&#10;import javax.xml.transform.stream.StreamResult&#10;import kotlin.math.min&#10;import kotlin.system.exitProcess&#10;&#10;val ZIP_ENTRY_UNIX_MODE = parseInt(&quot;644&quot;, 8)&#10;&#10;@Command(&#10;    name = &quot;KorapXmlTool&quot;,&#10;    mixinStandardHelpOptions = true,&#10;    version = [&quot;KorapXmlTool 2.0-beta-01&quot;],&#10;    description = [&quot;Converts KorAP-XML &lt;https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml&gt; base or &quot; +&#10;            &quot;morpho zips to (annotated) CoNLL(-U) format with all information necessary for &quot; +&#10;            &quot;reconstruction in comment lines.&quot;]&#10;)&#10;&#10;class KorapXmlTool : Callable&lt;Int&gt; {&#10;    val COMPATIBILITY_MODE = System.getenv(&quot;COMPATIBILITY_MODE&quot;) != null&#10;&#10;    @Spec lateinit var spec : Model.CommandSpec&#10;&#10;    @Parameters(arity = &quot;1..*&quot;, description = [&quot;At least one zip file name&quot;])&#10;    var zipFileNames: Array&lt;String&gt;? = null&#10;&#10;    @Option(&#10;        names = [&quot;-f&quot;, &quot;--output-format&quot;],&#10;        description = [&quot;Output format: ${ConlluOutputFormat.NAME}, ${Word2VecOutputFormat.NAME}, ${KorapXmlOutputFormat.NAME}, ${NowOutputFormat.NAME}&quot;,&#10;            &quot;conllu: CoNLL-U format&quot;,&#10;            &quot;korapxml, xml, zip: KorAP-XML format zip&quot;,&#10;            &quot;word2vec, w2v: Print text in LM training format: tokens separated by space, sentences separated by newlines&quot;,&#10;            &quot;now, NOW: NOW corpus export format: w2v-like format with &lt;p&gt; tags for sentence ends and @@&lt;text-sigle&gt; prefix&quot;,&#10;        ],&#10;        converter = [OutputFormatConverter::class]&#10;    )&#10;    var outputFormat: OutputFormat = OutputFormat.CONLLU&#10;    class OutputFormatConverter : ITypeConverter&lt;OutputFormat&gt; {&#10;        override fun convert(value: String?): OutputFormat {&#10;            return when (value?.lowercase(Locale.getDefault())) {&#10;                &quot;conllu&quot;, &quot;conll&quot; -&gt; OutputFormat.CONLLU&#10;                &quot;word2vec&quot;, &quot;w2v&quot; -&gt; OutputFormat.WORD2VEC&#10;                &quot;korapxml&quot;, &quot;korap&quot;, &quot;xml&quot;, &quot;zip&quot; -&gt; OutputFormat.KORAPXML&#10;                &quot;now&quot;, &quot;NOW&quot; -&gt; OutputFormat.NOW&#10;                else -&gt; throw IllegalArgumentException(&quot;Unknown output format: `$value'. Use one of: ${OutputFormat.entries.joinToString(&quot;, &quot;) { it.name }}&quot;)&#10;            }&#10;        }&#10;    }&#10;&#10;    @Option(&#10;        names = [&quot;--sigle-pattern&quot;, &quot;-p&quot;],&#10;        paramLabel = &quot;PATTERN&quot;,&#10;        description = [&quot;Extract only documents with sigle matching the pattern (regex)&quot;]&#10;    )&#10;    var siglePattern: String? = null&#10;&#10;    @Option(&#10;        names = [&quot;--extract-attributes-regex&quot;, &quot;-e&quot;],&#10;        paramLabel = &quot;REGEX&quot;,&#10;        description = [&quot;Extract additional attribute values from structure.xml and writes them as comment line in front of the first covered token.&quot;,&#10;            &quot;Example: -e '(posting/id|div/id)'&quot;]&#10;    )&#10;    var extractAttributesRegex: String = &quot;&quot;&#10;&#10;    @Option(&#10;        names = [&quot;--s-bounds-from-morpho&quot;], description = [&quot;Not yet implemented: s bounds from morpho&quot;]&#10;    )&#10;    var sBoundsFromMorpho: Boolean = false&#10;&#10;    @Option(&#10;        names = [&quot;--log&quot;, &quot;-l&quot;],&#10;        paramLabel = &quot;LEVEL&quot;,&#10;        description = [&quot;Log level: one of SEVERE, WARNING, INFO, FINE, FINER, FINEST. Default: ${&quot;$&quot;}{DEFAULT-VALUE}])&quot;]&#10;    )&#10;    var logLevel: String = &quot;WARNING&quot;&#10;&#10;    @Option(&#10;        names = [&quot;--columns&quot;, &quot;-c&quot;],&#10;        paramLabel = &quot;NUMBER&quot;,&#10;        description = [&quot;Number of columns. 1 means just the token. Default: ${&quot;$&quot;}{DEFAULT-VALUE}&quot;, &quot;Possible values: 1-10&quot;]&#10;    )&#10;    var columns: Int = 10&#10;&#10;    @Option(&#10;        names = [&quot;--word2vec&quot;, &quot;-w&quot;],&#10;        description = [&quot;Print text in LM training format: tokens separated by space, sentences separated by newline&quot;,&#10;            &quot;Deprecated: use -f word2vec&quot;]&#10;    )&#10;    fun setWord2Vec(word2vec: Boolean) {&#10;        if (word2vec) {&#10;            outputFormat = OutputFormat.WORD2VEC&#10;        }&#10;    }&#10;&#10;    @Option(&#10;        names = [&quot;--exclude-zip-glob&quot;],&#10;        paramLabel = &quot;GLOB&quot;,&#10;        description = [&#10;            &quot;Exclude zip files whose basename matches the glob (e.g., 'w?d24.tree_tagger.zip').&quot;,&#10;            &quot;May be repeated. Applied to basenames, not full paths.&quot;&#10;        ]&#10;    )&#10;    var excludeZipGlobs: MutableList&lt;String&gt; = mutableListOf()&#10;&#10;    @Option(&#10;        names = [&quot;--token-separator&quot;, &quot;-s&quot;],&#10;        paramLabel = &quot;STRING&quot;,&#10;        defaultValue = &quot;\n&quot;,&#10;        description = [&quot;Token separator. Default: new-line for CoNLL-U, space for word2vec format.&quot;]&#10;    )&#10;    var tokenSeparator: String = if (outputFormat == OutputFormat.WORD2VEC || outputFormat == OutputFormat.NOW) &quot; &quot; else &quot;\n&quot;&#10;&#10;    @Option(names = [&quot;--offsets&quot;], description = [&quot;Not yet implemented: offsets&quot;])&#10;    var offsets: Boolean = false&#10;&#10;    @Option(names = [&quot;--comments&quot;, &quot;-C&quot;], description = [&quot;Not yet implemented: comments&quot;])&#10;    var comments: Boolean = false&#10;&#10;    @Option(&#10;        names = [&quot;--extract-metadata-regex&quot;, &quot;-m&quot;],&#10;        paramLabel = &quot;REGEX&quot;,&#10;        description = [&quot;Extract metadata regexes.\nExample: -m '&lt;textSigle&gt;([^&lt;]+)' -m '&lt;creatDate&gt;([^&lt;]+)'&quot;]&#10;    )&#10;    var extractMetadataRegex: MutableList&lt;String&gt; = mutableListOf()&#10;&#10;    @Option(&#10;        names = [&quot;--annotate-with&quot;, &quot;-A&quot;],&#10;        paramLabel = &quot;COMMAND&quot;,&#10;        description = [&quot;Pipe output through command&quot;]&#10;    )&#10;    var annotateWith: String = &quot;&quot;&#10;&#10;    @Option(&#10;        names = [&quot;--threads&quot;, &quot;-T&quot;],&#10;        paramLabel = &quot;THREADS&quot;,&#10;        description = [&quot;Maximum number of threads to use. Default: ${&quot;$&quot;}{DEFAULT-VALUE}&quot;]&#10;    )&#10;    var maxThreads: Int = Runtime.getRuntime().availableProcessors() / 2&#10;    fun setThreads(threads: Int) {&#10;        if (threads &lt; 1) {&#10;            throw ParameterException(spec.commandLine(), String.format(&quot;Invalid value `%d' for option '--threads': must be at least 1&quot;, threads))&#10;        }&#10;        this.maxThreads = threads&#10;        System.setProperty(&quot;java.util.concurrent.ForkJoinPool.common.parallelism&quot;, threads.toString())&#10;    }&#10;&#10;    @Option(&#10;        names = [&quot;--zip-parallelism&quot;],&#10;        paramLabel = &quot;N&quot;,&#10;        description = [&quot;Maximum number of zip files to process concurrently. Defaults to --threads.&quot;]&#10;    )&#10;    var zipParallelism: Int? = null&#10;&#10;    @Option(&#10;        names = [&quot;--sequential&quot;],&#10;        description = [&#10;            &quot;Process entries inside each zip sequentially; zips processed in parallel (only for word2vec/now).&quot;&#10;        ]&#10;    )&#10;    var sequentialInZip: Boolean = false&#10;&#10;    @Option(&#10;        names = [&quot;--overwrite&quot;, &quot;-o&quot;],&#10;        description = [&quot;Overwrite existing files&quot;]&#10;    )&#10;    var overwrite: Boolean = false&#10;&#10;    @Option(&#10;        names = [&quot;--mem-stats-interval&quot;],&#10;        paramLabel = &quot;N&quot;,&#10;        description = [&quot;Log memory and cache statistics every N processed documents (0 disables; default: 0)&quot;]&#10;    )&#10;    var memStatsInterval: Int = 0&#10;&#10;    @Option(&#10;        names = [&quot;--lemma&quot;],&#10;        description = [&quot;In word2vec/now output modes, output lemmas instead of surface tokens when lemma annotations are available (requires corresponding morpho annotation XML)&quot;]&#10;    )&#10;    var useLemma: Boolean = false&#10;&#10;    @Option(&#10;        names = [&quot;--lemma-only&quot;],&#10;        description = [&#10;            &quot;Do not load texts from data.xml and output only lemmas (requires morpho.xml).&quot;,&#10;            &quot;Only valid with -f word2vec or -f now; implies --lemma.&quot;&#10;        ]&#10;    )&#10;    var lemmaOnly: Boolean = false&#10;&#10;    private var taggerName: String? = null&#10;    private var taggerModel: String? = null&#10;    @Option(&#10;        names = [&quot;--tag-with&quot;, &quot;-t&quot;],&#10;        paramLabel = &quot;TAGGER:MODEL&quot;,&#10;        description = [&quot;Specify a tagger and a model: ${taggerFoundries}:&lt;path/to/model&gt;.&quot;]&#10;    )&#10;    fun setTagWith(tagWith: String) {&#10;        val pattern: Pattern = Pattern.compile(&quot;(${taggerFoundries}):(.+)&quot;)&#10;        val matcher: Matcher = pattern.matcher(tagWith)&#10;        if (!matcher.matches()) {&#10;            throw ParameterException(spec.commandLine(),&#10;                String.format(&quot;Invalid value `%s' for option '--tag-with': &quot;+&#10;                    &quot;value does not match the expected pattern ${taggerFoundries}:&lt;path/to/model&gt;&quot;, tagWith))&#10;        } else {&#10;            taggerName = matcher.group(1)&#10;            taggerModel = matcher.group(2)&#10;            if (!File(taggerModel).exists()) {&#10;                throw ParameterException(spec.commandLine(),&#10;                    String.format(&quot;Invalid value for option '--tag-with':&quot;+&#10;                        &quot;model file '%s' does not exist&quot;, taggerModel, taggerModel))&#10;            }&#10;        }&#10;    }&#10;&#10;    private var parserName: String? = null&#10;    private var parserModel: String? = null&#10;    @Option(&#10;        names = [&quot;--parse-with&quot;, &quot;-P&quot;],&#10;        paramLabel = &quot;parser:MODEL&quot;,&#10;        description = [&quot;Specify a parser and a model: ${parserFoundries}:&lt;path/to/model&gt;.&quot;]&#10;    )&#10;    fun setParseWith(parseWith: String) {&#10;        val pattern: Pattern = Pattern.compile(&quot;(${parserFoundries}):(.+)&quot;)&#10;        val matcher: Matcher = pattern.matcher(parseWith)&#10;        if (!matcher.matches()) {&#10;            throw ParameterException(spec.commandLine(),&#10;                String.format(&quot;Invalid value `%s' for option '--parse-with': &quot;+&#10;                        &quot;value does not match the expected pattern (${parserFoundries}):&lt;path/to/model&gt;&quot;, parseWith))&#10;        } else {&#10;            parserName = matcher.group(1)&#10;            parserModel = matcher.group(2)&#10;            if (!File(parserModel).exists()) {&#10;                throw ParameterException(spec.commandLine(),&#10;                    String.format(&quot;Invalid value for option '--parse-with':&quot;+&#10;                            &quot;model file '%s' does not exist&quot;, parserModel, parserModel))&#10;            }&#10;        }&#10;    }&#10;&#10;&#10;    override fun call(): Int {&#10;        val handler = ConsoleHandler()&#10;        LogManager.getLogManager().reset()&#10;        handler.formatter = ColoredFormatter()&#10;&#10;        for (handler in LOGGER.handlers) {&#10;            LOGGER.removeHandler(handler)&#10;        }&#10;        LOGGER.addHandler(handler)&#10;        LOGGER.level = try {&#10;            Level.parse(logLevel.uppercase(Locale.getDefault()))&#10;        } catch (e: IllegalArgumentException) {&#10;            LOGGER.warning(&quot;Invalid log level: $logLevel. Defaulting to WARNING.&quot;)&#10;            Level.WARNING&#10;        }&#10;&#10;        if (lemmaOnly) {&#10;            useLemma = true&#10;            if (outputFormat != OutputFormat.WORD2VEC &amp;&amp; outputFormat != OutputFormat.NOW) {&#10;                throw ParameterException(spec.commandLine(), &quot;--lemma-only is supported only with -f word2vec or -f now&quot;)&#10;            }&#10;        }&#10;&#10;        LOGGER.info(&quot;Processing zip files: &quot; + zipFileNames!!.joinToString(&quot;, &quot;))&#10;&#10;        korapxml2conllu(zipFileNames!!)&#10;        return 0&#10;    }&#10;&#10;    private val LOGGER: Logger = Logger.getLogger(KorapXmlTool::class.java.name)&#10;&#10;    private var annotationWorkerPool : AnnotationWorkerPool? = null&#10;    // Shared executor for entry-level parallelism across all zips&#10;    private var entryExecutor: java.util.concurrent.ExecutorService? = null&#10;&#10;    val texts: ConcurrentHashMap&lt;String, NonBmpString&gt; = ConcurrentHashMap()&#10;    val sentences: ConcurrentHashMap&lt;String, Array&lt;Span&gt;&gt; = ConcurrentHashMap()&#10;    val tokens: ConcurrentHashMap&lt;String, Array&lt;Span&gt;&gt; = ConcurrentHashMap()&#10;    val morpho: ConcurrentHashMap&lt;String, MutableMap&lt;String, MorphoSpan&gt;&gt; = ConcurrentHashMap()&#10;    val fnames: ConcurrentHashMap&lt;String, String&gt; = ConcurrentHashMap()&#10;    val metadata: ConcurrentHashMap&lt;String, Array&lt;String&gt;&gt; = ConcurrentHashMap()&#10;    val extraFeatures: ConcurrentHashMap&lt;String, MutableMap&lt;String, String&gt;&gt; = ConcurrentHashMap()&#10;    private val processedDocs = java.util.concurrent.atomic.AtomicInteger(0)&#10;    var taggerToolBridges: ConcurrentHashMap&lt;Long, TaggerToolBridge?&gt; = ConcurrentHashMap()&#10;    var parserToolBridges: ConcurrentHashMap&lt;Long, ParserToolBridge?&gt; = ConcurrentHashMap()&#10;&#10;    // Zip progress tracking for logging (zipNumber/zipTotal)&#10;    private val zipOrdinals: ConcurrentHashMap&lt;String, Int&gt; = ConcurrentHashMap()&#10;    private var totalZips: Int = 0&#10;    private val zipSizes: ConcurrentHashMap&lt;String, Long&gt; = ConcurrentHashMap()&#10;    private val processedZipBytes: AtomicLong = AtomicLong(0)&#10;    private var totalZipBytes: Long = 0&#10;    private var startTimeMillis: Long = 0&#10;&#10;    var dbFactory: DocumentBuilderFactory? = null&#10;    var dBuilder: DocumentBuilder? = null&#10;    var morphoZipOutputStream: ZipArchiveOutputStream? = null&#10;&#10;    fun String.hasCorrespondingBaseZip(): Boolean {&#10;        if (!this.matches(Regex(&quot;.*\\.([^/.]+)\\.zip$&quot;))) return false&#10;        val baseZip = this.replace(Regex(&quot;\\.([^/.]+)\\.zip$&quot;), &quot;.zip&quot;)&#10;        return File(baseZip).exists()&#10;    }&#10;&#10;    fun String.correspondingBaseZip(): String? {&#10;        if (!this.matches(Regex(&quot;.*\\.([^/.]+)\\.zip$&quot;))) return null&#10;        val baseZip = this.replace(Regex(&quot;\\.([^/.]+)\\.zip$&quot;), &quot;.zip&quot;)&#10;        return if (File(baseZip).exists()) baseZip else null&#10;    }&#10;&#10;    fun korapxml2conllu(args: Array&lt;String&gt;) {&#10;        if (outputFormat == OutputFormat.KORAPXML &amp;&amp; annotateWith.isNotEmpty()) {&#10;            LOGGER.severe(&quot;Shell command annotation is not yet supported with output format $outputFormat&quot;)&#10;            exitProcess(1)&#10;        }&#10;        // Initialize shared entry executor (used inside each zip)&#10;        entryExecutor = Executors.newFixedThreadPool(maxThreads)&#10;&#10;        if (annotateWith.isNotEmpty()) {&#10;            annotationWorkerPool = AnnotationWorkerPool(annotateWith, maxThreads, LOGGER)&#10;        }&#10;&#10;        var zips: Array&lt;String&gt; = args&#10;        if (excludeZipGlobs.isNotEmpty()) {&#10;            val before = zips.size&#10;            val patterns = excludeZipGlobs.map { globToRegex(it) }&#10;            zips = zips.filter { zipPath -&gt;&#10;                val base = File(zipPath).name&#10;                patterns.none { rx -&gt; rx.matches(base) }&#10;            }.toTypedArray()&#10;            val excluded = before - zips.size&#10;            if (excluded &gt; 0) {&#10;                LOGGER.info(&quot;Excluded $excluded of $before zip(s) by glob(s): ${excludeZipGlobs.joinToString(&quot;, &quot;)}&quot;)&#10;            }&#10;        }&#10;        // Initialize zip progress tracking and sizes&#10;        startTimeMillis = System.currentTimeMillis()&#10;        processedZipBytes.set(0)&#10;        totalZips = zips.size&#10;        zipOrdinals.clear()&#10;        zipSizes.clear()&#10;        zips.forEach { zip -&gt; zipSizes[zip] = try { File(zip).length() } catch (_: Exception) { 0L } }&#10;        totalZipBytes = zipSizes.values.sum()&#10;        // In lemma-only mode, process largest zips first&#10;        if (lemmaOnly) {&#10;            zips = zips.sortedByDescending { zipSizes[it] ?: 0L }.toTypedArray()&#10;        }&#10;        zips.forEachIndexed { index, zip -&gt; zipOrdinals[zip] = index + 1 }&#10;&#10;        // Log zip order with sizes so the user can verify sorting&#10;        val totalHuman = humanBytes(totalZipBytes)&#10;        LOGGER.info(&quot;Zip processing order (${zips.size} file(s), total ${totalHuman}):&quot;)&#10;        zips.forEachIndexed { idx, zip -&gt;&#10;            val size = zipSizes[zip] ?: 0L&#10;            LOGGER.info(String.format(Locale.ROOT, &quot;%d/%d: %s (%s)&quot;, idx + 1, zips.size, zip, humanBytes(size)))&#10;        }&#10;&#10;        if (sequentialInZip) {&#10;            if (outputFormat != OutputFormat.WORD2VEC &amp;&amp; outputFormat != OutputFormat.NOW) {&#10;                throw ParameterException(spec.commandLine(), &quot;--sequential is supported only with -f word2vec or -f now&quot;)&#10;            }&#10;        }&#10;&#10;        if (maxThreads &gt; 1) {&#10;            val foundry = getFoundryFromZipFileNames(zips)&#10;            val parallelism = (zipParallelism ?: maxThreads).coerceAtLeast(1)&#10;            LOGGER.info(&quot;Processing zips with ordered queue; parallelism=$parallelism; entries ${if (sequentialInZip) &quot;sequential&quot; else &quot;parallel&quot;}&quot;)&#10;            processZipsWithQueue(zips, foundry, parallelism)&#10;        } else {&#10;            LOGGER.info(&quot;Processing zip files sequentially&quot;)&#10;            Arrays.stream(zips).forEachOrdered { zipFilePath -&gt;&#10;                processZipFileSequentially((zipFilePath ?: &quot;&quot;).toString(), getFoundryFromZipFileNames(zips))&#10;            }&#10;        }&#10;&#10;        if (annotationWorkerPool != null) {&#10;            LOGGER.info(&quot;closing worker pool&quot;)&#10;            annotationWorkerPool?.close()&#10;        }&#10;        // Shutdown entry executor&#10;        entryExecutor?.shutdown()&#10;    }&#10;&#10;    private fun processZipsWithQueue(zips: Array&lt;String&gt;, foundry: String, parallelism: Int) {&#10;        val queue: java.util.concurrent.BlockingQueue&lt;String&gt; = java.util.concurrent.LinkedBlockingQueue()&#10;        zips.forEach { queue.put(it) }&#10;        val executor = Executors.newFixedThreadPool(parallelism)&#10;        val active = java.util.concurrent.atomic.AtomicInteger(0)&#10;        repeat(parallelism) {&#10;            executor.submit {&#10;                active.incrementAndGet()&#10;                try {&#10;                    while (true) {&#10;                        val zipPath = queue.poll(100, java.util.concurrent.TimeUnit.MILLISECONDS)&#10;                        if (zipPath == null) {&#10;                            if (queue.isEmpty()) break else continue&#10;                        }&#10;                        if (sequentialInZip) {&#10;                            processZipFileSequentially(zipPath, foundry)&#10;                        } else {&#10;                            processZipFile(zipPath, foundry)&#10;                        }&#10;                    }&#10;                } finally {&#10;                    active.decrementAndGet()&#10;                }&#10;            }&#10;        }&#10;        executor.shutdown()&#10;        try {&#10;            executor.awaitTermination(7, java.util.concurrent.TimeUnit.DAYS)&#10;        } catch (ie: InterruptedException) {&#10;            Thread.currentThread().interrupt()&#10;        }&#10;    }&#10;&#10;    // Convert a shell-like glob to a Regex: '*' -&gt; &quot;.*&quot;, '?' -&gt; '.', anchored full match&#10;    private fun globToRegex(glob: String): Regex {&#10;        val sb = StringBuilder(&quot;^&quot;)&#10;        glob.forEach { ch -&gt;&#10;            when (ch) {&#10;                '*' -&gt; sb.append(&quot;.*&quot;)&#10;                '?' -&gt; sb.append('.')&#10;                '.', '(', ')', '+', '|', '^', '$', '@', '%', '{', '}', '[', ']', '\\' -&gt; sb.append('\\').append(ch)&#10;                else -&gt; sb.append(ch)&#10;            }&#10;        }&#10;        sb.append('$')&#10;        return Regex(sb.toString())&#10;    }&#10;&#10;&#10;    private fun getTokenSpansFromMorho(morpho: MutableMap&lt;String, MorphoSpan&gt;): Array&lt;Span&gt; {&#10;        return morpho.keys.map { key -&gt;&#10;            val fromTo = key.split(&quot;-&quot;)&#10;            Span(fromTo[0].toInt(), fromTo[1].toInt())&#10;        }.sortedBy {&#10;            it.from&#10;        }.toTypedArray()&#10;    }&#10;&#10;    private fun getFoundryFromZipFileName(zipFileName: String): String {&#10;        if (!zipFileName.matches(Regex(&quot;.*\\.([^/.]+)\\.zip$&quot;))) {&#10;            return &quot;base&quot;&#10;        }&#10;        return zipFileName.replace(Regex(&quot;.*\\.([^/.]+)\\.zip$&quot;), &quot;$1&quot;)&#10;    }&#10;&#10;    private fun getFoundryFromZipFileNames(zipFileNames: Array&lt;String&gt;): String {&#10;        for (zipFileName in zipFileNames) {&#10;            val foundry = getFoundryFromZipFileName(zipFileName)&#10;            if (foundry != &quot;base&quot;) {&#10;                return foundry&#10;            }&#10;        }&#10;        return &quot;base&quot;&#10;    }&#10;&#10;    private fun processZipFile(zipFilePath: String, foundry: String = &quot;base&quot;) {&#10;        val ord = zipOrdinals[zipFilePath] ?: 0&#10;        val size = zipSizes[zipFilePath] ?: 0L&#10;        LOGGER.info(&quot;Processing zip ${if (ord&gt;0) ord else &quot;?&quot;}/$totalZips: ${zipFilePath} (${humanBytes(size)}) in thread ${Thread.currentThread().threadId()}&quot;)&#10;        LOGGER.info(&quot;Foundry: $foundry $dbFactory&quot;)&#10;        if (outputFormat == OutputFormat.KORAPXML &amp;&amp; dbFactory == null) {&#10;            var targetFoundry = &quot;base&quot;&#10;            if (taggerName != null) {&#10;                val tagger = AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER) as TaggerToolBridge?&#10;                if (tagger != null) {&#10;                    targetFoundry = tagger.foundry&#10;                }&#10;            } else if (parserName != null) {&#10;                targetFoundry = parserName!!&#10;            }&#10;            dbFactory = DocumentBuilderFactory.newInstance()&#10;            dBuilder = dbFactory!!.newDocumentBuilder()&#10;            val outputMorphoZipFileName =&#10;                if (parserName != null)&#10;                    zipFilePath.replace(Regex(&quot;(\\.(opennlp|marmot|tree_tagger|corenlp|spacy))?\\.zip$&quot;), &quot;.&quot;.plus(parserName).plus(&quot;.zip&quot;))&#10;                else&#10;                    zipFilePath.replace(Regex(&quot;\\.zip$&quot;), &quot;.&quot;.plus(targetFoundry).plus(&quot;.zip&quot;))&#10;            if (File(outputMorphoZipFileName).exists() &amp;&amp; !overwrite) {&#10;                LOGGER.severe(&quot;Output file $outputMorphoZipFileName already exists. Use --overwrite to overwrite.&quot;)&#10;                exitProcess(1)&#10;            }&#10;            val fileOutputStream = FileOutputStream(outputMorphoZipFileName)&#10;            morphoZipOutputStream = ZipArchiveOutputStream(fileOutputStream).apply {&#10;                setUseZip64(Zip64Mode.Always)&#10;            }&#10;        }&#10;        if (zipFilePath.hasCorrespondingBaseZip()) {&#10;            val relatedZips = arrayOf(zipFilePath, zipFilePath.correspondingBaseZip()!!)&#10;            // Process related zips one after another to keep the ZipFile lifetime strictly bounded&#10;            relatedZips.forEach { zip -&gt;&#10;                ZipFile(zip).use { zipFile -&gt;&#10;                    processZipEntriesWithPool(zipFile, foundry, true)&#10;                }&#10;            }&#10;        } else {&#10;            ZipFile(zipFilePath).use { zipFile -&gt;&#10;                processZipEntriesWithPool(zipFile, foundry, false)&#10;            }&#10;        }&#10;        if (outputFormat == OutputFormat.KORAPXML) {&#10;            morphoZipOutputStream!!.close()&#10;        }&#10;        logZipProgress(zipFilePath)&#10;    }&#10;&#10;    private fun processZipFileSequentially(zipFilePath: String, foundry: String = &quot;base&quot;) {&#10;        val ord = zipOrdinals[zipFilePath] ?: 0&#10;        val size = zipSizes[zipFilePath] ?: 0L&#10;        LOGGER.info(&quot;Processing zip ${if (ord&gt;0) ord else &quot;?&quot;}/$totalZips: ${zipFilePath} (${humanBytes(size)}) in thread ${Thread.currentThread().threadId()}&quot;)&#10;        if (zipFilePath.hasCorrespondingBaseZip()) {&#10;            // Process the two related zips strictly sequentially to limit memory growth&#10;            val zips = arrayOf(zipFilePath, zipFilePath.correspondingBaseZip()!!)&#10;            zips.forEach { zip -&gt;&#10;                ZipFile(zip).use { zipFile -&gt;&#10;                    // Iterate entries in a deterministic order to keep related files close together&#10;                    zipFile.stream()&#10;                        .filter { extractMetadataRegex.isNotEmpty() || !it.name.contains(&quot;header.xml&quot;) }&#10;                        .sorted(Comparator.comparing&lt;ZipEntry, String&gt; { it.name })&#10;                        .forEachOrdered { zipEntry -&gt;&#10;                            processZipEntry(zipFile, foundry, zipEntry, true)&#10;                        }&#10;                }&#10;            }&#10;        } else {&#10;            ZipFile(zipFilePath).use { zipFile -&gt;&#10;                zipFile.stream()&#10;                    .filter { extractMetadataRegex.isNotEmpty() || !it.name.contains(&quot;header.xml&quot;) }&#10;                    .sorted(Comparator.comparing&lt;ZipEntry, String&gt; { it.name })&#10;                    .forEachOrdered { zipEntry -&gt;&#10;                        processZipEntry(zipFile, foundry, zipEntry, false)&#10;                    }&#10;            }&#10;        }&#10;        logZipProgress(zipFilePath)&#10;    }&#10;&#10;    private fun logZipProgress(zipFilePath: String) {&#10;        try {&#10;            val size = zipSizes[zipFilePath] ?: 0L&#10;            val done = processedZipBytes.addAndGet(size)&#10;            val total = if (totalZipBytes &gt; 0) totalZipBytes else 1L&#10;            val elapsedMs = (System.currentTimeMillis() - startTimeMillis).coerceAtLeast(1)&#10;            val speedBytesPerSec = (done * 1000.0) / elapsedMs&#10;            val remaining = (total - done).coerceAtLeast(0)&#10;            val etaSeconds = if (speedBytesPerSec &gt; 0.0) (remaining / speedBytesPerSec).toLong() else -1L&#10;            val ord = zipOrdinals[zipFilePath] ?: 0&#10;            val pct = (done * 100.0 / total).coerceIn(0.0, 100.0)&#10;            val humanSpeed = String.format(Locale.ROOT, &quot;%.2f MB/s&quot;, speedBytesPerSec / (1024.0 * 1024.0))&#10;            val etaStr = if (etaSeconds &gt;= 0) formatDuration(etaSeconds) else &quot;unknown&quot;&#10;            LOGGER.info(&#10;                &quot;Finished zip ${if (ord&gt;0) ord else &quot;?&quot;}/$totalZips: ${zipFilePath} &quot; +&#10;                        &quot;(${humanBytes(size)}). Progress: ${String.format(Locale.ROOT, &quot;%.1f&quot;, pct)}%%, &quot; +&#10;                        &quot;ETA ${etaStr} at ${humanSpeed}&quot;&#10;            )&#10;        } catch (e: Exception) {&#10;            LOGGER.fine(&quot;Failed to log zip progress for $zipFilePath: ${e.message}&quot;)&#10;        }&#10;    }&#10;&#10;    private fun humanBytes(bytes: Long): String {&#10;        if (bytes &lt; 1024) return &quot;$bytes B&quot;&#10;        val kb = bytes / 1024.0&#10;        if (kb &lt; 1024) return String.format(Locale.ROOT, &quot;%.1f KB&quot;, kb)&#10;        val mb = kb / 1024.0&#10;        if (mb &lt; 1024) return String.format(Locale.ROOT, &quot;%.1f MB&quot;, mb)&#10;        val gb = mb / 1024.0&#10;        return String.format(Locale.ROOT, &quot;%.1f GB&quot;, gb)&#10;    }&#10;&#10;    private fun formatDuration(seconds: Long): String {&#10;        var s = seconds&#10;        val h = s / 3600; s %= 3600&#10;        val m = s / 60; val sec = s % 60&#10;        return String.format(Locale.ROOT, &quot;%02d:%02d:%02d&quot;, h, m, sec)&#10;    }&#10;&#10;    private fun processZipEntriesWithPool(zipFile: ZipFile, foundry: String, waitForMorpho: Boolean) {&#10;        // Collect entries first to avoid lazy evaluation surprises, filter header.xml unless metadata extraction is requested&#10;        val entries: MutableList&lt;ZipEntry&gt; = ArrayList()&#10;        val enumEntries = zipFile.entries()&#10;        while (enumEntries.hasMoreElements()) {&#10;            val e = enumEntries.nextElement()&#10;            if (extractMetadataRegex.isEmpty() &amp;&amp; e.name.contains(&quot;header.xml&quot;)) continue&#10;            entries.add(e)&#10;        }&#10;        if (entries.isEmpty()) return&#10;&#10;        // If only one thread requested, do sequential to avoid pool overhead&#10;        if (maxThreads &lt;= 1) {&#10;            entries.forEach { entry -&gt; processZipEntry(zipFile, foundry, entry, waitForMorpho) }&#10;            return&#10;        }&#10;&#10;        // Submit all entry tasks to the shared executor and await completion before closing the zip&#10;        val latch = java.util.concurrent.CountDownLatch(entries.size)&#10;        entries.forEach { entry -&gt;&#10;            entryExecutor?.execute {&#10;                try {&#10;                    processZipEntry(zipFile, foundry, entry, waitForMorpho)&#10;                } catch (t: Throwable) {&#10;                    LOGGER.warning(&quot;Failed to process entry ${entry.name}: ${t.message}&quot;)&#10;                } finally {&#10;                    latch.countDown()&#10;                }&#10;            }&#10;        }&#10;        try {&#10;            latch.await()&#10;        } catch (ie: InterruptedException) {&#10;            Thread.currentThread().interrupt()&#10;        }&#10;    }&#10;&#10;    fun processZipEntry(zipFile: ZipFile, _foundry: String, zipEntry: ZipEntry, passedWaitForMorpho: Boolean) {&#10;        var foundry = _foundry&#10;        var waitForMorpho = passedWaitForMorpho&#10;        LOGGER.finer(&quot;Processing ${zipEntry.name} in thread ${Thread.currentThread().threadId()}&quot;)&#10;        if (taggerName != null &amp;&amp; !taggerToolBridges.containsKey(Thread.currentThread().threadId())) {&#10;            val tagger = AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER) as TaggerToolBridge?&#10;            taggerToolBridges[Thread.currentThread().threadId()] = tagger&#10;            if (tagger != null) {&#10;                foundry = tagger.foundry&#10;            }&#10;&#10;        }&#10;        if (parserName != null &amp;&amp; !parserToolBridges.containsKey(Thread.currentThread().threadId())) {&#10;            val parser = AnnotationToolBridgeFactory.getAnnotationToolBridge(parserName!!, parserModel!!, LOGGER) as ParserToolBridge?&#10;            parserToolBridges[Thread.currentThread().threadId()] = parser&#10;            if (parser != null) {&#10;                foundry = &quot;$foundry dependency:${parser.foundry}&quot;&#10;                LOGGER.fine(&quot;Initialized parser ${parserName} with foundry $foundry in thread ${Thread.currentThread().threadId()}&quot;)&#10;            }&#10;        }&#10;&#10;        try {&#10;            if (zipEntry.name.matches(Regex(&quot;.*(data|tokens|structure|morpho)\\.xml$&quot;))) {&#10;                // Ensure the entry stream and reader are closed to avoid native memory buildup&#10;                val dbFactory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance()&#10;                val dBuilder: DocumentBuilder = dbFactory.newDocumentBuilder()&#10;                // In lemma-only mode, skip parsing data.xml entirely to reduce memory pressure&#10;                if (lemmaOnly &amp;&amp; zipEntry.name.endsWith(&quot;data.xml&quot;)) {&#10;                    return&#10;                }&#10;                val doc: Document = try {&#10;                    zipFile.getInputStream(zipEntry).use { inputStream -&gt;&#10;                        XMLCommentFilterReader(inputStream, &quot;UTF-8&quot;).use { reader -&gt;&#10;                            dBuilder.parse(InputSource(reader))&#10;                        }&#10;                    }&#10;                } catch (e: SAXParseException) {&#10;                    LOGGER.warning(&quot;Error parsing file: &quot; + zipEntry.name + &quot; &quot; + e.message)&#10;                    return&#10;                }&#10;&#10;                doc.documentElement.normalize()&#10;                val docId: String = doc.documentElement.getAttribute(&quot;docid&quot;)&#10;                if (siglePattern != null &amp;&amp; !Regex(siglePattern!!).containsMatchIn(docId)) {&#10;                    return&#10;                }&#10;                // LOGGER.info(&quot;Processing file: &quot; + zipEntry.getName())&#10;                val fileName = zipEntry.name.replace(Regex(&quot;.*?/([^/]+\\.xml)$&quot;), &quot;$1&quot;)&#10;                when (fileName) {&#10;                    &quot;data.xml&quot; -&gt; {&#10;                        if (!lemmaOnly) {&#10;                            val textsList: NodeList = doc.getElementsByTagName(&quot;text&quot;)&#10;                            if (textsList.length &gt; 0) {&#10;                                texts[docId] = NonBmpString(textsList.item(0).textContent)&#10;                            }&#10;                        }&#10;                    }&#10;&#10;                    &quot;structure.xml&quot; -&gt; {&#10;                        val spans: NodeList = doc.getElementsByTagName(&quot;span&quot;)&#10;                        if (extractAttributesRegex.isNotEmpty())&#10;                            extraFeatures[docId] = extractMiscSpans(spans)&#10;                        sentences[docId] = extractSentenceSpans(spans)&#10;&#10;                    }&#10;&#10;                    &quot;tokens.xml&quot; -&gt; {&#10;                        if (!fnames.contains(docId)) {&#10;                            fnames[docId] = zipEntry.name&#10;                        }&#10;                        val tokenSpans: NodeList = doc.getElementsByTagName(&quot;span&quot;)&#10;                        tokens[docId] = extractSpans(tokenSpans)&#10;                    }&#10;&#10;                    &quot;morpho.xml&quot; -&gt; {&#10;                        waitForMorpho = true&#10;                        fnames[docId] = zipEntry.name&#10;                        val fsSpans: NodeList = doc.getElementsByTagName(&quot;span&quot;)&#10;                        morpho[docId] = extractMorphoSpans(fsSpans)&#10;                        tokens[docId] = extractSpans(fsSpans)&#10;                    }&#10;                }&#10;&#10;                val morphoRequired = waitForMorpho || useLemma || taggerName != null || parserName != null || outputFormat == OutputFormat.KORAPXML&#10;                // For lemma-only/lemma-based word2vec/now, we can proceed without full text&#10;                val textRequired = when (outputFormat) {&#10;                    OutputFormat.WORD2VEC, OutputFormat.NOW -&gt; !(useLemma || lemmaOnly)&#10;                    else -&gt; true&#10;                }&#10;                if ((texts[docId] != null || !textRequired) &amp;&amp; sentences[docId] != null &amp;&amp; tokens[docId] != null&#10;                    &amp;&amp; (!morphoRequired || morpho[docId] != null)&#10;                    &amp;&amp; (extractMetadataRegex.isEmpty() || metadata[docId] != null)&#10;                ) {&#10;                    // Be quiet on INFO; per-text logs only on FINE and below&#10;                    LOGGER.fine(&quot;Processing text: $docId in thread ${Thread.currentThread().threadId()}&quot;)&#10;                    processText(docId, foundry)&#10;                }&#10;            } else if (extractMetadataRegex.isNotEmpty() &amp;&amp; zipEntry.name.matches(Regex(&quot;.*/header\\.xml$&quot;))) {&#10;                //LOGGER.info(&quot;Processing header file: &quot; + zipEntry.name)&#10;                val text = zipFile.getInputStream(zipEntry).bufferedReader().use { it.readText() }&#10;                val docId =&#10;                    Regex(&quot;&lt;textSigle&gt;([^&lt;]+)&lt;/textSigle&gt;&quot;).find(text)?.destructured?.component1()&#10;                        ?.replace(Regex(&quot;/&quot;), &quot;_&quot;)&#10;                LOGGER.fine(&quot;Processing header file: &quot; + zipEntry.name + &quot; docId: &quot; + docId)&#10;                val meta = ArrayList&lt;String&gt;()&#10;                extractMetadataRegex.forEach { regex -&gt;&#10;                    val match = Regex(regex).find(text)&#10;                    if (match != null) {&#10;                        meta.add(match.destructured.component1())&#10;                    }&#10;                }&#10;                if (meta.isNotEmpty() &amp;&amp; docId != null) {&#10;                    metadata[docId] = meta.toTypedArray()&#10;                    val morphoRequired = waitForMorpho || useLemma || taggerName != null || parserName != null || outputFormat == OutputFormat.KORAPXML&#10;                    val textRequired = when (outputFormat) {&#10;                        OutputFormat.WORD2VEC, OutputFormat.NOW -&gt; !(useLemma || lemmaOnly)&#10;                        else -&gt; true&#10;                    }&#10;                    if ((texts[docId] != null || !textRequired) &amp;&amp; sentences[docId] != null &amp;&amp; tokens[docId] != null&#10;                         &amp;&amp; (!morphoRequired || morpho[docId] != null)&#10;                     ) {&#10;                        // Be quiet on INFO; per-text logs only on FINE and below&#10;                        LOGGER.fine(&quot;Processing text (meta-ready): $docId in thread ${Thread.currentThread().threadId()}&quot;)&#10;                        processText(docId, foundry)&#10;                    }&#10;                }&#10;            }&#10;        } catch (e: Exception) {&#10;            e.printStackTrace()&#10;        }&#10;    }&#10;&#10;    private fun processText(&#10;        docId: String,&#10;        foundry: String,&#10;    ) {&#10;        LOGGER.fine(&quot;Processing text: $docId in thread ${Thread.currentThread().threadId()}&quot;)&#10;        var morphoFoundry = getMorphoFoundry()&#10;        val output =&#10;        if (outputFormat == OutputFormat.WORD2VEC) {&#10;            lmTrainingOutput(docId)&#10;        } else if (outputFormat == OutputFormat.NOW) {&#10;            nowOutput(docId)&#10;        } else {&#10;            if (taggerToolBridges[Thread.currentThread().threadId()] != null) {&#10;                morpho[docId] = taggerToolBridges[Thread.currentThread().threadId()]!!.tagText(&#10;                    tokens[docId]!!,&#10;                    sentences[docId],&#10;                    texts[docId]!!&#10;                )&#10;&#10;            }&#10;            if (parserToolBridges[Thread.currentThread().threadId()] != null) {&#10;                if (morpho[docId] == null) {&#10;                    LOGGER.severe(&quot;No morpho data for $docId&quot;)&#10;                    //exitProcess(1)&#10;                }&#10;                LOGGER.finer(&quot;Parsing text: $docId in thread ${Thread.currentThread().threadId()}&quot;)&#10;                morpho[docId] = parserToolBridges[Thread.currentThread().threadId()]!!.parseText(&#10;                    tokens[docId]!!,&#10;                    morpho[docId],&#10;                    sentences[docId],&#10;                    texts[docId]!!&#10;                )&#10;                LOGGER.finer(&quot;Parsed text: $docId in thread ${Thread.currentThread().threadId()}&quot;)&#10;            }&#10;            if (outputFormat == OutputFormat.KORAPXML &amp;&amp; annotationWorkerPool == null) {&#10;                korapXmlOutput(getMorphoFoundry(), docId)&#10;            } else {&#10;                conlluOutput(foundry, docId)&#10;            }&#10;        }&#10;&#10;        if (annotationWorkerPool != null) {&#10;            annotationWorkerPool?.pushToQueue(output.append(&quot;\n# eot\n&quot;).toString())&#10;            // Release internal char[] early&#10;            output.setLength(0)&#10;        } else if (outputFormat != OutputFormat.KORAPXML) {&#10;            synchronized(System.out) {&#10;                println(output.toString())&#10;            }&#10;            // Release internal char[] early&#10;            output.setLength(0)&#10;        } else {&#10;            korapXmlOutput(foundry, docId)&#10;        }&#10;&#10;&#10;        arrayOf(tokens, texts, sentences, morpho, fnames, metadata, extraFeatures).forEach { map -&gt;&#10;            if (map === morpho) {&#10;                // Clear inner map to release references early&#10;                morpho[docId]?.clear()&#10;            }&#10;            map.remove(docId)&#10;        }&#10;&#10;        // Periodic GC hint after processing many docs (lightweight safeguard)&#10;        if ((processedDocs.incrementAndGet() % 2000) == 0) {&#10;            LOGGER.fine(&quot;Processed ${processedDocs.get()} docs – requesting GC hint&quot;)&#10;            System.gc()&#10;        }&#10;        // Memory / cache statistics logging&#10;        if (memStatsInterval &gt; 0) {&#10;            val count = processedDocs.get()&#10;            if (count % memStatsInterval == 0) {&#10;                logMemoryStats(count)&#10;            }&#10;        }&#10;&#10;        if (outputFormat == OutputFormat.KORAPXML) {&#10;            val entryPath = if (parserName != null)  docId.replace(Regex(&quot;[_.]&quot;), &quot;/&quot;).plus(&quot;/$parserName/&quot;).plus(&quot;dependency.xml&quot;)&#10;            else&#10;                docId.replace(Regex(&quot;[_.]&quot;), &quot;/&quot;).plus(&quot;/$morphoFoundry/&quot;).plus(&quot;morpho.xml&quot;)&#10;            val zipEntry = ZipArchiveEntry(entryPath)&#10;            zipEntry.unixMode = ZIP_ENTRY_UNIX_MODE&#10;            synchronized(morphoZipOutputStream!!) {&#10;                morphoZipOutputStream!!.putArchiveEntry(zipEntry)&#10;                morphoZipOutputStream!!.write(output.toString().toByteArray())&#10;                morphoZipOutputStream!!.closeArchiveEntry()&#10;            }&#10;            output.clear()&#10;        }&#10;    }&#10;&#10;    private fun getMorphoFoundry() = taggerToolBridges[Thread.currentThread().threadId()]?.foundry ?: &quot;base&quot;&#10;&#10;    private fun logMemoryStats(count: Int) {&#10;        try {&#10;            val rt = Runtime.getRuntime()&#10;            val used = (rt.totalMemory() - rt.freeMemory()) / (1024 * 1024)&#10;            val total = rt.totalMemory() / (1024 * 1024)&#10;            val max = rt.maxMemory() / (1024 * 1024)&#10;            LOGGER.info(&#10;                &quot;MEM-STATS docs=${count} usedMB=${used} totalMB=${total} maxMB=${max} &quot; +&#10;                        &quot;maps{texts=${texts.size},tokens=${tokens.size},sentences=${sentences.size},morpho=${morpho.size}}&quot;&#10;            )&#10;        } catch (e: Exception) {&#10;            LOGGER.warning(&quot;Failed to log memory stats: ${e.message}&quot;)&#10;        }&#10;    }&#10;&#10;    private fun korapXmlDependencyOutput(foundry: String, docId: String): StringBuilder {&#10;        val doc: Document = dBuilder!!.newDocument()&#10;&#10;        // Root element&#10;        val layer = doc.createElement(&quot;layer&quot;)&#10;        layer.setAttribute(&quot;xmlns&quot;, &quot;http://ids-mannheim.de/ns/KorAP&quot;)&#10;        layer.setAttribute(&quot;version&quot;, &quot;KorAP-0.4&quot;)&#10;        layer.setAttribute(&quot;docid&quot;, docId)&#10;        doc.appendChild(layer)&#10;&#10;        val spanList = doc.createElement(&quot;spanList&quot;)&#10;        layer.appendChild(spanList)&#10;&#10;        var i = 0&#10;        var s = 0&#10;        var n = 0&#10;        val sortedKeys = morpho[docId]?.keys?.sortedBy { it.split(&quot;-&quot;)[0].toInt() }&#10;&#10;        sortedKeys?.forEach { spanString -&gt;&#10;            val mfs = morpho[docId]?.get(spanString)&#10;            val offsets = spanString.split(&quot;-&quot;)&#10;            if(offsets.size != 2) {&#10;                LOGGER.warning(&quot;Invalid span: $spanString in $docId&quot;)&#10;                return@forEach&#10;            }&#10;            if (offsets[0].toInt() &gt; sentences[docId]!!.elementAt(s).to) {&#10;                s++&#10;                n = i&#10;            }&#10;            i++&#10;            if (mfs!!.deprel == &quot;_&quot;) {&#10;                return@forEach&#10;            }&#10;&#10;            val spanNode = doc.createElement(&quot;span&quot;)&#10;            spanNode.setAttribute(&quot;id&quot;, &quot;s${s + 1}_n${i - n}&quot;)&#10;            spanNode.setAttribute(&quot;from&quot;, offsets[0])&#10;            spanNode.setAttribute(&quot;to&quot;, offsets[1])&#10;&#10;            // rel element&#10;            val rel = doc.createElement(&quot;rel&quot;)&#10;            rel.setAttribute(&quot;label&quot;, mfs.deprel)&#10;&#10;            // inner span element&#10;            val innerSpan = doc.createElement(&quot;span&quot;)&#10;            val headInt = if(mfs.head == &quot;_&quot;) 0 else parseInt(mfs.head) - 1&#10;            if (headInt &lt; 0) {&#10;                innerSpan.setAttribute(&quot;from&quot;, sentences[docId]!!.elementAt(s).from.toString())&#10;                innerSpan.setAttribute(&quot;to&quot;,  sentences[docId]!!.elementAt(s).to.toString())&#10;            } else {&#10;                if (headInt + n &gt;= morpho[docId]!!.size) {&#10;                    LOGGER.warning(&quot;Head index out of bounds: ${headInt+n} &gt;= ${morpho[docId]!!.size} in $docId&quot;)&#10;                    return@forEach&#10;                } else {&#10;                    val destSpanString = sortedKeys.elementAt(headInt + n)&#10;                    val destOffsets = destSpanString.split(&quot;-&quot;)&#10;                    innerSpan.setAttribute(&quot;from&quot;, destOffsets[0])&#10;                    innerSpan.setAttribute(&quot;to&quot;, destOffsets[1])&#10;                }&#10;            }&#10;            rel.appendChild(innerSpan)&#10;            spanNode.appendChild(rel)&#10;            spanList.appendChild(spanNode)&#10;        }&#10;        val transformerFactory = TransformerFactory.newInstance()&#10;        val transformer = transformerFactory.newTransformer()&#10;        transformer.setOutputProperty(OutputKeys.INDENT, &quot;yes&quot;)&#10;        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, &quot;no&quot;)&#10;        transformer.setOutputProperty(&quot;{http://xml.apache.org/xslt}indent-amount&quot;, &quot;1&quot;)&#10;        val domSource = DOMSource(doc)&#10;        val streamResult = StreamResult(StringWriter())&#10;        transformer.transform(domSource, streamResult)&#10;&#10;        return StringBuilder(streamResult.writer.toString())&#10;    }&#10;&#10;    private fun korapXmlOutput(foundry: String, docId: String): StringBuilder {&#10;        return if (parserName != null) {&#10;            korapXmlDependencyOutput(foundry, docId)&#10;        } else {&#10;            korapXmlMorphoOutput(foundry, docId)&#10;        }&#10;    }&#10;&#10;    private fun korapXmlMorphoOutput(foundry: String, docId: String): StringBuilder {&#10;            val doc: Document = dBuilder!!.newDocument()&#10;&#10;        // Root element&#10;        val layer = doc.createElement(&quot;layer&quot;)&#10;        layer.setAttribute(&quot;xmlns&quot;, &quot;http://ids-mannheim.de/ns/KorAP&quot;)&#10;        layer.setAttribute(&quot;version&quot;, &quot;KorAP-0.4&quot;)&#10;        layer.setAttribute(&quot;docid&quot;, docId)&#10;        doc.appendChild(layer)&#10;&#10;        val spanList = doc.createElement(&quot;spanList&quot;)&#10;        layer.appendChild(spanList)&#10;&#10;        var i = 0&#10;        morpho[docId]?.forEach { (spanString, mfs) -&gt;&#10;            i++&#10;            val offsets = spanString.split(&quot;-&quot;)&#10;            val spanNode = doc.createElement(&quot;span&quot;)&#10;            spanNode.setAttribute(&quot;id&quot;, &quot;t_$i&quot;)&#10;            spanNode.setAttribute(&quot;from&quot;, offsets[0])&#10;            spanNode.setAttribute(&quot;to&quot;, offsets[1])&#10;&#10;            // fs element&#10;            val fs = doc.createElement(&quot;fs&quot;)&#10;            fs.setAttribute(&quot;type&quot;, &quot;lex&quot;)&#10;            fs.setAttribute(&quot;xmlns&quot;, &quot;http://www.tei-c.org/ns/1.0&quot;)&#10;            spanNode.appendChild(fs)&#10;            val f = doc.createElement(&quot;f&quot;)&#10;            f.setAttribute(&quot;name&quot;, &quot;lex&quot;)&#10;            fs.appendChild(f)&#10;&#10;            // Inner fs element&#10;            val innerFs = doc.createElement(&quot;fs&quot;)&#10;            f.appendChild(innerFs)&#10;&#10;            if (mfs.lemma != &quot;_&quot;) {&#10;                val innerF = doc.createElement(&quot;f&quot;)&#10;                innerF.setAttribute(&quot;name&quot;, &quot;lemma&quot;)&#10;                innerF.textContent = mfs.lemma&#10;                innerFs.appendChild(innerF)&#10;            }&#10;            if (mfs.upos != &quot;_&quot;) {&#10;                val innerF = doc.createElement(&quot;f&quot;)&#10;                innerF.setAttribute(&quot;name&quot;, &quot;upos&quot;)&#10;                innerF.textContent = mfs.upos&#10;                innerFs.appendChild(innerF)&#10;            }&#10;            if (mfs.xpos != &quot;_&quot;) {&#10;                val innerF = doc.createElement(&quot;f&quot;)&#10;                innerF.setAttribute(&quot;name&quot;, &quot;pos&quot;)&#10;                innerF.textContent = mfs.xpos&#10;                innerFs.appendChild(innerF)&#10;            }&#10;            if (mfs.feats != &quot;_&quot;) {&#10;                val innerF = doc.createElement(&quot;f&quot;)&#10;                innerF.setAttribute(&quot;name&quot;, &quot;msd&quot;)&#10;                innerF.textContent = mfs.feats&#10;                innerFs.appendChild(innerF)&#10;            }&#10;            if (mfs.misc != &quot;_&quot; &amp;&amp; mfs.misc!!.matches(Regex(&quot;^[0-9.]+$&quot;))) {&#10;                val innerF = doc.createElement(&quot;f&quot;)&#10;                innerF.setAttribute(&quot;name&quot;, &quot;certainty&quot;)&#10;                innerF.textContent = mfs.misc&#10;                innerFs.appendChild(innerF)&#10;            }&#10;&#10;            spanList.appendChild(spanNode)&#10;        }&#10;        val transformerFactory = TransformerFactory.newInstance()&#10;        val transformer = transformerFactory.newTransformer()&#10;        transformer.setOutputProperty(OutputKeys.INDENT, &quot;yes&quot;)&#10;        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, &quot;no&quot;)&#10;        transformer.setOutputProperty(&quot;{http://xml.apache.org/xslt}indent-amount&quot;, &quot;1&quot;)&#10;        val domSource = DOMSource(doc)&#10;        val streamResult = StreamResult(StringWriter())&#10;        transformer.transform(domSource, streamResult)&#10;&#10;        return StringBuilder(streamResult.writer.toString())&#10;&#10;    }&#10;&#10;    private fun conlluOutput(foundry: String, docId: String): StringBuilder {&#10;        var token_index = 0&#10;        var real_token_index = 0&#10;        var sentence_index = 0&#10;        val output: StringBuilder&#10;        output =&#10;            StringBuilder(&quot;# foundry = $foundry\n# filename = ${fnames[docId]}\n# text_id = $docId\n&quot;).append(&#10;                tokenOffsetsInSentence(&#10;                    sentences, docId, sentence_index, real_token_index, tokens&#10;                )&#10;            )&#10;        if (extractMetadataRegex.isNotEmpty()) {&#10;            output.append(metadata[docId]?.joinToString(&quot;\t&quot;, prefix = &quot;# metadata=&quot;, postfix = &quot;\n&quot;) ?: &quot;&quot;)&#10;        }&#10;        var previousSpanStart = 0&#10;        tokens[docId]?.forEach { span -&gt;&#10;            token_index++&#10;            if (sentence_index &gt;= sentences[docId]!!.size || span.from &gt;= sentences[docId]!![sentence_index].to) {&#10;                output.append(&quot;\n&quot;)&#10;                sentence_index++&#10;                token_index = 1&#10;                output.append(&#10;                    tokenOffsetsInSentence(&#10;                        sentences, docId, sentence_index, real_token_index, tokens&#10;                    )&#10;                )&#10;            }&#10;            if (extractAttributesRegex.isNotEmpty() &amp;&amp; extraFeatures[docId] != null) {&#10;                for (i in previousSpanStart until span.from + 1) {&#10;                    if (extraFeatures[docId]?.containsKey(&quot;$i&quot;) == true) {&#10;                        output.append(extraFeatures[docId]!![&quot;$i&quot;])&#10;                        extraFeatures[docId]!!.remove(&quot;$i&quot;)&#10;                    }&#10;                }&#10;                previousSpanStart = span.from + 1&#10;            }&#10;            if (morpho[docId]?.containsKey(&quot;${span.from}-${span.to}&quot;) == true) {&#10;                val mfs = morpho[docId]!![&quot;${span.from}-${span.to}&quot;]&#10;                if (span.to &gt; texts[docId]!!.length) {&#10;                    span.to = texts[docId]!!.length&#10;                    LOGGER.warning(&#10;                        &quot;Offset error: could not retrieve token at ${span.from}-${span.to} – ending with: ${&#10;                            texts[docId]!!.substring(&#10;                                span.from,&#10;                                span.to&#10;                            )&#10;                        }&quot;&#10;                    )&#10;                }&#10;                output.append(&#10;                    printConlluToken(&#10;                        token_index,&#10;                        texts[docId]!!.substring(span.from, span.to),&#10;                        mfs!!.lemma!!,&#10;                        mfs.upos!!,&#10;                        mfs.xpos!!,&#10;                        mfs.feats!!,&#10;                        mfs.head!!,&#10;                        mfs.deprel!!,&#10;                        mfs.deps!!,&#10;                        mfs.misc!!,&#10;                        columns&#10;                    )&#10;                )&#10;            } else {&#10;                output.append(&#10;                    printConlluToken(&#10;                        token_index, texts[docId]!!.substring(span.from, span.to), columns = columns&#10;                    )&#10;                )&#10;            }&#10;            real_token_index++&#10;        }&#10;        return output&#10;    }&#10;&#10;    private fun lmTrainingOutput(docId: String): StringBuilder {&#10;        var token_index = 0&#10;        var real_token_index = 0&#10;        var sentence_index = 0&#10;        val output: StringBuilder&#10;        output = StringBuilder()&#10;        if (extractMetadataRegex.isNotEmpty()) {&#10;            output.append(metadata[docId]?.joinToString(&quot;\t&quot;, postfix = &quot;\t&quot;) ?: &quot;&quot;)&#10;        }&#10;        // If no text is available (e.g., lemma-only mode), emit lemmas&#10;        if (texts[docId] == null) {&#10;            tokens[docId]?.forEach { span -&gt;&#10;                val key = &quot;${span.from}-${span.to}&quot;&#10;                val lemmaVal = morpho[docId]?.get(key)?.lemma&#10;                output.append((lemmaVal?.takeIf { it != &quot;_&quot; } ?: &quot;_&quot;), &quot; &quot;)&#10;            }&#10;            if (output.isNotEmpty()) output.deleteCharAt(output.length - 1)&#10;            return output&#10;        }&#10;        tokens[docId]?.forEach { span -&gt;&#10;            token_index++&#10;            if (sentences[docId] != null &amp;&amp; (sentence_index &gt;= sentences[docId]!!.size || span.from &gt;= sentences[docId]!![sentence_index].to)) {&#10;                if (output.isNotEmpty()) {&#10;                    output.setCharAt(output.length - 1, '\n')&#10;                } else {&#10;                    output.append(&quot;\n&quot;)&#10;                }&#10;                if (extractMetadataRegex.isNotEmpty() &amp;&amp; real_token_index &lt; tokens[docId]!!.size - 1) {&#10;                    output.append(metadata[docId]?.joinToString(&quot;\t&quot;, postfix = &quot;\t&quot;) ?: &quot;&quot;)&#10;                }&#10;                sentence_index++&#10;            }&#10;            // Bounds safety&#10;            val safeFrom = span.from.coerceIn(0, texts[docId]!!.length)&#10;            val safeTo = span.to.coerceIn(safeFrom, texts[docId]!!.length)&#10;            if (useLemma &amp;&amp; morpho[docId] != null) {&#10;                val key = &quot;${span.from}-${span.to}&quot;&#10;                val lemmaVal = morpho[docId]!![key]?.lemma&#10;                if (lemmaVal != null &amp;&amp; lemmaVal != &quot;_&quot;) {&#10;                    output.append(lemmaVal)&#10;                    output.append(' ')&#10;                } else {&#10;                    texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)&#10;                    output.append(' ')&#10;                }&#10;            } else {&#10;                texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)&#10;                output.append(' ')&#10;            }&#10;            real_token_index++&#10;        }&#10;        if (output.isNotEmpty()) {&#10;            output.deleteCharAt(output.length - 1)&#10;        }&#10;        return output&#10;    }&#10;&#10;    private fun nowOutput(docId: String): StringBuilder {&#10;        var token_index = 0&#10;        var real_token_index = 0&#10;        var sentence_index = 0&#10;        val output: StringBuilder = StringBuilder()&#10;        &#10;        // Add the text sigle prefix&#10;        output.append(&quot;@@$docId &quot;)&#10;        &#10;        if (texts[docId] == null) {&#10;            // Lemma-only fallback when original text is not loaded&#10;            tokens[docId]?.forEach { span -&gt;&#10;                if (sentences[docId] != null &amp;&amp; (sentence_index &gt;= sentences[docId]!!.size || span.from &gt;= sentences[docId]!![sentence_index].to)) {&#10;                    if (output.isNotEmpty() &amp;&amp; !output.endsWith(&quot;@@$docId &quot;)) {&#10;                        output.append(&quot; &lt;p&gt; &quot;)&#10;                    }&#10;                    sentence_index++&#10;                }&#10;                val key = &quot;${span.from}-${span.to}&quot;&#10;                val lemmaVal = morpho[docId]?.get(key)?.lemma&#10;                output.append((lemmaVal?.takeIf { it != &quot;_&quot; } ?: &quot;_&quot;), &quot; &quot;)&#10;            }&#10;            if (output.isNotEmpty() &amp;&amp; output.endsWith(&quot; &quot;)) {&#10;                output.deleteCharAt(output.length - 1)&#10;            }&#10;            return output&#10;        }&#10;        &#10;        tokens[docId]?.forEach { span -&gt;&#10;            token_index++&#10;            if (sentences[docId] != null &amp;&amp; (sentence_index &gt;= sentences[docId]!!.size || span.from &gt;= sentences[docId]!![sentence_index].to)) {&#10;                // Replace sentence end with &lt;p&gt; tag instead of newline&#10;                if (output.isNotEmpty() &amp;&amp; !output.endsWith(&quot;@@$docId &quot;)) {&#10;                    output.append(&quot; &lt;p&gt; &quot;)&#10;                }&#10;                sentence_index++&#10;            }&#10;            // Bounds safety&#10;            val safeFrom = span.from.coerceIn(0, texts[docId]!!.length)&#10;            val safeTo = span.to.coerceIn(safeFrom, texts[docId]!!.length)&#10;            if (useLemma &amp;&amp; morpho[docId] != null) {&#10;                val key = &quot;${span.from}-${span.to}&quot;&#10;                val lemmaVal = morpho[docId]!![key]?.lemma&#10;                if (lemmaVal != null &amp;&amp; lemmaVal != &quot;_&quot;) {&#10;                    output.append(lemmaVal)&#10;                    output.append(' ')&#10;                } else {&#10;                    texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)&#10;                    output.append(' ')&#10;                }&#10;            } else {&#10;                texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)&#10;                output.append(' ')&#10;            }&#10;            real_token_index++&#10;        }&#10;        &#10;        // Remove trailing space and add final newline&#10;        if (output.isNotEmpty() &amp;&amp; output.endsWith(&quot; &quot;)) {&#10;            output.deleteCharAt(output.length - 1)&#10;        }&#10;        &#10;        return output&#10;    }&#10;&#10;&#10;    private fun printConlluToken(&#10;        token_index: Int,&#10;        token: String,&#10;        lemma: String = &quot;_&quot;,&#10;        upos: String = &quot;_&quot;,&#10;        xpos: String = &quot;_&quot;,&#10;        feats: String = &quot;_&quot;,&#10;        head: String = &quot;_&quot;,&#10;        deprel: String = &quot;_&quot;,&#10;        deps: String = &quot;_&quot;,&#10;        misc: String = &quot;_&quot;,&#10;        columns: Int = 10&#10;    ): String {&#10;        val myUpos = if (COMPATIBILITY_MODE &amp;&amp; upos == &quot;_&quot;) xpos else upos&#10;        return when (columns) {&#10;            1 -&gt; (&quot;$token\n&quot;)&#10;            10 -&gt; (&quot;$token_index\t$token\t$lemma\t$myUpos\t$xpos\t$feats\t$head\t$deprel\t$deps\t$misc$tokenSeparator&quot;)&#10;            else -&gt; {&#10;                val fields = listOf(&#10;                    token_index.toString(), token, lemma, myUpos, xpos, feats, head, deprel, deps, misc&#10;                )&#10;                fields.subList(0, min(columns, 10)).joinToString(&quot;\t&quot;, postfix = tokenSeparator)&#10;            }&#10;        }&#10;    }&#10;&#10;    private fun tokenOffsetsInSentence(&#10;        sentences: ConcurrentHashMap&lt;String, Array&lt;Span&gt;&gt;,&#10;        docId: String,&#10;        sentence_index: Int,&#10;        token_index: Int,&#10;        tokens: ConcurrentHashMap&lt;String, Array&lt;Span&gt;&gt;&#10;    ): String {&#10;        if (sentences[docId] == null || sentences[docId]!!.size &lt;= sentence_index) {&#10;            return &quot;&quot;&#10;        }&#10;        val sentenceEndOffset = sentences[docId]!![sentence_index].to&#10;        var i = token_index&#10;        val start_offsets_string = StringBuilder()&#10;        val end_offsets_string = StringBuilder()&#10;        while (tokens[docId] != null &amp;&amp; i &lt; tokens[docId]!!.size &amp;&amp; tokens[docId]!![i].to &lt;= sentenceEndOffset) {&#10;            start_offsets_string.append(&quot; &quot;, tokens[docId]!![i].from)&#10;            end_offsets_string.append(&quot; &quot;, tokens[docId]!![i].to)&#10;            i++&#10;        }&#10;        return (&#10;                StringBuilder() .append(&#10;                    &quot;# start_offsets = &quot;, tokens[docId]!![token_index].from, start_offsets_string, &quot;\n&quot;,&#10;                    &quot;# end_offsets = &quot;, sentenceEndOffset, end_offsets_string, &quot;\n&quot;&#10;                ).toString())&#10;    }&#10;&#10;    private fun extractSpans(spans: NodeList): Array&lt;Span&gt; {&#10;        val list = ArrayList&lt;Span&gt;()&#10;        IntStream.range(0, spans.length).forEach { idx -&gt;&#10;            val node = spans.item(idx)&#10;            if (node is Element) {&#10;                val fromAttr = node.getAttribute(&quot;from&quot;)&#10;                val toAttr = node.getAttribute(&quot;to&quot;)&#10;                if (fromAttr.isNullOrEmpty() || toAttr.isNullOrEmpty()) {&#10;                    LOGGER.warning(&quot;Skipping span with empty from/to attribute: from='$fromAttr' to='$toAttr'&quot;)&#10;                } else {&#10;                    try {&#10;                        val from = Integer.parseInt(fromAttr)&#10;                        val to = Integer.parseInt(toAttr)&#10;                        list.add(Span(from, to))&#10;                    } catch (e: NumberFormatException) {&#10;                        LOGGER.warning(&quot;Skipping span with invalid numeric offsets: from='$fromAttr' to='$toAttr' : ${e.message}&quot;)&#10;                    }&#10;                }&#10;            }&#10;        }&#10;        return list.toTypedArray()&#10;    }&#10;&#10;    private fun extractMorphoSpans(&#10;        fsSpans: NodeList&#10;    ): MutableMap&lt;String, MorphoSpan&gt; {&#10;        val UNKNOWN = Regex(&quot;(UNKNOWN|&lt;unknown&gt;)&quot;)&#10;        val res: MutableMap&lt;String, MorphoSpan&gt; = HashMap()&#10;        IntStream.range(0, fsSpans.length).mapToObj(fsSpans::item).filter { node -&gt; node is Element &amp;&amp; node.getAttribute(&quot;type&quot;) != &quot;alt&quot; }.forEach { node -&gt;&#10;                val features = (node as Element).getElementsByTagName(&quot;f&quot;)&#10;                val fs = MorphoSpan()&#10;                val fromTo = &quot;${node.getAttribute(&quot;from&quot;)}-${node.getAttribute(&quot;to&quot;)}&quot;&#10;                IntStream.range(0, features.length).mapToObj(features::item).forEach { feature -&gt;&#10;                        val attr = (feature as Element).getAttribute(&quot;name&quot;)&#10;                        val value = feature.textContent.trim()&#10;                        if (value.isEmpty()) return@forEach&#10;                        when (attr) {&#10;                            &quot;lemma&quot; -&gt; if(fs.lemma == &quot;_&quot;) fs.lemma = value.replace(UNKNOWN, &quot;--&quot;)&#10;                            &quot;upos&quot; -&gt; fs.upos = value&#10;                            &quot;xpos&quot;, &quot;ctag&quot;, &quot;pos&quot; -&gt; if(fs.xpos == &quot;_&quot;) fs.xpos = value.replace(UNKNOWN, &quot;--&quot;)&#10;                            &quot;feats&quot;, &quot;msd&quot; -&gt; if(fs.feats == &quot;_&quot; ) fs.feats = value&#10;                            &quot;type&quot; -&gt; if(fs.feats == &quot;_&quot;) fs.feats = feature.getElementsByTagName(&quot;symbol&quot;).item(0).attributes.getNamedItem(&quot;value&quot;).textContent.trim()&#10;                            // &quot;subtype&quot; -&gt; if(fs.feats == &quot;_&quot;) fs.feats += &quot;:&quot; + feature.getElementsByTagName(&quot;symbol&quot;).item(0).attributes.getNamedItem(&quot;value&quot;).textContent&#10;                            &quot;certainty&quot; -&gt; if(fs.misc == &quot;_&quot;) fs.misc = value&#10;                        }&#10;                    }&#10;                res[fromTo] = fs&#10;            }&#10;        return res&#10;    }&#10;&#10;    private fun extractSentenceSpans(spans: NodeList): Array&lt;Span&gt; {&#10;        return IntStream.range(0, spans.length).mapToObj(spans::item)&#10;            .filter { node -&gt; node is Element &amp;&amp; node.getElementsByTagName(&quot;f&quot;).item(0).textContent.equals(&quot;s&quot;) }&#10;            .map { node -&gt;&#10;                Span(&#10;                    Integer.parseInt((node as Element).getAttribute(&quot;from&quot;)), Integer.parseInt(node.getAttribute(&quot;to&quot;))&#10;                )&#10;            }.toArray { size -&gt; arrayOfNulls(size) }&#10;    }&#10;&#10;    /*&#10;     &lt;span id=&quot;s15&quot; from=&quot;370&quot; to=&quot;394&quot; l=&quot;5&quot;&gt;&#10;      &lt;fs type=&quot;struct&quot; xmlns=&quot;http://www.tei-c.org/ns/1.0&quot;&gt;&#10;        &lt;f name=&quot;name&quot;&gt;posting&lt;/f&gt;&#10;        &lt;f name=&quot;attr&quot;&gt;&#10;          &lt;fs type=&quot;attr&quot;&gt;&#10;            &lt;f name=&quot;id&quot;&gt;i.10894_1_3&lt;/f&gt;&#10;            &lt;f name=&quot;indentLevel&quot;&gt;0&lt;/f&gt;&#10;            &lt;f name=&quot;who&quot;&gt;WU00000000&lt;/f&gt;&#10;          &lt;/fs&gt;&#10;        &lt;/f&gt;&#10;      &lt;/fs&gt;&#10;    &lt;/span&gt;&#10;&#10;     */&#10;    private fun extractMiscSpans(spans: NodeList): MutableMap&lt;String, String&gt; {&#10;        val miscLocal: MutableMap&lt;String, String&gt; = HashMap()&#10;&#10;        IntStream.range(0, spans.length).mapToObj(spans::item)&#10;            .filter { node -&gt;&#10;                node is Element&#10;                        &amp;&amp; node.getElementsByTagName(&quot;f&quot;).length &gt; 1&#10;                        &amp;&amp; (node.getElementsByTagName(&quot;f&quot;).item(0) as Element).getAttribute(&quot;name&quot;).equals(&quot;name&quot;)&#10;                        &amp;&amp; (node.getElementsByTagName(&quot;f&quot;).item(1) as Element).getAttribute(&quot;name&quot;).equals(&quot;attr&quot;)&#10;            }&#10;            .forEach { node -&gt;&#10;                if (node == null) return@forEach&#10;                val elementName = (node as Element).getElementsByTagName(&quot;f&quot;).item(0).textContent.trim()&#10;                val from = node.getAttribute(&quot;from&quot;)&#10;                val attributes = (node.getElementsByTagName(&quot;f&quot;).item(1) as Element).getElementsByTagName(&quot;f&quot;)&#10;                val res = StringBuilder()&#10;                IntStream.range(0, attributes.length).mapToObj(attributes::item).forEach { attr -&gt;&#10;                    val attrName = &quot;$elementName/${(attr as Element).getAttribute(&quot;name&quot;)}&quot;&#10;                    if (attrName.matches(Regex(extractAttributesRegex))) {&#10;                         res.append(&quot;# $attrName = ${attr.textContent}\n&quot;)&#10;                        //LOGGER.info(&quot;&quot; + from + &quot;: $attrName = &quot; + attr.textContent)&#10;                    }&#10;&#10;                }&#10;                if (res.isNotEmpty()) {&#10;                    if (miscLocal.containsKey(from)) {&#10;                        // LOGGER.info(&quot;ADDING TO $from: ${miscLocal[from]}&quot;)&#10;                        miscLocal[from] += res.toString()&#10;                    } else {&#10;                        miscLocal[from] = res.toString()&#10;                    }&#10;                }&#10;            }&#10;        return miscLocal&#10;    }&#10;&#10;&#10;    class Span(var from: Int, var to: Int)&#10;&#10;    class MorphoSpan(&#10;        var lemma: String? = &quot;_&quot;,&#10;        var upos: String? = &quot;_&quot;,&#10;        var xpos: String? = &quot;_&quot;,&#10;        var feats: String? = &quot;_&quot;,&#10;        var head: String? = &quot;_&quot;,&#10;        var deprel: String? = &quot;_&quot;,&#10;        var deps: String? = &quot;_&quot;,&#10;        var misc: String? = &quot;_&quot;&#10;    )&#10;&#10;}&#10;&#10;fun main(args: Array&lt;String&gt;): Unit = exitProcess(CommandLine(KorapXmlTool()).execute(*args))&#10;&#10;fun debug(args: Array&lt;String&gt;): Int {&#10;    return (CommandLine(KorapXmlTool()).execute(*args))&#10;}&#10;&#10;enum class OutputFormat {&#10;    CONLLU, WORD2VEC, KORAPXML, NOW&#10;}&#10;&#10;object ConlluOutputFormat {&#10;    const val NAME = &quot;conllu&quot;&#10;}&#10;&#10;object Word2VecOutputFormat {&#10;    const val NAME = &quot;word2vec&quot;&#10;}&#10;&#10;object KorapXmlOutputFormat {&#10;    const val NAME = &quot;korapxml&quot;&#10;}&#10;&#10;object NowOutputFormat {&#10;    const val NAME = &quot;now&quot;&#10;}&#10;" />
               <option name="updatedContent" value="package de.ids_mannheim.korapxmltools&#10;&#10;import de.ids_mannheim.korapxmltools.AnnotationToolBridgeFactory.Companion.parserFoundries&#10;import de.ids_mannheim.korapxmltools.AnnotationToolBridgeFactory.Companion.taggerFoundries&#10;import org.apache.commons.compress.archivers.zip.Zip64Mode&#10;import org.apache.commons.compress.archivers.zip.ZipArchiveEntry&#10;import org.w3c.dom.Document&#10;import org.w3c.dom.Element&#10;import org.w3c.dom.NodeList&#10;import org.xml.sax.InputSource&#10;import org.xml.sax.SAXParseException&#10;import picocli.CommandLine&#10;import picocli.CommandLine.*&#10;import java.io.File&#10;import java.io.FileOutputStream&#10;import java.io.InputStream&#10;import java.io.StringWriter&#10;import java.lang.Integer.parseInt&#10;import java.util.*&#10;import java.util.concurrent.Callable&#10;import java.util.concurrent.ConcurrentHashMap&#10;import java.util.concurrent.Executors&#10;import java.util.concurrent.atomic.AtomicLong&#10;import java.util.logging.ConsoleHandler&#10;import java.util.logging.Level&#10;import java.util.logging.LogManager&#10;import java.util.logging.Logger&#10;import java.util.regex.Matcher&#10;import java.util.regex.Pattern&#10;import java.util.stream.IntStream&#10;import java.util.zip.ZipEntry&#10;&#10;import java.util.zip.ZipFile&#10;import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream&#10;import javax.xml.parsers.DocumentBuilder&#10;import javax.xml.parsers.DocumentBuilderFactory&#10;import javax.xml.transform.OutputKeys&#10;import javax.xml.transform.TransformerFactory&#10;import javax.xml.transform.dom.DOMSource&#10;import javax.xml.transform.stream.StreamResult&#10;import kotlin.math.min&#10;import kotlin.system.exitProcess&#10;&#10;val ZIP_ENTRY_UNIX_MODE = parseInt(&quot;644&quot;, 8)&#10;&#10;@Command(&#10;    name = &quot;KorapXmlTool&quot;,&#10;    mixinStandardHelpOptions = true,&#10;    version = [&quot;KorapXmlTool 2.0-beta-02&quot;],&#10;    description = [&quot;Converts KorAP-XML &lt;https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml&gt; base or &quot; +&#10;            &quot;morpho zips to (annotated) CoNLL(-U) format with all information necessary for &quot; +&#10;            &quot;reconstruction in comment lines.&quot;]&#10;)&#10;&#10;class KorapXmlTool : Callable&lt;Int&gt; {&#10;    val COMPATIBILITY_MODE = System.getenv(&quot;COMPATIBILITY_MODE&quot;) != null&#10;&#10;    @Spec lateinit var spec : Model.CommandSpec&#10;&#10;    @Parameters(arity = &quot;1..*&quot;, description = [&quot;At least one zip file name&quot;])&#10;    var zipFileNames: Array&lt;String&gt;? = null&#10;&#10;    @Option(&#10;        names = [&quot;-f&quot;, &quot;--output-format&quot;],&#10;        description = [&quot;Output format: ${ConlluOutputFormat.NAME}, ${Word2VecOutputFormat.NAME}, ${KorapXmlOutputFormat.NAME}, ${NowOutputFormat.NAME}&quot;,&#10;            &quot;conllu: CoNLL-U format&quot;,&#10;            &quot;korapxml, xml, zip: KorAP-XML format zip&quot;,&#10;            &quot;word2vec, w2v: Print text in LM training format: tokens separated by space, sentences separated by newlines&quot;,&#10;            &quot;now, NOW: NOW corpus export format: w2v-like format with &lt;p&gt; tags for sentence ends and @@&lt;text-sigle&gt; prefix&quot;,&#10;        ],&#10;        converter = [OutputFormatConverter::class]&#10;    )&#10;    var outputFormat: OutputFormat = OutputFormat.CONLLU&#10;    class OutputFormatConverter : ITypeConverter&lt;OutputFormat&gt; {&#10;        override fun convert(value: String?): OutputFormat {&#10;            return when (value?.lowercase(Locale.getDefault())) {&#10;                &quot;conllu&quot;, &quot;conll&quot; -&gt; OutputFormat.CONLLU&#10;                &quot;word2vec&quot;, &quot;w2v&quot; -&gt; OutputFormat.WORD2VEC&#10;                &quot;korapxml&quot;, &quot;korap&quot;, &quot;xml&quot;, &quot;zip&quot; -&gt; OutputFormat.KORAPXML&#10;                &quot;now&quot;, &quot;NOW&quot; -&gt; OutputFormat.NOW&#10;                else -&gt; throw IllegalArgumentException(&quot;Unknown output format: `$value'. Use one of: ${OutputFormat.entries.joinToString(&quot;, &quot;) { it.name }}&quot;)&#10;            }&#10;        }&#10;    }&#10;&#10;    @Option(&#10;        names = [&quot;--sigle-pattern&quot;, &quot;-p&quot;],&#10;        paramLabel = &quot;PATTERN&quot;,&#10;        description = [&quot;Extract only documents with sigle matching the pattern (regex)&quot;]&#10;    )&#10;    var siglePattern: String? = null&#10;&#10;    @Option(&#10;        names = [&quot;--extract-attributes-regex&quot;, &quot;-e&quot;],&#10;        paramLabel = &quot;REGEX&quot;,&#10;        description = [&quot;Extract additional attribute values from structure.xml and writes them as comment line in front of the first covered token.&quot;,&#10;            &quot;Example: -e '(posting/id|div/id)'&quot;]&#10;    )&#10;    var extractAttributesRegex: String = &quot;&quot;&#10;&#10;    @Option(&#10;        names = [&quot;--s-bounds-from-morpho&quot;], description = [&quot;Not yet implemented: s bounds from morpho&quot;]&#10;    )&#10;    var sBoundsFromMorpho: Boolean = false&#10;&#10;    @Option(&#10;        names = [&quot;--log&quot;, &quot;-l&quot;],&#10;        paramLabel = &quot;LEVEL&quot;,&#10;        description = [&quot;Log level: one of SEVERE, WARNING, INFO, FINE, FINER, FINEST. Default: ${&quot;$&quot;}{DEFAULT-VALUE}])&quot;]&#10;    )&#10;    var logLevel: String = &quot;WARNING&quot;&#10;&#10;    @Option(&#10;        names = [&quot;--columns&quot;, &quot;-c&quot;],&#10;        paramLabel = &quot;NUMBER&quot;,&#10;        description = [&quot;Number of columns. 1 means just the token. Default: ${&quot;$&quot;}{DEFAULT-VALUE}&quot;, &quot;Possible values: 1-10&quot;]&#10;    )&#10;    var columns: Int = 10&#10;&#10;    @Option(&#10;        names = [&quot;--word2vec&quot;, &quot;-w&quot;],&#10;        description = [&quot;Print text in LM training format: tokens separated by space, sentences separated by newline&quot;,&#10;            &quot;Deprecated: use -f word2vec&quot;]&#10;    )&#10;    fun setWord2Vec(word2vec: Boolean) {&#10;        if (word2vec) {&#10;            outputFormat = OutputFormat.WORD2VEC&#10;        }&#10;    }&#10;&#10;    @Option(&#10;        names = [&quot;--exclude-zip-glob&quot;],&#10;        paramLabel = &quot;GLOB&quot;,&#10;        description = [&#10;            &quot;Exclude zip files whose basename matches the glob (e.g., 'w?d24.tree_tagger.zip').&quot;,&#10;            &quot;May be repeated. Applied to basenames, not full paths.&quot;&#10;        ]&#10;    )&#10;    var excludeZipGlobs: MutableList&lt;String&gt; = mutableListOf()&#10;&#10;    @Option(&#10;        names = [&quot;--token-separator&quot;, &quot;-s&quot;],&#10;        paramLabel = &quot;STRING&quot;,&#10;        defaultValue = &quot;\n&quot;,&#10;        description = [&quot;Token separator. Default: new-line for CoNLL-U, space for word2vec format.&quot;]&#10;    )&#10;    var tokenSeparator: String = if (outputFormat == OutputFormat.WORD2VEC || outputFormat == OutputFormat.NOW) &quot; &quot; else &quot;\n&quot;&#10;&#10;    @Option(names = [&quot;--offsets&quot;], description = [&quot;Not yet implemented: offsets&quot;])&#10;    var offsets: Boolean = false&#10;&#10;    @Option(names = [&quot;--comments&quot;, &quot;-C&quot;], description = [&quot;Not yet implemented: comments&quot;])&#10;    var comments: Boolean = false&#10;&#10;    @Option(&#10;        names = [&quot;--extract-metadata-regex&quot;, &quot;-m&quot;],&#10;        paramLabel = &quot;REGEX&quot;,&#10;        description = [&quot;Extract metadata regexes.\nExample: -m '&lt;textSigle&gt;([^&lt;]+)' -m '&lt;creatDate&gt;([^&lt;]+)'&quot;]&#10;    )&#10;    var extractMetadataRegex: MutableList&lt;String&gt; = mutableListOf()&#10;&#10;    @Option(&#10;        names = [&quot;--annotate-with&quot;, &quot;-A&quot;],&#10;        paramLabel = &quot;COMMAND&quot;,&#10;        description = [&quot;Pipe output through command&quot;]&#10;    )&#10;    var annotateWith: String = &quot;&quot;&#10;&#10;    @Option(&#10;        names = [&quot;--threads&quot;, &quot;-T&quot;],&#10;        paramLabel = &quot;THREADS&quot;,&#10;        description = [&quot;Maximum number of threads to use. Default: ${&quot;$&quot;}{DEFAULT-VALUE}&quot;]&#10;    )&#10;    var maxThreads: Int = Runtime.getRuntime().availableProcessors() / 2&#10;    fun setThreads(threads: Int) {&#10;        if (threads &lt; 1) {&#10;            throw ParameterException(spec.commandLine(), String.format(&quot;Invalid value `%d' for option '--threads': must be at least 1&quot;, threads))&#10;        }&#10;        this.maxThreads = threads&#10;        System.setProperty(&quot;java.util.concurrent.ForkJoinPool.common.parallelism&quot;, threads.toString())&#10;    }&#10;&#10;    @Option(&#10;        names = [&quot;--zip-parallelism&quot;],&#10;        paramLabel = &quot;N&quot;,&#10;        description = [&quot;Maximum number of zip files to process concurrently. Defaults to --threads.&quot;]&#10;    )&#10;    var zipParallelism: Int? = null&#10;&#10;    @Option(&#10;        names = [&quot;--sequential&quot;],&#10;        description = [&#10;            &quot;Process entries inside each zip sequentially; zips processed in parallel (only for word2vec/now).&quot;&#10;        ]&#10;    )&#10;    var sequentialInZip: Boolean = false&#10;&#10;    @Option(&#10;        names = [&quot;--overwrite&quot;, &quot;-o&quot;],&#10;        description = [&quot;Overwrite existing files&quot;]&#10;    )&#10;    var overwrite: Boolean = false&#10;&#10;    @Option(&#10;        names = [&quot;--mem-stats-interval&quot;],&#10;        paramLabel = &quot;N&quot;,&#10;        description = [&quot;Log memory and cache statistics every N processed documents (0 disables; default: 0)&quot;]&#10;    )&#10;    var memStatsInterval: Int = 0&#10;&#10;    @Option(&#10;        names = [&quot;--lemma&quot;],&#10;        description = [&quot;In word2vec/now output modes, output lemmas instead of surface tokens when lemma annotations are available (requires corresponding morpho annotation XML)&quot;]&#10;    )&#10;    var useLemma: Boolean = false&#10;&#10;    @Option(&#10;        names = [&quot;--lemma-only&quot;],&#10;        description = [&#10;            &quot;Do not load texts from data.xml and output only lemmas (requires morpho.xml).&quot;,&#10;            &quot;Only valid with -f word2vec or -f now; implies --lemma.&quot;&#10;        ]&#10;    )&#10;    var lemmaOnly: Boolean = false&#10;&#10;    private var taggerName: String? = null&#10;    private var taggerModel: String? = null&#10;    @Option(&#10;        names = [&quot;--tag-with&quot;, &quot;-t&quot;],&#10;        paramLabel = &quot;TAGGER:MODEL&quot;,&#10;        description = [&quot;Specify a tagger and a model: ${taggerFoundries}:&lt;path/to/model&gt;.&quot;]&#10;    )&#10;    fun setTagWith(tagWith: String) {&#10;        val pattern: Pattern = Pattern.compile(&quot;(${taggerFoundries}):(.+)&quot;)&#10;        val matcher: Matcher = pattern.matcher(tagWith)&#10;        if (!matcher.matches()) {&#10;            throw ParameterException(spec.commandLine(),&#10;                String.format(&quot;Invalid value `%s' for option '--tag-with': &quot;+&#10;                    &quot;value does not match the expected pattern ${taggerFoundries}:&lt;path/to/model&gt;&quot;, tagWith))&#10;        } else {&#10;            taggerName = matcher.group(1)&#10;            taggerModel = matcher.group(2)&#10;            if (!File(taggerModel).exists()) {&#10;                throw ParameterException(spec.commandLine(),&#10;                    String.format(&quot;Invalid value for option '--tag-with':&quot;+&#10;                        &quot;model file '%s' does not exist&quot;, taggerModel, taggerModel))&#10;            }&#10;        }&#10;    }&#10;&#10;    private var parserName: String? = null&#10;    private var parserModel: String? = null&#10;    @Option(&#10;        names = [&quot;--parse-with&quot;, &quot;-P&quot;],&#10;        paramLabel = &quot;parser:MODEL&quot;,&#10;        description = [&quot;Specify a parser and a model: ${parserFoundries}:&lt;path/to/model&gt;.&quot;]&#10;    )&#10;    fun setParseWith(parseWith: String) {&#10;        val pattern: Pattern = Pattern.compile(&quot;(${parserFoundries}):(.+)&quot;)&#10;        val matcher: Matcher = pattern.matcher(parseWith)&#10;        if (!matcher.matches()) {&#10;            throw ParameterException(spec.commandLine(),&#10;                String.format(&quot;Invalid value `%s' for option '--parse-with': &quot;+&#10;                        &quot;value does not match the expected pattern (${parserFoundries}):&lt;path/to/model&gt;&quot;, parseWith))&#10;        } else {&#10;            parserName = matcher.group(1)&#10;            parserModel = matcher.group(2)&#10;            if (!File(parserModel).exists()) {&#10;                throw ParameterException(spec.commandLine(),&#10;                    String.format(&quot;Invalid value for option '--parse-with':&quot;+&#10;                            &quot;model file '%s' does not exist&quot;, parserModel, parserModel))&#10;            }&#10;        }&#10;    }&#10;&#10;&#10;    override fun call(): Int {&#10;        val handler = ConsoleHandler()&#10;        LogManager.getLogManager().reset()&#10;        handler.formatter = ColoredFormatter()&#10;&#10;        for (handler in LOGGER.handlers) {&#10;            LOGGER.removeHandler(handler)&#10;        }&#10;        LOGGER.addHandler(handler)&#10;        LOGGER.level = try {&#10;            Level.parse(logLevel.uppercase(Locale.getDefault()))&#10;        } catch (e: IllegalArgumentException) {&#10;            LOGGER.warning(&quot;Invalid log level: $logLevel. Defaulting to WARNING.&quot;)&#10;            Level.WARNING&#10;        }&#10;&#10;        if (lemmaOnly) {&#10;            useLemma = true&#10;            if (outputFormat != OutputFormat.WORD2VEC &amp;&amp; outputFormat != OutputFormat.NOW) {&#10;                throw ParameterException(spec.commandLine(), &quot;--lemma-only is supported only with -f word2vec or -f now&quot;)&#10;            }&#10;        }&#10;&#10;        LOGGER.info(&quot;Processing zip files: &quot; + zipFileNames!!.joinToString(&quot;, &quot;))&#10;&#10;        korapxml2conllu(zipFileNames!!)&#10;        return 0&#10;    }&#10;&#10;    private val LOGGER: Logger = Logger.getLogger(KorapXmlTool::class.java.name)&#10;&#10;    private var annotationWorkerPool : AnnotationWorkerPool? = null&#10;    // Shared executor for entry-level parallelism across all zips&#10;    private var entryExecutor: java.util.concurrent.ExecutorService? = null&#10;&#10;    val texts: ConcurrentHashMap&lt;String, NonBmpString&gt; = ConcurrentHashMap()&#10;    val sentences: ConcurrentHashMap&lt;String, Array&lt;Span&gt;&gt; = ConcurrentHashMap()&#10;    val tokens: ConcurrentHashMap&lt;String, Array&lt;Span&gt;&gt; = ConcurrentHashMap()&#10;    val morpho: ConcurrentHashMap&lt;String, MutableMap&lt;String, MorphoSpan&gt;&gt; = ConcurrentHashMap()&#10;    val fnames: ConcurrentHashMap&lt;String, String&gt; = ConcurrentHashMap()&#10;    val metadata: ConcurrentHashMap&lt;String, Array&lt;String&gt;&gt; = ConcurrentHashMap()&#10;    val extraFeatures: ConcurrentHashMap&lt;String, MutableMap&lt;String, String&gt;&gt; = ConcurrentHashMap()&#10;    private val processedDocs = java.util.concurrent.atomic.AtomicInteger(0)&#10;    var taggerToolBridges: ConcurrentHashMap&lt;Long, TaggerToolBridge?&gt; = ConcurrentHashMap()&#10;    var parserToolBridges: ConcurrentHashMap&lt;Long, ParserToolBridge?&gt; = ConcurrentHashMap()&#10;&#10;    // Zip progress tracking for logging (zipNumber/zipTotal)&#10;    private val zipOrdinals: ConcurrentHashMap&lt;String, Int&gt; = ConcurrentHashMap()&#10;    private var totalZips: Int = 0&#10;    private val zipSizes: ConcurrentHashMap&lt;String, Long&gt; = ConcurrentHashMap()&#10;    private val processedZipBytes: AtomicLong = AtomicLong(0)&#10;    private var totalZipBytes: Long = 0&#10;    private var startTimeMillis: Long = 0&#10;&#10;    var dbFactory: DocumentBuilderFactory? = null&#10;    var dBuilder: DocumentBuilder? = null&#10;    var morphoZipOutputStream: ZipArchiveOutputStream? = null&#10;&#10;    fun String.hasCorrespondingBaseZip(): Boolean {&#10;        if (!this.matches(Regex(&quot;.*\\.([^/.]+)\\.zip$&quot;))) return false&#10;        val baseZip = this.replace(Regex(&quot;\\.([^/.]+)\\.zip$&quot;), &quot;.zip&quot;)&#10;        return File(baseZip).exists()&#10;    }&#10;&#10;    fun String.correspondingBaseZip(): String? {&#10;        if (!this.matches(Regex(&quot;.*\\.([^/.]+)\\.zip$&quot;))) return null&#10;        val baseZip = this.replace(Regex(&quot;\\.([^/.]+)\\.zip$&quot;), &quot;.zip&quot;)&#10;        return if (File(baseZip).exists()) baseZip else null&#10;    }&#10;&#10;    fun korapxml2conllu(args: Array&lt;String&gt;) {&#10;        if (outputFormat == OutputFormat.KORAPXML &amp;&amp; annotateWith.isNotEmpty()) {&#10;            LOGGER.severe(&quot;Shell command annotation is not yet supported with output format $outputFormat&quot;)&#10;            exitProcess(1)&#10;        }&#10;        // Initialize shared entry executor (used inside each zip)&#10;        entryExecutor = Executors.newFixedThreadPool(maxThreads)&#10;&#10;        if (annotateWith.isNotEmpty()) {&#10;            annotationWorkerPool = AnnotationWorkerPool(annotateWith, maxThreads, LOGGER)&#10;        }&#10;&#10;        var zips: Array&lt;String&gt; = args&#10;        if (excludeZipGlobs.isNotEmpty()) {&#10;            val before = zips.size&#10;            val patterns = excludeZipGlobs.map { globToRegex(it) }&#10;            zips = zips.filter { zipPath -&gt;&#10;                val base = File(zipPath).name&#10;                patterns.none { rx -&gt; rx.matches(base) }&#10;            }.toTypedArray()&#10;            val excluded = before - zips.size&#10;            if (excluded &gt; 0) {&#10;                LOGGER.info(&quot;Excluded $excluded of $before zip(s) by glob(s): ${excludeZipGlobs.joinToString(&quot;, &quot;)}&quot;)&#10;            }&#10;        }&#10;        // Initialize zip progress tracking and sizes&#10;        startTimeMillis = System.currentTimeMillis()&#10;        processedZipBytes.set(0)&#10;        totalZips = zips.size&#10;        zipOrdinals.clear()&#10;        zipSizes.clear()&#10;        zips.forEach { zip -&gt; zipSizes[zip] = try { File(zip).length() } catch (_: Exception) { 0L } }&#10;        totalZipBytes = zipSizes.values.sum()&#10;        // In lemma-only mode, process largest zips first&#10;        if (lemmaOnly) {&#10;            zips = zips.sortedByDescending { zipSizes[it] ?: 0L }.toTypedArray()&#10;        }&#10;        zips.forEachIndexed { index, zip -&gt; zipOrdinals[zip] = index + 1 }&#10;&#10;        // Log zip order with sizes so the user can verify sorting&#10;        val totalHuman = humanBytes(totalZipBytes)&#10;        LOGGER.info(&quot;Zip processing order (${zips.size} file(s), total ${totalHuman}):&quot;)&#10;        zips.forEachIndexed { idx, zip -&gt;&#10;            val size = zipSizes[zip] ?: 0L&#10;            LOGGER.info(String.format(Locale.ROOT, &quot;%d/%d: %s (%s)&quot;, idx + 1, zips.size, zip, humanBytes(size)))&#10;        }&#10;&#10;        if (sequentialInZip) {&#10;            if (outputFormat != OutputFormat.WORD2VEC &amp;&amp; outputFormat != OutputFormat.NOW) {&#10;                throw ParameterException(spec.commandLine(), &quot;--sequential is supported only with -f word2vec or -f now&quot;)&#10;            }&#10;        }&#10;&#10;        if (maxThreads &gt; 1) {&#10;            val foundry = getFoundryFromZipFileNames(zips)&#10;            val parallelism = (zipParallelism ?: maxThreads).coerceAtLeast(1)&#10;            LOGGER.info(&quot;Processing zips with ordered queue; parallelism=$parallelism; entries ${if (sequentialInZip) &quot;sequential&quot; else &quot;parallel&quot;}&quot;)&#10;            processZipsWithQueue(zips, foundry, parallelism)&#10;        } else {&#10;            LOGGER.info(&quot;Processing zip files sequentially&quot;)&#10;            Arrays.stream(zips).forEachOrdered { zipFilePath -&gt;&#10;                processZipFileSequentially((zipFilePath ?: &quot;&quot;).toString(), getFoundryFromZipFileNames(zips))&#10;            }&#10;        }&#10;&#10;        if (annotationWorkerPool != null) {&#10;            LOGGER.info(&quot;closing worker pool&quot;)&#10;            annotationWorkerPool?.close()&#10;        }&#10;        // Shutdown entry executor&#10;        entryExecutor?.shutdown()&#10;    }&#10;&#10;    private fun processZipsWithQueue(zips: Array&lt;String&gt;, foundry: String, parallelism: Int) {&#10;        val queue: java.util.concurrent.BlockingQueue&lt;String&gt; = java.util.concurrent.LinkedBlockingQueue()&#10;        zips.forEach { queue.put(it) }&#10;        val executor = Executors.newFixedThreadPool(parallelism)&#10;        val active = java.util.concurrent.atomic.AtomicInteger(0)&#10;        repeat(parallelism) {&#10;            executor.submit {&#10;                active.incrementAndGet()&#10;                try {&#10;                    while (true) {&#10;                        val zipPath = queue.poll(100, java.util.concurrent.TimeUnit.MILLISECONDS)&#10;                        if (zipPath == null) {&#10;                            if (queue.isEmpty()) break else continue&#10;                        }&#10;                        if (sequentialInZip) {&#10;                            processZipFileSequentially(zipPath, foundry)&#10;                        } else {&#10;                            processZipFile(zipPath, foundry)&#10;                        }&#10;                    }&#10;                } finally {&#10;                    active.decrementAndGet()&#10;                }&#10;            }&#10;        }&#10;        executor.shutdown()&#10;        try {&#10;            executor.awaitTermination(7, java.util.concurrent.TimeUnit.DAYS)&#10;        } catch (ie: InterruptedException) {&#10;            Thread.currentThread().interrupt()&#10;        }&#10;    }&#10;&#10;    // Convert a shell-like glob to a Regex: '*' -&gt; &quot;.*&quot;, '?' -&gt; '.', anchored full match&#10;    private fun globToRegex(glob: String): Regex {&#10;        val sb = StringBuilder(&quot;^&quot;)&#10;        glob.forEach { ch -&gt;&#10;            when (ch) {&#10;                '*' -&gt; sb.append(&quot;.*&quot;)&#10;                '?' -&gt; sb.append('.')&#10;                '.', '(', ')', '+', '|', '^', '$', '@', '%', '{', '}', '[', ']', '\\' -&gt; sb.append('\\').append(ch)&#10;                else -&gt; sb.append(ch)&#10;            }&#10;        }&#10;        sb.append('$')&#10;        return Regex(sb.toString())&#10;    }&#10;&#10;&#10;    private fun getTokenSpansFromMorho(morpho: MutableMap&lt;String, MorphoSpan&gt;): Array&lt;Span&gt; {&#10;        return morpho.keys.map { key -&gt;&#10;            val fromTo = key.split(&quot;-&quot;)&#10;            Span(fromTo[0].toInt(), fromTo[1].toInt())&#10;        }.sortedBy {&#10;            it.from&#10;        }.toTypedArray()&#10;    }&#10;&#10;    private fun getFoundryFromZipFileName(zipFileName: String): String {&#10;        if (!zipFileName.matches(Regex(&quot;.*\\.([^/.]+)\\.zip$&quot;))) {&#10;            return &quot;base&quot;&#10;        }&#10;        return zipFileName.replace(Regex(&quot;.*\\.([^/.]+)\\.zip$&quot;), &quot;$1&quot;)&#10;    }&#10;&#10;    private fun getFoundryFromZipFileNames(zipFileNames: Array&lt;String&gt;): String {&#10;        for (zipFileName in zipFileNames) {&#10;            val foundry = getFoundryFromZipFileName(zipFileName)&#10;            if (foundry != &quot;base&quot;) {&#10;                return foundry&#10;            }&#10;        }&#10;        return &quot;base&quot;&#10;    }&#10;&#10;    private fun processZipFile(zipFilePath: String, foundry: String = &quot;base&quot;) {&#10;        val ord = zipOrdinals[zipFilePath] ?: 0&#10;        val size = zipSizes[zipFilePath] ?: 0L&#10;        LOGGER.info(&quot;Processing zip ${if (ord&gt;0) ord else &quot;?&quot;}/$totalZips: ${zipFilePath} (${humanBytes(size)}) in thread ${Thread.currentThread().threadId()}&quot;)&#10;        LOGGER.info(&quot;Foundry: $foundry $dbFactory&quot;)&#10;        if (outputFormat == OutputFormat.KORAPXML &amp;&amp; dbFactory == null) {&#10;            var targetFoundry = &quot;base&quot;&#10;            if (taggerName != null) {&#10;                val tagger = AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER) as TaggerToolBridge?&#10;                if (tagger != null) {&#10;                    targetFoundry = tagger.foundry&#10;                }&#10;            } else if (parserName != null) {&#10;                targetFoundry = parserName!!&#10;            }&#10;            dbFactory = DocumentBuilderFactory.newInstance()&#10;            dBuilder = dbFactory!!.newDocumentBuilder()&#10;            val outputMorphoZipFileName =&#10;                if (parserName != null)&#10;                    zipFilePath.replace(Regex(&quot;(\\.(opennlp|marmot|tree_tagger|corenlp|spacy))?\\.zip$&quot;), &quot;.&quot;.plus(parserName).plus(&quot;.zip&quot;))&#10;                else&#10;                    zipFilePath.replace(Regex(&quot;\\.zip$&quot;), &quot;.&quot;.plus(targetFoundry).plus(&quot;.zip&quot;))&#10;            if (File(outputMorphoZipFileName).exists() &amp;&amp; !overwrite) {&#10;                LOGGER.severe(&quot;Output file $outputMorphoZipFileName already exists. Use --overwrite to overwrite.&quot;)&#10;                exitProcess(1)&#10;            }&#10;            val fileOutputStream = FileOutputStream(outputMorphoZipFileName)&#10;            morphoZipOutputStream = ZipArchiveOutputStream(fileOutputStream).apply {&#10;                setUseZip64(Zip64Mode.Always)&#10;            }&#10;        }&#10;        if (zipFilePath.hasCorrespondingBaseZip()) {&#10;            val relatedZips = arrayOf(zipFilePath, zipFilePath.correspondingBaseZip()!!)&#10;            // Process related zips one after another to keep the ZipFile lifetime strictly bounded&#10;            relatedZips.forEach { zip -&gt;&#10;                ZipFile(zip).use { zipFile -&gt;&#10;                    processZipEntriesWithPool(zipFile, foundry, true)&#10;                }&#10;            }&#10;        } else {&#10;            ZipFile(zipFilePath).use { zipFile -&gt;&#10;                processZipEntriesWithPool(zipFile, foundry, false)&#10;            }&#10;        }&#10;        if (outputFormat == OutputFormat.KORAPXML) {&#10;            morphoZipOutputStream!!.close()&#10;        }&#10;        logZipProgress(zipFilePath)&#10;    }&#10;&#10;    private fun processZipFileSequentially(zipFilePath: String, foundry: String = &quot;base&quot;) {&#10;        val ord = zipOrdinals[zipFilePath] ?: 0&#10;        val size = zipSizes[zipFilePath] ?: 0L&#10;        LOGGER.info(&quot;Processing zip ${if (ord&gt;0) ord else &quot;?&quot;}/$totalZips: ${zipFilePath} (${humanBytes(size)}) in thread ${Thread.currentThread().threadId()}&quot;)&#10;        if (zipFilePath.hasCorrespondingBaseZip()) {&#10;            // Process the two related zips strictly sequentially to limit memory growth&#10;            val zips = arrayOf(zipFilePath, zipFilePath.correspondingBaseZip()!!)&#10;            zips.forEach { zip -&gt;&#10;                ZipFile(zip).use { zipFile -&gt;&#10;                    // Iterate entries in a deterministic order to keep related files close together&#10;                    zipFile.stream()&#10;                        .filter { extractMetadataRegex.isNotEmpty() || !it.name.contains(&quot;header.xml&quot;) }&#10;                        .sorted(Comparator.comparing&lt;ZipEntry, String&gt; { it.name })&#10;                        .forEachOrdered { zipEntry -&gt;&#10;                            processZipEntry(zipFile, foundry, zipEntry, true)&#10;                        }&#10;                }&#10;            }&#10;        } else {&#10;            ZipFile(zipFilePath).use { zipFile -&gt;&#10;                zipFile.stream()&#10;                    .filter { extractMetadataRegex.isNotEmpty() || !it.name.contains(&quot;header.xml&quot;) }&#10;                    .sorted(Comparator.comparing&lt;ZipEntry, String&gt; { it.name })&#10;                    .forEachOrdered { zipEntry -&gt;&#10;                        processZipEntry(zipFile, foundry, zipEntry, false)&#10;                    }&#10;            }&#10;        }&#10;        logZipProgress(zipFilePath)&#10;    }&#10;&#10;    private fun logZipProgress(zipFilePath: String) {&#10;        try {&#10;            val size = zipSizes[zipFilePath] ?: 0L&#10;            val done = processedZipBytes.addAndGet(size)&#10;            val total = if (totalZipBytes &gt; 0) totalZipBytes else 1L&#10;            val elapsedMs = (System.currentTimeMillis() - startTimeMillis).coerceAtLeast(1)&#10;            val speedBytesPerSec = (done * 1000.0) / elapsedMs&#10;            val remaining = (total - done).coerceAtLeast(0)&#10;            val etaSeconds = if (speedBytesPerSec &gt; 0.0) (remaining / speedBytesPerSec).toLong() else -1L&#10;            val ord = zipOrdinals[zipFilePath] ?: 0&#10;            val pct = (done * 100.0 / total).coerceIn(0.0, 100.0)&#10;            val humanSpeed = String.format(Locale.ROOT, &quot;%.2f MB/s&quot;, speedBytesPerSec / (1024.0 * 1024.0))&#10;            val etaStr = if (etaSeconds &gt;= 0) formatDuration(etaSeconds) else &quot;unknown&quot;&#10;            LOGGER.info(&#10;                &quot;Finished zip ${if (ord&gt;0) ord else &quot;?&quot;}/$totalZips: ${zipFilePath} &quot; +&#10;                        &quot;(${humanBytes(size)}). Progress: ${String.format(Locale.ROOT, &quot;%.1f&quot;, pct)}%%, &quot; +&#10;                        &quot;ETA ${etaStr} at ${humanSpeed}&quot;&#10;            )&#10;        } catch (e: Exception) {&#10;            LOGGER.fine(&quot;Failed to log zip progress for $zipFilePath: ${e.message}&quot;)&#10;        }&#10;    }&#10;&#10;    private fun humanBytes(bytes: Long): String {&#10;        if (bytes &lt; 1024) return &quot;$bytes B&quot;&#10;        val kb = bytes / 1024.0&#10;        if (kb &lt; 1024) return String.format(Locale.ROOT, &quot;%.1f KB&quot;, kb)&#10;        val mb = kb / 1024.0&#10;        if (mb &lt; 1024) return String.format(Locale.ROOT, &quot;%.1f MB&quot;, mb)&#10;        val gb = mb / 1024.0&#10;        return String.format(Locale.ROOT, &quot;%.1f GB&quot;, gb)&#10;    }&#10;&#10;    private fun formatDuration(seconds: Long): String {&#10;        var s = seconds&#10;        val h = s / 3600; s %= 3600&#10;        val m = s / 60; val sec = s % 60&#10;        return String.format(Locale.ROOT, &quot;%02d:%02d:%02d&quot;, h, m, sec)&#10;    }&#10;&#10;    private fun processZipEntriesWithPool(zipFile: ZipFile, foundry: String, waitForMorpho: Boolean) {&#10;        // Collect entries first to avoid lazy evaluation surprises, filter header.xml unless metadata extraction is requested&#10;        val entries: MutableList&lt;ZipEntry&gt; = ArrayList()&#10;        val enumEntries = zipFile.entries()&#10;        while (enumEntries.hasMoreElements()) {&#10;            val e = enumEntries.nextElement()&#10;            if (extractMetadataRegex.isEmpty() &amp;&amp; e.name.contains(&quot;header.xml&quot;)) continue&#10;            entries.add(e)&#10;        }&#10;        if (entries.isEmpty()) return&#10;&#10;        // If only one thread requested, do sequential to avoid pool overhead&#10;        if (maxThreads &lt;= 1) {&#10;            entries.forEach { entry -&gt; processZipEntry(zipFile, foundry, entry, waitForMorpho) }&#10;            return&#10;        }&#10;&#10;        // Submit all entry tasks to the shared executor and await completion before closing the zip&#10;        val latch = java.util.concurrent.CountDownLatch(entries.size)&#10;        entries.forEach { entry -&gt;&#10;            entryExecutor?.execute {&#10;                try {&#10;                    processZipEntry(zipFile, foundry, entry, waitForMorpho)&#10;                } catch (t: Throwable) {&#10;                    LOGGER.warning(&quot;Failed to process entry ${entry.name}: ${t.message}&quot;)&#10;                } finally {&#10;                    latch.countDown()&#10;                }&#10;            }&#10;        }&#10;        try {&#10;            latch.await()&#10;        } catch (ie: InterruptedException) {&#10;            Thread.currentThread().interrupt()&#10;        }&#10;    }&#10;&#10;    fun processZipEntry(zipFile: ZipFile, _foundry: String, zipEntry: ZipEntry, passedWaitForMorpho: Boolean) {&#10;        var foundry = _foundry&#10;        var waitForMorpho = passedWaitForMorpho&#10;        LOGGER.finer(&quot;Processing ${zipEntry.name} in thread ${Thread.currentThread().threadId()}&quot;)&#10;        if (taggerName != null &amp;&amp; !taggerToolBridges.containsKey(Thread.currentThread().threadId())) {&#10;            val tagger = AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER) as TaggerToolBridge?&#10;            taggerToolBridges[Thread.currentThread().threadId()] = tagger&#10;            if (tagger != null) {&#10;                foundry = tagger.foundry&#10;            }&#10;&#10;        }&#10;        if (parserName != null &amp;&amp; !parserToolBridges.containsKey(Thread.currentThread().threadId())) {&#10;            val parser = AnnotationToolBridgeFactory.getAnnotationToolBridge(parserName!!, parserModel!!, LOGGER) as ParserToolBridge?&#10;            parserToolBridges[Thread.currentThread().threadId()] = parser&#10;            if (parser != null) {&#10;                foundry = &quot;$foundry dependency:${parser.foundry}&quot;&#10;                LOGGER.fine(&quot;Initialized parser ${parserName} with foundry $foundry in thread ${Thread.currentThread().threadId()}&quot;)&#10;            }&#10;        }&#10;&#10;        try {&#10;            if (zipEntry.name.matches(Regex(&quot;.*(data|tokens|structure|morpho)\\.xml$&quot;))) {&#10;                // Ensure the entry stream and reader are closed to avoid native memory buildup&#10;                val dbFactory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance()&#10;                val dBuilder: DocumentBuilder = dbFactory.newDocumentBuilder()&#10;                // In lemma-only mode, skip parsing data.xml entirely to reduce memory pressure&#10;                if (lemmaOnly &amp;&amp; zipEntry.name.endsWith(&quot;data.xml&quot;)) {&#10;                    return&#10;                }&#10;                val doc: Document = try {&#10;                    zipFile.getInputStream(zipEntry).use { inputStream -&gt;&#10;                        XMLCommentFilterReader(inputStream, &quot;UTF-8&quot;).use { reader -&gt;&#10;                            dBuilder.parse(InputSource(reader))&#10;                        }&#10;                    }&#10;                } catch (e: SAXParseException) {&#10;                    LOGGER.warning(&quot;Error parsing file: &quot; + zipEntry.name + &quot; &quot; + e.message)&#10;                    return&#10;                }&#10;&#10;                doc.documentElement.normalize()&#10;                val docId: String = doc.documentElement.getAttribute(&quot;docid&quot;)&#10;                if (siglePattern != null &amp;&amp; !Regex(siglePattern!!).containsMatchIn(docId)) {&#10;                    return&#10;                }&#10;                // LOGGER.info(&quot;Processing file: &quot; + zipEntry.getName())&#10;                val fileName = zipEntry.name.replace(Regex(&quot;.*?/([^/]+\\.xml)$&quot;), &quot;$1&quot;)&#10;                when (fileName) {&#10;                    &quot;data.xml&quot; -&gt; {&#10;                        if (!lemmaOnly) {&#10;                            val textsList: NodeList = doc.getElementsByTagName(&quot;text&quot;)&#10;                            if (textsList.length &gt; 0) {&#10;                                texts[docId] = NonBmpString(textsList.item(0).textContent)&#10;                            }&#10;                        }&#10;                    }&#10;&#10;                    &quot;structure.xml&quot; -&gt; {&#10;                        val spans: NodeList = doc.getElementsByTagName(&quot;span&quot;)&#10;                        if (extractAttributesRegex.isNotEmpty())&#10;                            extraFeatures[docId] = extractMiscSpans(spans)&#10;                        sentences[docId] = extractSentenceSpans(spans)&#10;&#10;                    }&#10;&#10;                    &quot;tokens.xml&quot; -&gt; {&#10;                        if (!fnames.contains(docId)) {&#10;                            fnames[docId] = zipEntry.name&#10;                        }&#10;                        val tokenSpans: NodeList = doc.getElementsByTagName(&quot;span&quot;)&#10;                        tokens[docId] = extractSpans(tokenSpans)&#10;                    }&#10;&#10;                    &quot;morpho.xml&quot; -&gt; {&#10;                        waitForMorpho = true&#10;                        fnames[docId] = zipEntry.name&#10;                        val fsSpans: NodeList = doc.getElementsByTagName(&quot;span&quot;)&#10;                        morpho[docId] = extractMorphoSpans(fsSpans)&#10;                        tokens[docId] = extractSpans(fsSpans)&#10;                    }&#10;                }&#10;&#10;                val morphoRequired = waitForMorpho || useLemma || taggerName != null || parserName != null || outputFormat == OutputFormat.KORAPXML&#10;                // For lemma-only/lemma-based word2vec/now, we can proceed without full text&#10;                val textRequired = when (outputFormat) {&#10;                    OutputFormat.WORD2VEC, OutputFormat.NOW -&gt; !(useLemma || lemmaOnly)&#10;                    else -&gt; true&#10;                }&#10;                if ((texts[docId] != null || !textRequired) &amp;&amp; sentences[docId] != null &amp;&amp; tokens[docId] != null&#10;                    &amp;&amp; (!morphoRequired || morpho[docId] != null)&#10;                    &amp;&amp; (extractMetadataRegex.isEmpty() || metadata[docId] != null)&#10;                ) {&#10;                    // Be quiet on INFO; per-text logs only on FINE and below&#10;                    LOGGER.fine(&quot;Processing text: $docId in thread ${Thread.currentThread().threadId()}&quot;)&#10;                    processText(docId, foundry)&#10;                }&#10;            } else if (extractMetadataRegex.isNotEmpty() &amp;&amp; zipEntry.name.matches(Regex(&quot;.*/header\\.xml$&quot;))) {&#10;                //LOGGER.info(&quot;Processing header file: &quot; + zipEntry.name)&#10;                val text = zipFile.getInputStream(zipEntry).bufferedReader().use { it.readText() }&#10;                val docId =&#10;                    Regex(&quot;&lt;textSigle&gt;([^&lt;]+)&lt;/textSigle&gt;&quot;).find(text)?.destructured?.component1()&#10;                        ?.replace(Regex(&quot;/&quot;), &quot;_&quot;)&#10;                LOGGER.fine(&quot;Processing header file: &quot; + zipEntry.name + &quot; docId: &quot; + docId)&#10;                val meta = ArrayList&lt;String&gt;()&#10;                extractMetadataRegex.forEach { regex -&gt;&#10;                    val match = Regex(regex).find(text)&#10;                    if (match != null) {&#10;                        meta.add(match.destructured.component1())&#10;                    }&#10;                }&#10;                if (meta.isNotEmpty() &amp;&amp; docId != null) {&#10;                    metadata[docId] = meta.toTypedArray()&#10;                    val morphoRequired = waitForMorpho || useLemma || taggerName != null || parserName != null || outputFormat == OutputFormat.KORAPXML&#10;                    val textRequired = when (outputFormat) {&#10;                        OutputFormat.WORD2VEC, OutputFormat.NOW -&gt; !(useLemma || lemmaOnly)&#10;                        else -&gt; true&#10;                    }&#10;                    if ((texts[docId] != null || !textRequired) &amp;&amp; sentences[docId] != null &amp;&amp; tokens[docId] != null&#10;                         &amp;&amp; (!morphoRequired || morpho[docId] != null)&#10;                     ) {&#10;                        // Be quiet on INFO; per-text logs only on FINE and below&#10;                        LOGGER.fine(&quot;Processing text (meta-ready): $docId in thread ${Thread.currentThread().threadId()}&quot;)&#10;                        processText(docId, foundry)&#10;                    }&#10;                }&#10;            }&#10;        } catch (e: Exception) {&#10;            e.printStackTrace()&#10;        }&#10;    }&#10;&#10;    private fun processText(&#10;        docId: String,&#10;        foundry: String,&#10;    ) {&#10;        LOGGER.fine(&quot;Processing text: $docId in thread ${Thread.currentThread().threadId()}&quot;)&#10;        var morphoFoundry = getMorphoFoundry()&#10;        val output =&#10;        if (outputFormat == OutputFormat.WORD2VEC) {&#10;            lmTrainingOutput(docId)&#10;        } else if (outputFormat == OutputFormat.NOW) {&#10;            nowOutput(docId)&#10;        } else {&#10;            if (taggerToolBridges[Thread.currentThread().threadId()] != null) {&#10;                morpho[docId] = taggerToolBridges[Thread.currentThread().threadId()]!!.tagText(&#10;                    tokens[docId]!!,&#10;                    sentences[docId],&#10;                    texts[docId]!!&#10;                )&#10;&#10;            }&#10;            if (parserToolBridges[Thread.currentThread().threadId()] != null) {&#10;                if (morpho[docId] == null) {&#10;                    LOGGER.severe(&quot;No morpho data for $docId&quot;)&#10;                    //exitProcess(1)&#10;                }&#10;                LOGGER.finer(&quot;Parsing text: $docId in thread ${Thread.currentThread().threadId()}&quot;)&#10;                morpho[docId] = parserToolBridges[Thread.currentThread().threadId()]!!.parseText(&#10;                    tokens[docId]!!,&#10;                    morpho[docId],&#10;                    sentences[docId],&#10;                    texts[docId]!!&#10;                )&#10;                LOGGER.finer(&quot;Parsed text: $docId in thread ${Thread.currentThread().threadId()}&quot;)&#10;            }&#10;            if (outputFormat == OutputFormat.KORAPXML &amp;&amp; annotationWorkerPool == null) {&#10;                korapXmlOutput(getMorphoFoundry(), docId)&#10;            } else {&#10;                conlluOutput(foundry, docId)&#10;            }&#10;        }&#10;&#10;        if (annotationWorkerPool != null) {&#10;            annotationWorkerPool?.pushToQueue(output.append(&quot;\n# eot\n&quot;).toString())&#10;            // Release internal char[] early&#10;            output.setLength(0)&#10;        } else if (outputFormat != OutputFormat.KORAPXML) {&#10;            synchronized(System.out) {&#10;                println(output.toString())&#10;            }&#10;            // Release internal char[] early&#10;            output.setLength(0)&#10;        } else {&#10;            korapXmlOutput(foundry, docId)&#10;        }&#10;&#10;&#10;        arrayOf(tokens, texts, sentences, morpho, fnames, metadata, extraFeatures).forEach { map -&gt;&#10;            if (map === morpho) {&#10;                // Clear inner map to release references early&#10;                morpho[docId]?.clear()&#10;            }&#10;            map.remove(docId)&#10;        }&#10;&#10;        // Periodic GC hint after processing many docs (lightweight safeguard)&#10;        if ((processedDocs.incrementAndGet() % 2000) == 0) {&#10;            LOGGER.fine(&quot;Processed ${processedDocs.get()} docs – requesting GC hint&quot;)&#10;            System.gc()&#10;        }&#10;        // Memory / cache statistics logging&#10;        if (memStatsInterval &gt; 0) {&#10;            val count = processedDocs.get()&#10;            if (count % memStatsInterval == 0) {&#10;                logMemoryStats(count)&#10;            }&#10;        }&#10;&#10;        if (outputFormat == OutputFormat.KORAPXML) {&#10;            val entryPath = if (parserName != null)  docId.replace(Regex(&quot;[_.]&quot;), &quot;/&quot;).plus(&quot;/$parserName/&quot;).plus(&quot;dependency.xml&quot;)&#10;            else&#10;                docId.replace(Regex(&quot;[_.]&quot;), &quot;/&quot;).plus(&quot;/$morphoFoundry/&quot;).plus(&quot;morpho.xml&quot;)&#10;            val zipEntry = ZipArchiveEntry(entryPath)&#10;            zipEntry.unixMode = ZIP_ENTRY_UNIX_MODE&#10;            synchronized(morphoZipOutputStream!!) {&#10;                morphoZipOutputStream!!.putArchiveEntry(zipEntry)&#10;                morphoZipOutputStream!!.write(output.toString().toByteArray())&#10;                morphoZipOutputStream!!.closeArchiveEntry()&#10;            }&#10;            output.clear()&#10;        }&#10;    }&#10;&#10;    private fun getMorphoFoundry() = taggerToolBridges[Thread.currentThread().threadId()]?.foundry ?: &quot;base&quot;&#10;&#10;    private fun logMemoryStats(count: Int) {&#10;        try {&#10;            val rt = Runtime.getRuntime()&#10;            val used = (rt.totalMemory() - rt.freeMemory()) / (1024 * 1024)&#10;            val total = rt.totalMemory() / (1024 * 1024)&#10;            val max = rt.maxMemory() / (1024 * 1024)&#10;            LOGGER.info(&#10;                &quot;MEM-STATS docs=${count} usedMB=${used} totalMB=${total} maxMB=${max} &quot; +&#10;                        &quot;maps{texts=${texts.size},tokens=${tokens.size},sentences=${sentences.size},morpho=${morpho.size}}&quot;&#10;            )&#10;        } catch (e: Exception) {&#10;            LOGGER.warning(&quot;Failed to log memory stats: ${e.message}&quot;)&#10;        }&#10;    }&#10;&#10;    private fun korapXmlDependencyOutput(foundry: String, docId: String): StringBuilder {&#10;        val doc: Document = dBuilder!!.newDocument()&#10;&#10;        // Root element&#10;        val layer = doc.createElement(&quot;layer&quot;)&#10;        layer.setAttribute(&quot;xmlns&quot;, &quot;http://ids-mannheim.de/ns/KorAP&quot;)&#10;        layer.setAttribute(&quot;version&quot;, &quot;KorAP-0.4&quot;)&#10;        layer.setAttribute(&quot;docid&quot;, docId)&#10;        doc.appendChild(layer)&#10;&#10;        val spanList = doc.createElement(&quot;spanList&quot;)&#10;        layer.appendChild(spanList)&#10;&#10;        var i = 0&#10;        var s = 0&#10;        var n = 0&#10;        val sortedKeys = morpho[docId]?.keys?.sortedBy { it.split(&quot;-&quot;)[0].toInt() }&#10;&#10;        sortedKeys?.forEach { spanString -&gt;&#10;            val mfs = morpho[docId]?.get(spanString)&#10;            val offsets = spanString.split(&quot;-&quot;)&#10;            if(offsets.size != 2) {&#10;                LOGGER.warning(&quot;Invalid span: $spanString in $docId&quot;)&#10;                return@forEach&#10;            }&#10;            if (offsets[0].toInt() &gt; sentences[docId]!!.elementAt(s).to) {&#10;                s++&#10;                n = i&#10;            }&#10;            i++&#10;            if (mfs!!.deprel == &quot;_&quot;) {&#10;                return@forEach&#10;            }&#10;&#10;            val spanNode = doc.createElement(&quot;span&quot;)&#10;            spanNode.setAttribute(&quot;id&quot;, &quot;s${s + 1}_n${i - n}&quot;)&#10;            spanNode.setAttribute(&quot;from&quot;, offsets[0])&#10;            spanNode.setAttribute(&quot;to&quot;, offsets[1])&#10;&#10;            // rel element&#10;            val rel = doc.createElement(&quot;rel&quot;)&#10;            rel.setAttribute(&quot;label&quot;, mfs.deprel)&#10;&#10;            // inner span element&#10;            val innerSpan = doc.createElement(&quot;span&quot;)&#10;            val headInt = if(mfs.head == &quot;_&quot;) 0 else parseInt(mfs.head) - 1&#10;            if (headInt &lt; 0) {&#10;                innerSpan.setAttribute(&quot;from&quot;, sentences[docId]!!.elementAt(s).from.toString())&#10;                innerSpan.setAttribute(&quot;to&quot;,  sentences[docId]!!.elementAt(s).to.toString())&#10;            } else {&#10;                if (headInt + n &gt;= morpho[docId]!!.size) {&#10;                    LOGGER.warning(&quot;Head index out of bounds: ${headInt+n} &gt;= ${morpho[docId]!!.size} in $docId&quot;)&#10;                    return@forEach&#10;                } else {&#10;                    val destSpanString = sortedKeys.elementAt(headInt + n)&#10;                    val destOffsets = destSpanString.split(&quot;-&quot;)&#10;                    innerSpan.setAttribute(&quot;from&quot;, destOffsets[0])&#10;                    innerSpan.setAttribute(&quot;to&quot;, destOffsets[1])&#10;                }&#10;            }&#10;            rel.appendChild(innerSpan)&#10;            spanNode.appendChild(rel)&#10;            spanList.appendChild(spanNode)&#10;        }&#10;        val transformerFactory = TransformerFactory.newInstance()&#10;        val transformer = transformerFactory.newTransformer()&#10;        transformer.setOutputProperty(OutputKeys.INDENT, &quot;yes&quot;)&#10;        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, &quot;no&quot;)&#10;        transformer.setOutputProperty(&quot;{http://xml.apache.org/xslt}indent-amount&quot;, &quot;1&quot;)&#10;        val domSource = DOMSource(doc)&#10;        val streamResult = StreamResult(StringWriter())&#10;        transformer.transform(domSource, streamResult)&#10;&#10;        return StringBuilder(streamResult.writer.toString())&#10;    }&#10;&#10;    private fun korapXmlOutput(foundry: String, docId: String): StringBuilder {&#10;        return if (parserName != null) {&#10;            korapXmlDependencyOutput(foundry, docId)&#10;        } else {&#10;            korapXmlMorphoOutput(foundry, docId)&#10;        }&#10;    }&#10;&#10;    private fun korapXmlMorphoOutput(foundry: String, docId: String): StringBuilder {&#10;            val doc: Document = dBuilder!!.newDocument()&#10;&#10;        // Root element&#10;        val layer = doc.createElement(&quot;layer&quot;)&#10;        layer.setAttribute(&quot;xmlns&quot;, &quot;http://ids-mannheim.de/ns/KorAP&quot;)&#10;        layer.setAttribute(&quot;version&quot;, &quot;KorAP-0.4&quot;)&#10;        layer.setAttribute(&quot;docid&quot;, docId)&#10;        doc.appendChild(layer)&#10;&#10;        val spanList = doc.createElement(&quot;spanList&quot;)&#10;        layer.appendChild(spanList)&#10;&#10;        var i = 0&#10;        morpho[docId]?.forEach { (spanString, mfs) -&gt;&#10;            i++&#10;            val offsets = spanString.split(&quot;-&quot;)&#10;            val spanNode = doc.createElement(&quot;span&quot;)&#10;            spanNode.setAttribute(&quot;id&quot;, &quot;t_$i&quot;)&#10;            spanNode.setAttribute(&quot;from&quot;, offsets[0])&#10;            spanNode.setAttribute(&quot;to&quot;, offsets[1])&#10;&#10;            // fs element&#10;            val fs = doc.createElement(&quot;fs&quot;)&#10;            fs.setAttribute(&quot;type&quot;, &quot;lex&quot;)&#10;            fs.setAttribute(&quot;xmlns&quot;, &quot;http://www.tei-c.org/ns/1.0&quot;)&#10;            spanNode.appendChild(fs)&#10;            val f = doc.createElement(&quot;f&quot;)&#10;            f.setAttribute(&quot;name&quot;, &quot;lex&quot;)&#10;            fs.appendChild(f)&#10;&#10;            // Inner fs element&#10;            val innerFs = doc.createElement(&quot;fs&quot;)&#10;            f.appendChild(innerFs)&#10;&#10;            if (mfs.lemma != &quot;_&quot;) {&#10;                val innerF = doc.createElement(&quot;f&quot;)&#10;                innerF.setAttribute(&quot;name&quot;, &quot;lemma&quot;)&#10;                innerF.textContent = mfs.lemma&#10;                innerFs.appendChild(innerF)&#10;            }&#10;            if (mfs.upos != &quot;_&quot;) {&#10;                val innerF = doc.createElement(&quot;f&quot;)&#10;                innerF.setAttribute(&quot;name&quot;, &quot;upos&quot;)&#10;                innerF.textContent = mfs.upos&#10;                innerFs.appendChild(innerF)&#10;            }&#10;            if (mfs.xpos != &quot;_&quot;) {&#10;                val innerF = doc.createElement(&quot;f&quot;)&#10;                innerF.setAttribute(&quot;name&quot;, &quot;pos&quot;)&#10;                innerF.textContent = mfs.xpos&#10;                innerFs.appendChild(innerF)&#10;            }&#10;            if (mfs.feats != &quot;_&quot;) {&#10;                val innerF = doc.createElement(&quot;f&quot;)&#10;                innerF.setAttribute(&quot;name&quot;, &quot;msd&quot;)&#10;                innerF.textContent = mfs.feats&#10;                innerFs.appendChild(innerF)&#10;            }&#10;            if (mfs.misc != &quot;_&quot; &amp;&amp; mfs.misc!!.matches(Regex(&quot;^[0-9.]+$&quot;))) {&#10;                val innerF = doc.createElement(&quot;f&quot;)&#10;                innerF.setAttribute(&quot;name&quot;, &quot;certainty&quot;)&#10;                innerF.textContent = mfs.misc&#10;                innerFs.appendChild(innerF)&#10;            }&#10;&#10;            spanList.appendChild(spanNode)&#10;        }&#10;        val transformerFactory = TransformerFactory.newInstance()&#10;        val transformer = transformerFactory.newTransformer()&#10;        transformer.setOutputProperty(OutputKeys.INDENT, &quot;yes&quot;)&#10;        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, &quot;no&quot;)&#10;        transformer.setOutputProperty(&quot;{http://xml.apache.org/xslt}indent-amount&quot;, &quot;1&quot;)&#10;        val domSource = DOMSource(doc)&#10;        val streamResult = StreamResult(StringWriter())&#10;        transformer.transform(domSource, streamResult)&#10;&#10;        return StringBuilder(streamResult.writer.toString())&#10;&#10;    }&#10;&#10;    private fun conlluOutput(foundry: String, docId: String): StringBuilder {&#10;        var token_index = 0&#10;        var real_token_index = 0&#10;        var sentence_index = 0&#10;        val output: StringBuilder&#10;        output =&#10;            StringBuilder(&quot;# foundry = $foundry\n# filename = ${fnames[docId]}\n# text_id = $docId\n&quot;).append(&#10;                tokenOffsetsInSentence(&#10;                    sentences, docId, sentence_index, real_token_index, tokens&#10;                )&#10;            )&#10;        if (extractMetadataRegex.isNotEmpty()) {&#10;            output.append(metadata[docId]?.joinToString(&quot;\t&quot;, prefix = &quot;# metadata=&quot;, postfix = &quot;\n&quot;) ?: &quot;&quot;)&#10;        }&#10;        var previousSpanStart = 0&#10;        tokens[docId]?.forEach { span -&gt;&#10;            token_index++&#10;            if (sentence_index &gt;= sentences[docId]!!.size || span.from &gt;= sentences[docId]!![sentence_index].to) {&#10;                output.append(&quot;\n&quot;)&#10;                sentence_index++&#10;                token_index = 1&#10;                output.append(&#10;                    tokenOffsetsInSentence(&#10;                        sentences, docId, sentence_index, real_token_index, tokens&#10;                    )&#10;                )&#10;            }&#10;            if (extractAttributesRegex.isNotEmpty() &amp;&amp; extraFeatures[docId] != null) {&#10;                for (i in previousSpanStart until span.from + 1) {&#10;                    if (extraFeatures[docId]?.containsKey(&quot;$i&quot;) == true) {&#10;                        output.append(extraFeatures[docId]!![&quot;$i&quot;])&#10;                        extraFeatures[docId]!!.remove(&quot;$i&quot;)&#10;                    }&#10;                }&#10;                previousSpanStart = span.from + 1&#10;            }&#10;            if (morpho[docId]?.containsKey(&quot;${span.from}-${span.to}&quot;) == true) {&#10;                val mfs = morpho[docId]!![&quot;${span.from}-${span.to}&quot;]&#10;                if (span.to &gt; texts[docId]!!.length) {&#10;                    span.to = texts[docId]!!.length&#10;                    LOGGER.warning(&#10;                        &quot;Offset error: could not retrieve token at ${span.from}-${span.to} – ending with: ${&#10;                            texts[docId]!!.substring(&#10;                                span.from,&#10;                                span.to&#10;                            )&#10;                        }&quot;&#10;                    )&#10;                }&#10;                output.append(&#10;                    printConlluToken(&#10;                        token_index,&#10;                        texts[docId]!!.substring(span.from, span.to),&#10;                        mfs!!.lemma!!,&#10;                        mfs.upos!!,&#10;                        mfs.xpos!!,&#10;                        mfs.feats!!,&#10;                        mfs.head!!,&#10;                        mfs.deprel!!,&#10;                        mfs.deps!!,&#10;                        mfs.misc!!,&#10;                        columns&#10;                    )&#10;                )&#10;            } else {&#10;                output.append(&#10;                    printConlluToken(&#10;                        token_index, texts[docId]!!.substring(span.from, span.to), columns = columns&#10;                    )&#10;                )&#10;            }&#10;            real_token_index++&#10;        }&#10;        return output&#10;    }&#10;&#10;    private fun lmTrainingOutput(docId: String): StringBuilder {&#10;        var token_index = 0&#10;        var real_token_index = 0&#10;        var sentence_index = 0&#10;        val output: StringBuilder&#10;        output = StringBuilder()&#10;        if (extractMetadataRegex.isNotEmpty()) {&#10;            output.append(metadata[docId]?.joinToString(&quot;\t&quot;, postfix = &quot;\t&quot;) ?: &quot;&quot;)&#10;        }&#10;        // If no text is available (e.g., lemma-only mode), emit lemmas&#10;        if (texts[docId] == null) {&#10;            tokens[docId]?.forEach { span -&gt;&#10;                val key = &quot;${span.from}-${span.to}&quot;&#10;                val lemmaVal = morpho[docId]?.get(key)?.lemma&#10;                output.append((lemmaVal?.takeIf { it != &quot;_&quot; } ?: &quot;_&quot;), &quot; &quot;)&#10;            }&#10;            if (output.isNotEmpty()) output.deleteCharAt(output.length - 1)&#10;            return output&#10;        }&#10;        tokens[docId]?.forEach { span -&gt;&#10;            token_index++&#10;            if (sentences[docId] != null &amp;&amp; (sentence_index &gt;= sentences[docId]!!.size || span.from &gt;= sentences[docId]!![sentence_index].to)) {&#10;                if (output.isNotEmpty()) {&#10;                    output.setCharAt(output.length - 1, '\n')&#10;                } else {&#10;                    output.append(&quot;\n&quot;)&#10;                }&#10;                if (extractMetadataRegex.isNotEmpty() &amp;&amp; real_token_index &lt; tokens[docId]!!.size - 1) {&#10;                    output.append(metadata[docId]?.joinToString(&quot;\t&quot;, postfix = &quot;\t&quot;) ?: &quot;&quot;)&#10;                }&#10;                sentence_index++&#10;            }&#10;            // Bounds safety&#10;            val safeFrom = span.from.coerceIn(0, texts[docId]!!.length)&#10;            val safeTo = span.to.coerceIn(safeFrom, texts[docId]!!.length)&#10;            if (useLemma &amp;&amp; morpho[docId] != null) {&#10;                val key = &quot;${span.from}-${span.to}&quot;&#10;                val lemmaVal = morpho[docId]!![key]?.lemma&#10;                if (lemmaVal != null &amp;&amp; lemmaVal != &quot;_&quot;) {&#10;                    output.append(lemmaVal)&#10;                    output.append(' ')&#10;                } else {&#10;                    texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)&#10;                    output.append(' ')&#10;                }&#10;            } else {&#10;                texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)&#10;                output.append(' ')&#10;            }&#10;            real_token_index++&#10;        }&#10;        if (output.isNotEmpty()) {&#10;            output.deleteCharAt(output.length - 1)&#10;        }&#10;        return output&#10;    }&#10;&#10;    private fun nowOutput(docId: String): StringBuilder {&#10;        var token_index = 0&#10;        var real_token_index = 0&#10;        var sentence_index = 0&#10;        val output: StringBuilder = StringBuilder()&#10;        &#10;        // Add the text sigle prefix&#10;        output.append(&quot;@@$docId &quot;)&#10;        &#10;        if (texts[docId] == null) {&#10;            // Lemma-only fallback when original text is not loaded&#10;            tokens[docId]?.forEach { span -&gt;&#10;                if (sentences[docId] != null &amp;&amp; (sentence_index &gt;= sentences[docId]!!.size || span.from &gt;= sentences[docId]!![sentence_index].to)) {&#10;                    if (output.isNotEmpty() &amp;&amp; !output.endsWith(&quot;@@$docId &quot;)) {&#10;                        output.append(&quot; &lt;p&gt; &quot;)&#10;                    }&#10;                    sentence_index++&#10;                }&#10;                val key = &quot;${span.from}-${span.to}&quot;&#10;                val lemmaVal = morpho[docId]?.get(key)?.lemma&#10;                output.append((lemmaVal?.takeIf { it != &quot;_&quot; } ?: &quot;_&quot;), &quot; &quot;)&#10;            }&#10;            if (output.isNotEmpty() &amp;&amp; output.endsWith(&quot; &quot;)) {&#10;                output.deleteCharAt(output.length - 1)&#10;            }&#10;            return output&#10;        }&#10;        &#10;        tokens[docId]?.forEach { span -&gt;&#10;            token_index++&#10;            if (sentences[docId] != null &amp;&amp; (sentence_index &gt;= sentences[docId]!!.size || span.from &gt;= sentences[docId]!![sentence_index].to)) {&#10;                // Replace sentence end with &lt;p&gt; tag instead of newline&#10;                if (output.isNotEmpty() &amp;&amp; !output.endsWith(&quot;@@$docId &quot;)) {&#10;                    output.append(&quot; &lt;p&gt; &quot;)&#10;                }&#10;                sentence_index++&#10;            }&#10;            // Bounds safety&#10;            val safeFrom = span.from.coerceIn(0, texts[docId]!!.length)&#10;            val safeTo = span.to.coerceIn(safeFrom, texts[docId]!!.length)&#10;            if (useLemma &amp;&amp; morpho[docId] != null) {&#10;                val key = &quot;${span.from}-${span.to}&quot;&#10;                val lemmaVal = morpho[docId]!![key]?.lemma&#10;                if (lemmaVal != null &amp;&amp; lemmaVal != &quot;_&quot;) {&#10;                    output.append(lemmaVal)&#10;                    output.append(' ')&#10;                } else {&#10;                    texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)&#10;                    output.append(' ')&#10;                }&#10;            } else {&#10;                texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)&#10;                output.append(' ')&#10;            }&#10;            real_token_index++&#10;        }&#10;        &#10;        // Remove trailing space and add final newline&#10;        if (output.isNotEmpty() &amp;&amp; output.endsWith(&quot; &quot;)) {&#10;            output.deleteCharAt(output.length - 1)&#10;        }&#10;        &#10;        return output&#10;    }&#10;&#10;&#10;    private fun printConlluToken(&#10;        token_index: Int,&#10;        token: String,&#10;        lemma: String = &quot;_&quot;,&#10;        upos: String = &quot;_&quot;,&#10;        xpos: String = &quot;_&quot;,&#10;        feats: String = &quot;_&quot;,&#10;        head: String = &quot;_&quot;,&#10;        deprel: String = &quot;_&quot;,&#10;        deps: String = &quot;_&quot;,&#10;        misc: String = &quot;_&quot;,&#10;        columns: Int = 10&#10;    ): String {&#10;        val myUpos = if (COMPATIBILITY_MODE &amp;&amp; upos == &quot;_&quot;) xpos else upos&#10;        return when (columns) {&#10;            1 -&gt; (&quot;$token\n&quot;)&#10;            10 -&gt; (&quot;$token_index\t$token\t$lemma\t$myUpos\t$xpos\t$feats\t$head\t$deprel\t$deps\t$misc$tokenSeparator&quot;)&#10;            else -&gt; {&#10;                val fields = listOf(&#10;                    token_index.toString(), token, lemma, myUpos, xpos, feats, head, deprel, deps, misc&#10;                )&#10;                fields.subList(0, min(columns, 10)).joinToString(&quot;\t&quot;, postfix = tokenSeparator)&#10;            }&#10;        }&#10;    }&#10;&#10;    private fun tokenOffsetsInSentence(&#10;        sentences: ConcurrentHashMap&lt;String, Array&lt;Span&gt;&gt;,&#10;        docId: String,&#10;        sentence_index: Int,&#10;        token_index: Int,&#10;        tokens: ConcurrentHashMap&lt;String, Array&lt;Span&gt;&gt;&#10;    ): String {&#10;        if (sentences[docId] == null || sentences[docId]!!.size &lt;= sentence_index) {&#10;            return &quot;&quot;&#10;        }&#10;        val sentenceEndOffset = sentences[docId]!![sentence_index].to&#10;        var i = token_index&#10;        val start_offsets_string = StringBuilder()&#10;        val end_offsets_string = StringBuilder()&#10;        while (tokens[docId] != null &amp;&amp; i &lt; tokens[docId]!!.size &amp;&amp; tokens[docId]!![i].to &lt;= sentenceEndOffset) {&#10;            start_offsets_string.append(&quot; &quot;, tokens[docId]!![i].from)&#10;            end_offsets_string.append(&quot; &quot;, tokens[docId]!![i].to)&#10;            i++&#10;        }&#10;        return (&#10;                StringBuilder() .append(&#10;                    &quot;# start_offsets = &quot;, tokens[docId]!![token_index].from, start_offsets_string, &quot;\n&quot;,&#10;                    &quot;# end_offsets = &quot;, sentenceEndOffset, end_offsets_string, &quot;\n&quot;&#10;                ).toString())&#10;    }&#10;&#10;    private fun extractSpans(spans: NodeList): Array&lt;Span&gt; {&#10;        val list = ArrayList&lt;Span&gt;()&#10;        IntStream.range(0, spans.length).forEach { idx -&gt;&#10;            val node = spans.item(idx)&#10;            if (node is Element) {&#10;                val fromAttr = node.getAttribute(&quot;from&quot;)&#10;                val toAttr = node.getAttribute(&quot;to&quot;)&#10;                if (fromAttr.isNullOrEmpty() || toAttr.isNullOrEmpty()) {&#10;                    LOGGER.warning(&quot;Skipping span with empty from/to attribute: from='$fromAttr' to='$toAttr'&quot;)&#10;                } else {&#10;                    try {&#10;                        val from = Integer.parseInt(fromAttr)&#10;                        val to = Integer.parseInt(toAttr)&#10;                        list.add(Span(from, to))&#10;                    } catch (e: NumberFormatException) {&#10;                        LOGGER.warning(&quot;Skipping span with invalid numeric offsets: from='$fromAttr' to='$toAttr' : ${e.message}&quot;)&#10;                    }&#10;                }&#10;            }&#10;        }&#10;        return list.toTypedArray()&#10;    }&#10;&#10;    private fun extractMorphoSpans(&#10;        fsSpans: NodeList&#10;    ): MutableMap&lt;String, MorphoSpan&gt; {&#10;        val UNKNOWN = Regex(&quot;(UNKNOWN|&lt;unknown&gt;)&quot;)&#10;        val res: MutableMap&lt;String, MorphoSpan&gt; = HashMap()&#10;        IntStream.range(0, fsSpans.length).mapToObj(fsSpans::item).filter { node -&gt; node is Element &amp;&amp; node.getAttribute(&quot;type&quot;) != &quot;alt&quot; }.forEach { node -&gt;&#10;                val features = (node as Element).getElementsByTagName(&quot;f&quot;)&#10;                val fs = MorphoSpan()&#10;                val fromTo = &quot;${node.getAttribute(&quot;from&quot;)}-${node.getAttribute(&quot;to&quot;)}&quot;&#10;                IntStream.range(0, features.length).mapToObj(features::item).forEach { feature -&gt;&#10;                        val attr = (feature as Element).getAttribute(&quot;name&quot;)&#10;                        val value = feature.textContent.trim()&#10;                        if (value.isEmpty()) return@forEach&#10;                        when (attr) {&#10;                            &quot;lemma&quot; -&gt; if(fs.lemma == &quot;_&quot;) fs.lemma = value.replace(UNKNOWN, &quot;--&quot;)&#10;                            &quot;upos&quot; -&gt; fs.upos = value&#10;                            &quot;xpos&quot;, &quot;ctag&quot;, &quot;pos&quot; -&gt; if(fs.xpos == &quot;_&quot;) fs.xpos = value.replace(UNKNOWN, &quot;--&quot;)&#10;                            &quot;feats&quot;, &quot;msd&quot; -&gt; if(fs.feats == &quot;_&quot; ) fs.feats = value&#10;                            &quot;type&quot; -&gt; if(fs.feats == &quot;_&quot;) fs.feats = feature.getElementsByTagName(&quot;symbol&quot;).item(0).attributes.getNamedItem(&quot;value&quot;).textContent.trim()&#10;                            // &quot;subtype&quot; -&gt; if(fs.feats == &quot;_&quot;) fs.feats += &quot;:&quot; + feature.getElementsByTagName(&quot;symbol&quot;).item(0).attributes.getNamedItem(&quot;value&quot;).textContent&#10;                            &quot;certainty&quot; -&gt; if(fs.misc == &quot;_&quot;) fs.misc = value&#10;                        }&#10;                    }&#10;                res[fromTo] = fs&#10;            }&#10;        return res&#10;    }&#10;&#10;    private fun extractSentenceSpans(spans: NodeList): Array&lt;Span&gt; {&#10;        return IntStream.range(0, spans.length).mapToObj(spans::item)&#10;            .filter { node -&gt; node is Element &amp;&amp; node.getElementsByTagName(&quot;f&quot;).item(0).textContent.equals(&quot;s&quot;) }&#10;            .map { node -&gt;&#10;                Span(&#10;                    Integer.parseInt((node as Element).getAttribute(&quot;from&quot;)), Integer.parseInt(node.getAttribute(&quot;to&quot;))&#10;                )&#10;            }.toArray { size -&gt; arrayOfNulls(size) }&#10;    }&#10;&#10;    /*&#10;     &lt;span id=&quot;s15&quot; from=&quot;370&quot; to=&quot;394&quot; l=&quot;5&quot;&gt;&#10;      &lt;fs type=&quot;struct&quot; xmlns=&quot;http://www.tei-c.org/ns/1.0&quot;&gt;&#10;        &lt;f name=&quot;name&quot;&gt;posting&lt;/f&gt;&#10;        &lt;f name=&quot;attr&quot;&gt;&#10;          &lt;fs type=&quot;attr&quot;&gt;&#10;            &lt;f name=&quot;id&quot;&gt;i.10894_1_3&lt;/f&gt;&#10;            &lt;f name=&quot;indentLevel&quot;&gt;0&lt;/f&gt;&#10;            &lt;f name=&quot;who&quot;&gt;WU00000000&lt;/f&gt;&#10;          &lt;/fs&gt;&#10;        &lt;/f&gt;&#10;      &lt;/fs&gt;&#10;    &lt;/span&gt;&#10;&#10;     */&#10;    private fun extractMiscSpans(spans: NodeList): MutableMap&lt;String, String&gt; {&#10;        val miscLocal: MutableMap&lt;String, String&gt; = HashMap()&#10;&#10;        IntStream.range(0, spans.length).mapToObj(spans::item)&#10;            .filter { node -&gt;&#10;                node is Element&#10;                        &amp;&amp; node.getElementsByTagName(&quot;f&quot;).length &gt; 1&#10;                        &amp;&amp; (node.getElementsByTagName(&quot;f&quot;).item(0) as Element).getAttribute(&quot;name&quot;).equals(&quot;name&quot;)&#10;                        &amp;&amp; (node.getElementsByTagName(&quot;f&quot;).item(1) as Element).getAttribute(&quot;name&quot;).equals(&quot;attr&quot;)&#10;            }&#10;            .forEach { node -&gt;&#10;                if (node == null) return@forEach&#10;                val elementName = (node as Element).getElementsByTagName(&quot;f&quot;).item(0).textContent.trim()&#10;                val from = node.getAttribute(&quot;from&quot;)&#10;                val attributes = (node.getElementsByTagName(&quot;f&quot;).item(1) as Element).getElementsByTagName(&quot;f&quot;)&#10;                val res = StringBuilder()&#10;                IntStream.range(0, attributes.length).mapToObj(attributes::item).forEach { attr -&gt;&#10;                    val attrName = &quot;$elementName/${(attr as Element).getAttribute(&quot;name&quot;)}&quot;&#10;                    if (attrName.matches(Regex(extractAttributesRegex))) {&#10;                         res.append(&quot;# $attrName = ${attr.textContent}\n&quot;)&#10;                        //LOGGER.info(&quot;&quot; + from + &quot;: $attrName = &quot; + attr.textContent)&#10;                    }&#10;&#10;                }&#10;                if (res.isNotEmpty()) {&#10;                    if (miscLocal.containsKey(from)) {&#10;                        // LOGGER.info(&quot;ADDING TO $from: ${miscLocal[from]}&quot;)&#10;                        miscLocal[from] += res.toString()&#10;                    } else {&#10;                        miscLocal[from] = res.toString()&#10;                    }&#10;                }&#10;            }&#10;        return miscLocal&#10;    }&#10;&#10;&#10;    class Span(var from: Int, var to: Int)&#10;&#10;    class MorphoSpan(&#10;        var lemma: String? = &quot;_&quot;,&#10;        var upos: String? = &quot;_&quot;,&#10;        var xpos: String? = &quot;_&quot;,&#10;        var feats: String? = &quot;_&quot;,&#10;        var head: String? = &quot;_&quot;,&#10;        var deprel: String? = &quot;_&quot;,&#10;        var deps: String? = &quot;_&quot;,&#10;        var misc: String? = &quot;_&quot;&#10;    )&#10;&#10;}&#10;&#10;fun main(args: Array&lt;String&gt;): Unit = exitProcess(CommandLine(KorapXmlTool()).execute(*args))&#10;&#10;fun debug(args: Array&lt;String&gt;): Int {&#10;    return (CommandLine(KorapXmlTool()).execute(*args))&#10;}&#10;&#10;enum class OutputFormat {&#10;    CONLLU, WORD2VEC, KORAPXML, NOW&#10;}&#10;&#10;object ConlluOutputFormat {&#10;    const val NAME = &quot;conllu&quot;&#10;}&#10;&#10;object Word2VecOutputFormat {&#10;    const val NAME = &quot;word2vec&quot;&#10;}&#10;&#10;object KorapXmlOutputFormat {&#10;    const val NAME = &quot;korapxml&quot;&#10;}&#10;&#10;object NowOutputFormat {&#10;    const val NAME = &quot;now&quot;&#10;}" />
             </PendingDiffInfo>
           </value>
         </entry>
         <entry key="$PROJECT_DIR$/build.gradle">
           <value>
             <PendingDiffInfo>
               <option name="filePath" value="$PROJECT_DIR$/build.gradle" />
               <option name="originalContent" value="repositories {&#10;    flatDir {&#10;        dirs(&quot;libs&quot;)&#10;    }&#10;}&#10;" />
               <option name="updatedContent" value="repositories {&#10;    flatDir {&#10;        dirs(&quot;libs&quot;)&#10;    }&#10;}&#10;&#10;// Zentrale Projektversion für korapxmltool&#10;version = '2.0-beta-02'" />
             </PendingDiffInfo>
           </value>
         </entry>
       </map>
     </option>
   </component>
 </project>