blob: f504029475ac385aac94af90974663a95a04fa93 [file] [log] [blame]
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="CopilotDiffPersistence">
<option name="pendingDiffs">
<map>
<entry key="$PROJECT_DIR$/app/build.gradle">
<value>
<PendingDiffInfo>
<option name="filePath" value="$PROJECT_DIR$/app/build.gradle" />
<option name="originalContent" value="plugins {&#10; // Apply the org.jetbrains.kotlin.jvm Plugin to add support for Kotlin.&#10; id 'org.jetbrains.kotlin.jvm' version '2.2.21'&#10;&#10; // Apply the application plugin to add support for building a CLI application in Java.&#10; id 'application'&#10; id 'com.github.johnrengelman.shadow' version '8.1.1'&#10;}&#10;&#10;repositories {&#10; mavenCentral()&#10; maven { url 'https://jitpack.io' }&#10;}&#10;&#10;test {&#10; minHeapSize = &quot;512m&quot;&#10; maxHeapSize = &quot;4096m&quot;&#10; jvmArgs '-XX:MaxMetaspaceSize=1024m'&#10;}&#10;&#10;dependencies {&#10; // Align versions of all Kotlin components&#10; implementation platform('org.jetbrains.kotlin:kotlin-bom')&#10;&#10; // Use the Kotlin JDK 8 standard library.&#10; implementation 'org.jetbrains.kotlin:kotlin-stdlib'&#10; implementation 'org.jetbrains.kotlinx:kotlinx-coroutines-core:1.10.2'&#10;&#10; // This dependency is used by the application.&#10; implementation 'com.google.guava:guava:33.5.0-jre'&#10;&#10;&#10; implementation (&quot;info.picocli:picocli:4.7.7&quot;)&#10;&#10; // Use the Kotlin test library.&#10; testImplementation 'org.jetbrains.kotlin:kotlin-test'&#10;&#10; // Use the Kotlin JUnit integration.&#10; testImplementation 'org.jetbrains.kotlin:kotlin-test-junit'&#10; testImplementation &quot;org.jetbrains.kotlin:kotlin-test:2.2.21&quot;&#10;&#10; implementation 'com.github.kupietz:cistern:v1.0.4'&#10; implementation 'org.maltparser:maltparser:1.9.2'&#10; implementation 'org.apache.opennlp:opennlp-tools:2.5.6'&#10; implementation 'org.slf4j:slf4j-simple:2.0.17'&#10; implementation 'org.apache.ant:ant:1.10.15'&#10; implementation 'org.apache.commons:commons-compress:1.28.0'&#10;&#10;}&#10;&#10;// Erzwinge JDK 21 Toolchain und Bytecode-Level 21&#10;java {&#10; toolchain {&#10; languageVersion = JavaLanguageVersion.of(21)&#10; }&#10;}&#10;&#10;kotlin {&#10; jvmToolchain(21)&#10;}&#10;&#10;// Für evtl. vorhandenen Java-Quellcode&#10;tasks.withType(JavaCompile).configureEach {&#10; options.release = 21&#10;}&#10;&#10;tasks.withType(org.jetbrains.kotlin.gradle.tasks.KotlinCompile).configureEach {&#10; kotlinOptions {&#10; jvmTarget = &quot;21&quot;&#10; // Falls verfügbar, sorgt dies für konsistente API-Targets ähnlich zu Java --release&#10; // freeCompilerArgs += [&quot;-Xjdk-release=21&quot;]&#10; }&#10;}&#10;&#10;application {&#10; // Define the main class for the application.&#10; mainClass = 'de.ids_mannheim.korapxmltools.KorapXmlToolKt'&#10;}&#10;&#10;jar {&#10; // Will include every single one of your dependencies, project or not&#10; // def lowerCasedName = baseName.toLowerCase()&#10; // def normalizedName = lowerCasedName.substring(0,1).toUpperCase() + lowerCasedName.substring(1)&#10;&#10; manifest.attributes(&#10; 'Class-Path': configurations.compileClasspath.collect { it.getName() }.join(' '),&#10; 'Main-Class': &quot;de.ids_mannheim.korapxmltools.KorapXmlToolKt&quot;&#10; )&#10; shadowJar {&#10; archiveBaseName.set('korapxmltool')&#10; archiveClassifier.set('')&#10; archiveVersion.set('')&#10; }&#10;}&#10;&#10;&#10;configurations {&#10; runtimeLib.extendsFrom implementation&#10;}&#10;" />
<option name="updatedContent" value="plugins {&#10; // Apply the org.jetbrains.kotlin.jvm Plugin to add support for Kotlin.&#10; id 'org.jetbrains.kotlin.jvm' version '2.2.21'&#10;&#10; // Apply the application plugin to add support for building a CLI application in Java.&#10; id 'application'&#10; id 'com.github.johnrengelman.shadow' version '8.1.1'&#10;}&#10;&#10;repositories {&#10; mavenCentral()&#10; maven { url 'https://jitpack.io' }&#10;}&#10;&#10;test {&#10; minHeapSize = &quot;512m&quot;&#10; maxHeapSize = &quot;4096m&quot;&#10; jvmArgs '-XX:MaxMetaspaceSize=1024m'&#10;}&#10;&#10;dependencies {&#10; // Align versions of all Kotlin components&#10; implementation platform('org.jetbrains.kotlin:kotlin-bom')&#10;&#10; // Use the Kotlin JDK 8 standard library.&#10; implementation 'org.jetbrains.kotlin:kotlin-stdlib'&#10; implementation 'org.jetbrains.kotlinx:kotlinx-coroutines-core:1.10.2'&#10;&#10; // This dependency is used by the application.&#10; implementation 'com.google.guava:guava:33.5.0-jre'&#10;&#10;&#10; implementation (&quot;info.picocli:picocli:4.7.7&quot;)&#10;&#10; // Use the Kotlin test library.&#10; testImplementation 'org.jetbrains.kotlin:kotlin-test'&#10;&#10; // Use the Kotlin JUnit integration.&#10; testImplementation 'org.jetbrains.kotlin:kotlin-test-junit'&#10; testImplementation &quot;org.jetbrains.kotlin:kotlin-test:2.2.21&quot;&#10;&#10; implementation 'com.github.kupietz:cistern:v1.0.4'&#10; implementation 'org.maltparser:maltparser:1.9.2'&#10; implementation 'org.apache.opennlp:opennlp-tools:2.5.6'&#10; implementation 'org.slf4j:slf4j-simple:2.0.17'&#10; implementation 'org.apache.ant:ant:1.10.15'&#10; implementation 'org.apache.commons:commons-compress:1.28.0'&#10;&#10;}&#10;&#10;// Erzwinge JDK 21 Toolchain und Bytecode-Level 21&#10;java {&#10; toolchain {&#10; languageVersion = JavaLanguageVersion.of(21)&#10; }&#10;}&#10;&#10;kotlin {&#10; jvmToolchain(21)&#10;}&#10;&#10;// Für evtl. vorhandenen Java-Quellcode&#10;tasks.withType(JavaCompile).configureEach {&#10; options.release = 21&#10;}&#10;&#10;tasks.withType(org.jetbrains.kotlin.gradle.tasks.KotlinCompile).configureEach {&#10; kotlinOptions {&#10; jvmTarget = &quot;21&quot;&#10; // Falls verfügbar, sorgt dies für konsistente API-Targets ähnlich zu Java --release&#10; // freeCompilerArgs += [&quot;-Xjdk-release=21&quot;]&#10; }&#10;}&#10;&#10;application {&#10; // Define the main class for the application.&#10; mainClass = 'de.ids_mannheim.korapxmltools.KorapXmlToolKt'&#10;}&#10;&#10;jar {&#10; // Will include every single one of your dependencies, project or not&#10; // def lowerCasedName = baseName.toLowerCase()&#10; // def normalizedName = lowerCasedName.substring(0,1).toUpperCase() + lowerCasedName.substring(1)&#10;&#10; manifest.attributes(&#10; 'Class-Path': configurations.compileClasspath.collect { it.getName() }.join(' '),&#10; 'Main-Class': &quot;de.ids_mannheim.korapxmltools.KorapXmlToolKt&quot;,&#10; 'Implementation-Title': rootProject.name,&#10; 'Implementation-Version': project.version&#10; )&#10; shadowJar {&#10; archiveBaseName.set('korapxmltool')&#10; archiveClassifier.set('')&#10; // Version ins Dateinamen aufnehmen&#10; archiveVersion.set(project.version.toString())&#10; manifest.attributes(&#10; 'Main-Class': &quot;de.ids_mannheim.korapxmltools.KorapXmlToolKt&quot;,&#10; 'Implementation-Title': rootProject.name,&#10; 'Implementation-Version': project.version&#10; )&#10; }&#10;}&#10;&#10;&#10;configurations {&#10; runtimeLib.extendsFrom implementation&#10;}" />
</PendingDiffInfo>
</value>
</entry>
<entry key="$PROJECT_DIR$/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt">
<value>
<PendingDiffInfo>
<option name="filePath" value="$PROJECT_DIR$/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt" />
<option name="originalContent" value="package de.ids_mannheim.korapxmltools&#10;&#10;import de.ids_mannheim.korapxmltools.AnnotationToolBridgeFactory.Companion.parserFoundries&#10;import de.ids_mannheim.korapxmltools.AnnotationToolBridgeFactory.Companion.taggerFoundries&#10;import org.apache.commons.compress.archivers.zip.Zip64Mode&#10;import org.apache.commons.compress.archivers.zip.ZipArchiveEntry&#10;import org.w3c.dom.Document&#10;import org.w3c.dom.Element&#10;import org.w3c.dom.NodeList&#10;import org.xml.sax.InputSource&#10;import org.xml.sax.SAXParseException&#10;import picocli.CommandLine&#10;import picocli.CommandLine.*&#10;import java.io.File&#10;import java.io.FileOutputStream&#10;import java.io.InputStream&#10;import java.io.StringWriter&#10;import java.lang.Integer.parseInt&#10;import java.util.*&#10;import java.util.concurrent.Callable&#10;import java.util.concurrent.ConcurrentHashMap&#10;import java.util.concurrent.Executors&#10;import java.util.concurrent.atomic.AtomicLong&#10;import java.util.logging.ConsoleHandler&#10;import java.util.logging.Level&#10;import java.util.logging.LogManager&#10;import java.util.logging.Logger&#10;import java.util.regex.Matcher&#10;import java.util.regex.Pattern&#10;import java.util.stream.IntStream&#10;import java.util.zip.ZipEntry&#10;&#10;import java.util.zip.ZipFile&#10;import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream&#10;import javax.xml.parsers.DocumentBuilder&#10;import javax.xml.parsers.DocumentBuilderFactory&#10;import javax.xml.transform.OutputKeys&#10;import javax.xml.transform.TransformerFactory&#10;import javax.xml.transform.dom.DOMSource&#10;import javax.xml.transform.stream.StreamResult&#10;import kotlin.math.min&#10;import kotlin.system.exitProcess&#10;&#10;val ZIP_ENTRY_UNIX_MODE = parseInt(&quot;644&quot;, 8)&#10;&#10;@Command(&#10; name = &quot;KorapXmlTool&quot;,&#10; mixinStandardHelpOptions = true,&#10; version = [&quot;KorapXmlTool 2.0-beta-01&quot;],&#10; description = [&quot;Converts KorAP-XML &lt;https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml&gt; base or &quot; +&#10; &quot;morpho zips to (annotated) CoNLL(-U) format with all information necessary for &quot; +&#10; &quot;reconstruction in comment lines.&quot;]&#10;)&#10;&#10;class KorapXmlTool : Callable&lt;Int&gt; {&#10; val COMPATIBILITY_MODE = System.getenv(&quot;COMPATIBILITY_MODE&quot;) != null&#10;&#10; @Spec lateinit var spec : Model.CommandSpec&#10;&#10; @Parameters(arity = &quot;1..*&quot;, description = [&quot;At least one zip file name&quot;])&#10; var zipFileNames: Array&lt;String&gt;? = null&#10;&#10; @Option(&#10; names = [&quot;-f&quot;, &quot;--output-format&quot;],&#10; description = [&quot;Output format: ${ConlluOutputFormat.NAME}, ${Word2VecOutputFormat.NAME}, ${KorapXmlOutputFormat.NAME}, ${NowOutputFormat.NAME}&quot;,&#10; &quot;conllu: CoNLL-U format&quot;,&#10; &quot;korapxml, xml, zip: KorAP-XML format zip&quot;,&#10; &quot;word2vec, w2v: Print text in LM training format: tokens separated by space, sentences separated by newlines&quot;,&#10; &quot;now, NOW: NOW corpus export format: w2v-like format with &lt;p&gt; tags for sentence ends and @@&lt;text-sigle&gt; prefix&quot;,&#10; ],&#10; converter = [OutputFormatConverter::class]&#10; )&#10; var outputFormat: OutputFormat = OutputFormat.CONLLU&#10; class OutputFormatConverter : ITypeConverter&lt;OutputFormat&gt; {&#10; override fun convert(value: String?): OutputFormat {&#10; return when (value?.lowercase(Locale.getDefault())) {&#10; &quot;conllu&quot;, &quot;conll&quot; -&gt; OutputFormat.CONLLU&#10; &quot;word2vec&quot;, &quot;w2v&quot; -&gt; OutputFormat.WORD2VEC&#10; &quot;korapxml&quot;, &quot;korap&quot;, &quot;xml&quot;, &quot;zip&quot; -&gt; OutputFormat.KORAPXML&#10; &quot;now&quot;, &quot;NOW&quot; -&gt; OutputFormat.NOW&#10; else -&gt; throw IllegalArgumentException(&quot;Unknown output format: `$value'. Use one of: ${OutputFormat.entries.joinToString(&quot;, &quot;) { it.name }}&quot;)&#10; }&#10; }&#10; }&#10;&#10; @Option(&#10; names = [&quot;--sigle-pattern&quot;, &quot;-p&quot;],&#10; paramLabel = &quot;PATTERN&quot;,&#10; description = [&quot;Extract only documents with sigle matching the pattern (regex)&quot;]&#10; )&#10; var siglePattern: String? = null&#10;&#10; @Option(&#10; names = [&quot;--extract-attributes-regex&quot;, &quot;-e&quot;],&#10; paramLabel = &quot;REGEX&quot;,&#10; description = [&quot;Extract additional attribute values from structure.xml and writes them as comment line in front of the first covered token.&quot;,&#10; &quot;Example: -e '(posting/id|div/id)'&quot;]&#10; )&#10; var extractAttributesRegex: String = &quot;&quot;&#10;&#10; @Option(&#10; names = [&quot;--s-bounds-from-morpho&quot;], description = [&quot;Not yet implemented: s bounds from morpho&quot;]&#10; )&#10; var sBoundsFromMorpho: Boolean = false&#10;&#10; @Option(&#10; names = [&quot;--log&quot;, &quot;-l&quot;],&#10; paramLabel = &quot;LEVEL&quot;,&#10; description = [&quot;Log level: one of SEVERE, WARNING, INFO, FINE, FINER, FINEST. Default: ${&quot;$&quot;}{DEFAULT-VALUE}])&quot;]&#10; )&#10; var logLevel: String = &quot;WARNING&quot;&#10;&#10; @Option(&#10; names = [&quot;--columns&quot;, &quot;-c&quot;],&#10; paramLabel = &quot;NUMBER&quot;,&#10; description = [&quot;Number of columns. 1 means just the token. Default: ${&quot;$&quot;}{DEFAULT-VALUE}&quot;, &quot;Possible values: 1-10&quot;]&#10; )&#10; var columns: Int = 10&#10;&#10; @Option(&#10; names = [&quot;--word2vec&quot;, &quot;-w&quot;],&#10; description = [&quot;Print text in LM training format: tokens separated by space, sentences separated by newline&quot;,&#10; &quot;Deprecated: use -f word2vec&quot;]&#10; )&#10; fun setWord2Vec(word2vec: Boolean) {&#10; if (word2vec) {&#10; outputFormat = OutputFormat.WORD2VEC&#10; }&#10; }&#10;&#10; @Option(&#10; names = [&quot;--exclude-zip-glob&quot;],&#10; paramLabel = &quot;GLOB&quot;,&#10; description = [&#10; &quot;Exclude zip files whose basename matches the glob (e.g., 'w?d24.tree_tagger.zip').&quot;,&#10; &quot;May be repeated. Applied to basenames, not full paths.&quot;&#10; ]&#10; )&#10; var excludeZipGlobs: MutableList&lt;String&gt; = mutableListOf()&#10;&#10; @Option(&#10; names = [&quot;--token-separator&quot;, &quot;-s&quot;],&#10; paramLabel = &quot;STRING&quot;,&#10; defaultValue = &quot;\n&quot;,&#10; description = [&quot;Token separator. Default: new-line for CoNLL-U, space for word2vec format.&quot;]&#10; )&#10; var tokenSeparator: String = if (outputFormat == OutputFormat.WORD2VEC || outputFormat == OutputFormat.NOW) &quot; &quot; else &quot;\n&quot;&#10;&#10; @Option(names = [&quot;--offsets&quot;], description = [&quot;Not yet implemented: offsets&quot;])&#10; var offsets: Boolean = false&#10;&#10; @Option(names = [&quot;--comments&quot;, &quot;-C&quot;], description = [&quot;Not yet implemented: comments&quot;])&#10; var comments: Boolean = false&#10;&#10; @Option(&#10; names = [&quot;--extract-metadata-regex&quot;, &quot;-m&quot;],&#10; paramLabel = &quot;REGEX&quot;,&#10; description = [&quot;Extract metadata regexes.\nExample: -m '&lt;textSigle&gt;([^&lt;]+)' -m '&lt;creatDate&gt;([^&lt;]+)'&quot;]&#10; )&#10; var extractMetadataRegex: MutableList&lt;String&gt; = mutableListOf()&#10;&#10; @Option(&#10; names = [&quot;--annotate-with&quot;, &quot;-A&quot;],&#10; paramLabel = &quot;COMMAND&quot;,&#10; description = [&quot;Pipe output through command&quot;]&#10; )&#10; var annotateWith: String = &quot;&quot;&#10;&#10; @Option(&#10; names = [&quot;--threads&quot;, &quot;-T&quot;],&#10; paramLabel = &quot;THREADS&quot;,&#10; description = [&quot;Maximum number of threads to use. Default: ${&quot;$&quot;}{DEFAULT-VALUE}&quot;]&#10; )&#10; var maxThreads: Int = Runtime.getRuntime().availableProcessors() / 2&#10; fun setThreads(threads: Int) {&#10; if (threads &lt; 1) {&#10; throw ParameterException(spec.commandLine(), String.format(&quot;Invalid value `%d' for option '--threads': must be at least 1&quot;, threads))&#10; }&#10; this.maxThreads = threads&#10; System.setProperty(&quot;java.util.concurrent.ForkJoinPool.common.parallelism&quot;, threads.toString())&#10; }&#10;&#10; @Option(&#10; names = [&quot;--zip-parallelism&quot;],&#10; paramLabel = &quot;N&quot;,&#10; description = [&quot;Maximum number of zip files to process concurrently. Defaults to --threads.&quot;]&#10; )&#10; var zipParallelism: Int? = null&#10;&#10; @Option(&#10; names = [&quot;--sequential&quot;],&#10; description = [&#10; &quot;Process entries inside each zip sequentially; zips processed in parallel (only for word2vec/now).&quot;&#10; ]&#10; )&#10; var sequentialInZip: Boolean = false&#10;&#10; @Option(&#10; names = [&quot;--overwrite&quot;, &quot;-o&quot;],&#10; description = [&quot;Overwrite existing files&quot;]&#10; )&#10; var overwrite: Boolean = false&#10;&#10; @Option(&#10; names = [&quot;--mem-stats-interval&quot;],&#10; paramLabel = &quot;N&quot;,&#10; description = [&quot;Log memory and cache statistics every N processed documents (0 disables; default: 0)&quot;]&#10; )&#10; var memStatsInterval: Int = 0&#10;&#10; @Option(&#10; names = [&quot;--lemma&quot;],&#10; description = [&quot;In word2vec/now output modes, output lemmas instead of surface tokens when lemma annotations are available (requires corresponding morpho annotation XML)&quot;]&#10; )&#10; var useLemma: Boolean = false&#10;&#10; @Option(&#10; names = [&quot;--lemma-only&quot;],&#10; description = [&#10; &quot;Do not load texts from data.xml and output only lemmas (requires morpho.xml).&quot;,&#10; &quot;Only valid with -f word2vec or -f now; implies --lemma.&quot;&#10; ]&#10; )&#10; var lemmaOnly: Boolean = false&#10;&#10; private var taggerName: String? = null&#10; private var taggerModel: String? = null&#10; @Option(&#10; names = [&quot;--tag-with&quot;, &quot;-t&quot;],&#10; paramLabel = &quot;TAGGER:MODEL&quot;,&#10; description = [&quot;Specify a tagger and a model: ${taggerFoundries}:&lt;path/to/model&gt;.&quot;]&#10; )&#10; fun setTagWith(tagWith: String) {&#10; val pattern: Pattern = Pattern.compile(&quot;(${taggerFoundries}):(.+)&quot;)&#10; val matcher: Matcher = pattern.matcher(tagWith)&#10; if (!matcher.matches()) {&#10; throw ParameterException(spec.commandLine(),&#10; String.format(&quot;Invalid value `%s' for option '--tag-with': &quot;+&#10; &quot;value does not match the expected pattern ${taggerFoundries}:&lt;path/to/model&gt;&quot;, tagWith))&#10; } else {&#10; taggerName = matcher.group(1)&#10; taggerModel = matcher.group(2)&#10; if (!File(taggerModel).exists()) {&#10; throw ParameterException(spec.commandLine(),&#10; String.format(&quot;Invalid value for option '--tag-with':&quot;+&#10; &quot;model file '%s' does not exist&quot;, taggerModel, taggerModel))&#10; }&#10; }&#10; }&#10;&#10; private var parserName: String? = null&#10; private var parserModel: String? = null&#10; @Option(&#10; names = [&quot;--parse-with&quot;, &quot;-P&quot;],&#10; paramLabel = &quot;parser:MODEL&quot;,&#10; description = [&quot;Specify a parser and a model: ${parserFoundries}:&lt;path/to/model&gt;.&quot;]&#10; )&#10; fun setParseWith(parseWith: String) {&#10; val pattern: Pattern = Pattern.compile(&quot;(${parserFoundries}):(.+)&quot;)&#10; val matcher: Matcher = pattern.matcher(parseWith)&#10; if (!matcher.matches()) {&#10; throw ParameterException(spec.commandLine(),&#10; String.format(&quot;Invalid value `%s' for option '--parse-with': &quot;+&#10; &quot;value does not match the expected pattern (${parserFoundries}):&lt;path/to/model&gt;&quot;, parseWith))&#10; } else {&#10; parserName = matcher.group(1)&#10; parserModel = matcher.group(2)&#10; if (!File(parserModel).exists()) {&#10; throw ParameterException(spec.commandLine(),&#10; String.format(&quot;Invalid value for option '--parse-with':&quot;+&#10; &quot;model file '%s' does not exist&quot;, parserModel, parserModel))&#10; }&#10; }&#10; }&#10;&#10;&#10; override fun call(): Int {&#10; val handler = ConsoleHandler()&#10; LogManager.getLogManager().reset()&#10; handler.formatter = ColoredFormatter()&#10;&#10; for (handler in LOGGER.handlers) {&#10; LOGGER.removeHandler(handler)&#10; }&#10; LOGGER.addHandler(handler)&#10; LOGGER.level = try {&#10; Level.parse(logLevel.uppercase(Locale.getDefault()))&#10; } catch (e: IllegalArgumentException) {&#10; LOGGER.warning(&quot;Invalid log level: $logLevel. Defaulting to WARNING.&quot;)&#10; Level.WARNING&#10; }&#10;&#10; if (lemmaOnly) {&#10; useLemma = true&#10; if (outputFormat != OutputFormat.WORD2VEC &amp;&amp; outputFormat != OutputFormat.NOW) {&#10; throw ParameterException(spec.commandLine(), &quot;--lemma-only is supported only with -f word2vec or -f now&quot;)&#10; }&#10; }&#10;&#10; LOGGER.info(&quot;Processing zip files: &quot; + zipFileNames!!.joinToString(&quot;, &quot;))&#10;&#10; korapxml2conllu(zipFileNames!!)&#10; return 0&#10; }&#10;&#10; private val LOGGER: Logger = Logger.getLogger(KorapXmlTool::class.java.name)&#10;&#10; private var annotationWorkerPool : AnnotationWorkerPool? = null&#10; // Shared executor for entry-level parallelism across all zips&#10; private var entryExecutor: java.util.concurrent.ExecutorService? = null&#10;&#10; val texts: ConcurrentHashMap&lt;String, NonBmpString&gt; = ConcurrentHashMap()&#10; val sentences: ConcurrentHashMap&lt;String, Array&lt;Span&gt;&gt; = ConcurrentHashMap()&#10; val tokens: ConcurrentHashMap&lt;String, Array&lt;Span&gt;&gt; = ConcurrentHashMap()&#10; val morpho: ConcurrentHashMap&lt;String, MutableMap&lt;String, MorphoSpan&gt;&gt; = ConcurrentHashMap()&#10; val fnames: ConcurrentHashMap&lt;String, String&gt; = ConcurrentHashMap()&#10; val metadata: ConcurrentHashMap&lt;String, Array&lt;String&gt;&gt; = ConcurrentHashMap()&#10; val extraFeatures: ConcurrentHashMap&lt;String, MutableMap&lt;String, String&gt;&gt; = ConcurrentHashMap()&#10; private val processedDocs = java.util.concurrent.atomic.AtomicInteger(0)&#10; var taggerToolBridges: ConcurrentHashMap&lt;Long, TaggerToolBridge?&gt; = ConcurrentHashMap()&#10; var parserToolBridges: ConcurrentHashMap&lt;Long, ParserToolBridge?&gt; = ConcurrentHashMap()&#10;&#10; // Zip progress tracking for logging (zipNumber/zipTotal)&#10; private val zipOrdinals: ConcurrentHashMap&lt;String, Int&gt; = ConcurrentHashMap()&#10; private var totalZips: Int = 0&#10; private val zipSizes: ConcurrentHashMap&lt;String, Long&gt; = ConcurrentHashMap()&#10; private val processedZipBytes: AtomicLong = AtomicLong(0)&#10; private var totalZipBytes: Long = 0&#10; private var startTimeMillis: Long = 0&#10;&#10; var dbFactory: DocumentBuilderFactory? = null&#10; var dBuilder: DocumentBuilder? = null&#10; var morphoZipOutputStream: ZipArchiveOutputStream? = null&#10;&#10; fun String.hasCorrespondingBaseZip(): Boolean {&#10; if (!this.matches(Regex(&quot;.*\\.([^/.]+)\\.zip$&quot;))) return false&#10; val baseZip = this.replace(Regex(&quot;\\.([^/.]+)\\.zip$&quot;), &quot;.zip&quot;)&#10; return File(baseZip).exists()&#10; }&#10;&#10; fun String.correspondingBaseZip(): String? {&#10; if (!this.matches(Regex(&quot;.*\\.([^/.]+)\\.zip$&quot;))) return null&#10; val baseZip = this.replace(Regex(&quot;\\.([^/.]+)\\.zip$&quot;), &quot;.zip&quot;)&#10; return if (File(baseZip).exists()) baseZip else null&#10; }&#10;&#10; fun korapxml2conllu(args: Array&lt;String&gt;) {&#10; if (outputFormat == OutputFormat.KORAPXML &amp;&amp; annotateWith.isNotEmpty()) {&#10; LOGGER.severe(&quot;Shell command annotation is not yet supported with output format $outputFormat&quot;)&#10; exitProcess(1)&#10; }&#10; // Initialize shared entry executor (used inside each zip)&#10; entryExecutor = Executors.newFixedThreadPool(maxThreads)&#10;&#10; if (annotateWith.isNotEmpty()) {&#10; annotationWorkerPool = AnnotationWorkerPool(annotateWith, maxThreads, LOGGER)&#10; }&#10;&#10; var zips: Array&lt;String&gt; = args&#10; if (excludeZipGlobs.isNotEmpty()) {&#10; val before = zips.size&#10; val patterns = excludeZipGlobs.map { globToRegex(it) }&#10; zips = zips.filter { zipPath -&gt;&#10; val base = File(zipPath).name&#10; patterns.none { rx -&gt; rx.matches(base) }&#10; }.toTypedArray()&#10; val excluded = before - zips.size&#10; if (excluded &gt; 0) {&#10; LOGGER.info(&quot;Excluded $excluded of $before zip(s) by glob(s): ${excludeZipGlobs.joinToString(&quot;, &quot;)}&quot;)&#10; }&#10; }&#10; // Initialize zip progress tracking and sizes&#10; startTimeMillis = System.currentTimeMillis()&#10; processedZipBytes.set(0)&#10; totalZips = zips.size&#10; zipOrdinals.clear()&#10; zipSizes.clear()&#10; zips.forEach { zip -&gt; zipSizes[zip] = try { File(zip).length() } catch (_: Exception) { 0L } }&#10; totalZipBytes = zipSizes.values.sum()&#10; // In lemma-only mode, process largest zips first&#10; if (lemmaOnly) {&#10; zips = zips.sortedByDescending { zipSizes[it] ?: 0L }.toTypedArray()&#10; }&#10; zips.forEachIndexed { index, zip -&gt; zipOrdinals[zip] = index + 1 }&#10;&#10; // Log zip order with sizes so the user can verify sorting&#10; val totalHuman = humanBytes(totalZipBytes)&#10; LOGGER.info(&quot;Zip processing order (${zips.size} file(s), total ${totalHuman}):&quot;)&#10; zips.forEachIndexed { idx, zip -&gt;&#10; val size = zipSizes[zip] ?: 0L&#10; LOGGER.info(String.format(Locale.ROOT, &quot;%d/%d: %s (%s)&quot;, idx + 1, zips.size, zip, humanBytes(size)))&#10; }&#10;&#10; if (sequentialInZip) {&#10; if (outputFormat != OutputFormat.WORD2VEC &amp;&amp; outputFormat != OutputFormat.NOW) {&#10; throw ParameterException(spec.commandLine(), &quot;--sequential is supported only with -f word2vec or -f now&quot;)&#10; }&#10; }&#10;&#10; if (maxThreads &gt; 1) {&#10; val foundry = getFoundryFromZipFileNames(zips)&#10; val parallelism = (zipParallelism ?: maxThreads).coerceAtLeast(1)&#10; LOGGER.info(&quot;Processing zips with ordered queue; parallelism=$parallelism; entries ${if (sequentialInZip) &quot;sequential&quot; else &quot;parallel&quot;}&quot;)&#10; processZipsWithQueue(zips, foundry, parallelism)&#10; } else {&#10; LOGGER.info(&quot;Processing zip files sequentially&quot;)&#10; Arrays.stream(zips).forEachOrdered { zipFilePath -&gt;&#10; processZipFileSequentially((zipFilePath ?: &quot;&quot;).toString(), getFoundryFromZipFileNames(zips))&#10; }&#10; }&#10;&#10; if (annotationWorkerPool != null) {&#10; LOGGER.info(&quot;closing worker pool&quot;)&#10; annotationWorkerPool?.close()&#10; }&#10; // Shutdown entry executor&#10; entryExecutor?.shutdown()&#10; }&#10;&#10; private fun processZipsWithQueue(zips: Array&lt;String&gt;, foundry: String, parallelism: Int) {&#10; val queue: java.util.concurrent.BlockingQueue&lt;String&gt; = java.util.concurrent.LinkedBlockingQueue()&#10; zips.forEach { queue.put(it) }&#10; val executor = Executors.newFixedThreadPool(parallelism)&#10; val active = java.util.concurrent.atomic.AtomicInteger(0)&#10; repeat(parallelism) {&#10; executor.submit {&#10; active.incrementAndGet()&#10; try {&#10; while (true) {&#10; val zipPath = queue.poll(100, java.util.concurrent.TimeUnit.MILLISECONDS)&#10; if (zipPath == null) {&#10; if (queue.isEmpty()) break else continue&#10; }&#10; if (sequentialInZip) {&#10; processZipFileSequentially(zipPath, foundry)&#10; } else {&#10; processZipFile(zipPath, foundry)&#10; }&#10; }&#10; } finally {&#10; active.decrementAndGet()&#10; }&#10; }&#10; }&#10; executor.shutdown()&#10; try {&#10; executor.awaitTermination(7, java.util.concurrent.TimeUnit.DAYS)&#10; } catch (ie: InterruptedException) {&#10; Thread.currentThread().interrupt()&#10; }&#10; }&#10;&#10; // Convert a shell-like glob to a Regex: '*' -&gt; &quot;.*&quot;, '?' -&gt; '.', anchored full match&#10; private fun globToRegex(glob: String): Regex {&#10; val sb = StringBuilder(&quot;^&quot;)&#10; glob.forEach { ch -&gt;&#10; when (ch) {&#10; '*' -&gt; sb.append(&quot;.*&quot;)&#10; '?' -&gt; sb.append('.')&#10; '.', '(', ')', '+', '|', '^', '$', '@', '%', '{', '}', '[', ']', '\\' -&gt; sb.append('\\').append(ch)&#10; else -&gt; sb.append(ch)&#10; }&#10; }&#10; sb.append('$')&#10; return Regex(sb.toString())&#10; }&#10;&#10;&#10; private fun getTokenSpansFromMorho(morpho: MutableMap&lt;String, MorphoSpan&gt;): Array&lt;Span&gt; {&#10; return morpho.keys.map { key -&gt;&#10; val fromTo = key.split(&quot;-&quot;)&#10; Span(fromTo[0].toInt(), fromTo[1].toInt())&#10; }.sortedBy {&#10; it.from&#10; }.toTypedArray()&#10; }&#10;&#10; private fun getFoundryFromZipFileName(zipFileName: String): String {&#10; if (!zipFileName.matches(Regex(&quot;.*\\.([^/.]+)\\.zip$&quot;))) {&#10; return &quot;base&quot;&#10; }&#10; return zipFileName.replace(Regex(&quot;.*\\.([^/.]+)\\.zip$&quot;), &quot;$1&quot;)&#10; }&#10;&#10; private fun getFoundryFromZipFileNames(zipFileNames: Array&lt;String&gt;): String {&#10; for (zipFileName in zipFileNames) {&#10; val foundry = getFoundryFromZipFileName(zipFileName)&#10; if (foundry != &quot;base&quot;) {&#10; return foundry&#10; }&#10; }&#10; return &quot;base&quot;&#10; }&#10;&#10; private fun processZipFile(zipFilePath: String, foundry: String = &quot;base&quot;) {&#10; val ord = zipOrdinals[zipFilePath] ?: 0&#10; val size = zipSizes[zipFilePath] ?: 0L&#10; LOGGER.info(&quot;Processing zip ${if (ord&gt;0) ord else &quot;?&quot;}/$totalZips: ${zipFilePath} (${humanBytes(size)}) in thread ${Thread.currentThread().threadId()}&quot;)&#10; LOGGER.info(&quot;Foundry: $foundry $dbFactory&quot;)&#10; if (outputFormat == OutputFormat.KORAPXML &amp;&amp; dbFactory == null) {&#10; var targetFoundry = &quot;base&quot;&#10; if (taggerName != null) {&#10; val tagger = AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER) as TaggerToolBridge?&#10; if (tagger != null) {&#10; targetFoundry = tagger.foundry&#10; }&#10; } else if (parserName != null) {&#10; targetFoundry = parserName!!&#10; }&#10; dbFactory = DocumentBuilderFactory.newInstance()&#10; dBuilder = dbFactory!!.newDocumentBuilder()&#10; val outputMorphoZipFileName =&#10; if (parserName != null)&#10; zipFilePath.replace(Regex(&quot;(\\.(opennlp|marmot|tree_tagger|corenlp|spacy))?\\.zip$&quot;), &quot;.&quot;.plus(parserName).plus(&quot;.zip&quot;))&#10; else&#10; zipFilePath.replace(Regex(&quot;\\.zip$&quot;), &quot;.&quot;.plus(targetFoundry).plus(&quot;.zip&quot;))&#10; if (File(outputMorphoZipFileName).exists() &amp;&amp; !overwrite) {&#10; LOGGER.severe(&quot;Output file $outputMorphoZipFileName already exists. Use --overwrite to overwrite.&quot;)&#10; exitProcess(1)&#10; }&#10; val fileOutputStream = FileOutputStream(outputMorphoZipFileName)&#10; morphoZipOutputStream = ZipArchiveOutputStream(fileOutputStream).apply {&#10; setUseZip64(Zip64Mode.Always)&#10; }&#10; }&#10; if (zipFilePath.hasCorrespondingBaseZip()) {&#10; val relatedZips = arrayOf(zipFilePath, zipFilePath.correspondingBaseZip()!!)&#10; // Process related zips one after another to keep the ZipFile lifetime strictly bounded&#10; relatedZips.forEach { zip -&gt;&#10; ZipFile(zip).use { zipFile -&gt;&#10; processZipEntriesWithPool(zipFile, foundry, true)&#10; }&#10; }&#10; } else {&#10; ZipFile(zipFilePath).use { zipFile -&gt;&#10; processZipEntriesWithPool(zipFile, foundry, false)&#10; }&#10; }&#10; if (outputFormat == OutputFormat.KORAPXML) {&#10; morphoZipOutputStream!!.close()&#10; }&#10; logZipProgress(zipFilePath)&#10; }&#10;&#10; private fun processZipFileSequentially(zipFilePath: String, foundry: String = &quot;base&quot;) {&#10; val ord = zipOrdinals[zipFilePath] ?: 0&#10; val size = zipSizes[zipFilePath] ?: 0L&#10; LOGGER.info(&quot;Processing zip ${if (ord&gt;0) ord else &quot;?&quot;}/$totalZips: ${zipFilePath} (${humanBytes(size)}) in thread ${Thread.currentThread().threadId()}&quot;)&#10; if (zipFilePath.hasCorrespondingBaseZip()) {&#10; // Process the two related zips strictly sequentially to limit memory growth&#10; val zips = arrayOf(zipFilePath, zipFilePath.correspondingBaseZip()!!)&#10; zips.forEach { zip -&gt;&#10; ZipFile(zip).use { zipFile -&gt;&#10; // Iterate entries in a deterministic order to keep related files close together&#10; zipFile.stream()&#10; .filter { extractMetadataRegex.isNotEmpty() || !it.name.contains(&quot;header.xml&quot;) }&#10; .sorted(Comparator.comparing&lt;ZipEntry, String&gt; { it.name })&#10; .forEachOrdered { zipEntry -&gt;&#10; processZipEntry(zipFile, foundry, zipEntry, true)&#10; }&#10; }&#10; }&#10; } else {&#10; ZipFile(zipFilePath).use { zipFile -&gt;&#10; zipFile.stream()&#10; .filter { extractMetadataRegex.isNotEmpty() || !it.name.contains(&quot;header.xml&quot;) }&#10; .sorted(Comparator.comparing&lt;ZipEntry, String&gt; { it.name })&#10; .forEachOrdered { zipEntry -&gt;&#10; processZipEntry(zipFile, foundry, zipEntry, false)&#10; }&#10; }&#10; }&#10; logZipProgress(zipFilePath)&#10; }&#10;&#10; private fun logZipProgress(zipFilePath: String) {&#10; try {&#10; val size = zipSizes[zipFilePath] ?: 0L&#10; val done = processedZipBytes.addAndGet(size)&#10; val total = if (totalZipBytes &gt; 0) totalZipBytes else 1L&#10; val elapsedMs = (System.currentTimeMillis() - startTimeMillis).coerceAtLeast(1)&#10; val speedBytesPerSec = (done * 1000.0) / elapsedMs&#10; val remaining = (total - done).coerceAtLeast(0)&#10; val etaSeconds = if (speedBytesPerSec &gt; 0.0) (remaining / speedBytesPerSec).toLong() else -1L&#10; val ord = zipOrdinals[zipFilePath] ?: 0&#10; val pct = (done * 100.0 / total).coerceIn(0.0, 100.0)&#10; val humanSpeed = String.format(Locale.ROOT, &quot;%.2f MB/s&quot;, speedBytesPerSec / (1024.0 * 1024.0))&#10; val etaStr = if (etaSeconds &gt;= 0) formatDuration(etaSeconds) else &quot;unknown&quot;&#10; LOGGER.info(&#10; &quot;Finished zip ${if (ord&gt;0) ord else &quot;?&quot;}/$totalZips: ${zipFilePath} &quot; +&#10; &quot;(${humanBytes(size)}). Progress: ${String.format(Locale.ROOT, &quot;%.1f&quot;, pct)}%%, &quot; +&#10; &quot;ETA ${etaStr} at ${humanSpeed}&quot;&#10; )&#10; } catch (e: Exception) {&#10; LOGGER.fine(&quot;Failed to log zip progress for $zipFilePath: ${e.message}&quot;)&#10; }&#10; }&#10;&#10; private fun humanBytes(bytes: Long): String {&#10; if (bytes &lt; 1024) return &quot;$bytes B&quot;&#10; val kb = bytes / 1024.0&#10; if (kb &lt; 1024) return String.format(Locale.ROOT, &quot;%.1f KB&quot;, kb)&#10; val mb = kb / 1024.0&#10; if (mb &lt; 1024) return String.format(Locale.ROOT, &quot;%.1f MB&quot;, mb)&#10; val gb = mb / 1024.0&#10; return String.format(Locale.ROOT, &quot;%.1f GB&quot;, gb)&#10; }&#10;&#10; private fun formatDuration(seconds: Long): String {&#10; var s = seconds&#10; val h = s / 3600; s %= 3600&#10; val m = s / 60; val sec = s % 60&#10; return String.format(Locale.ROOT, &quot;%02d:%02d:%02d&quot;, h, m, sec)&#10; }&#10;&#10; private fun processZipEntriesWithPool(zipFile: ZipFile, foundry: String, waitForMorpho: Boolean) {&#10; // Collect entries first to avoid lazy evaluation surprises, filter header.xml unless metadata extraction is requested&#10; val entries: MutableList&lt;ZipEntry&gt; = ArrayList()&#10; val enumEntries = zipFile.entries()&#10; while (enumEntries.hasMoreElements()) {&#10; val e = enumEntries.nextElement()&#10; if (extractMetadataRegex.isEmpty() &amp;&amp; e.name.contains(&quot;header.xml&quot;)) continue&#10; entries.add(e)&#10; }&#10; if (entries.isEmpty()) return&#10;&#10; // If only one thread requested, do sequential to avoid pool overhead&#10; if (maxThreads &lt;= 1) {&#10; entries.forEach { entry -&gt; processZipEntry(zipFile, foundry, entry, waitForMorpho) }&#10; return&#10; }&#10;&#10; // Submit all entry tasks to the shared executor and await completion before closing the zip&#10; val latch = java.util.concurrent.CountDownLatch(entries.size)&#10; entries.forEach { entry -&gt;&#10; entryExecutor?.execute {&#10; try {&#10; processZipEntry(zipFile, foundry, entry, waitForMorpho)&#10; } catch (t: Throwable) {&#10; LOGGER.warning(&quot;Failed to process entry ${entry.name}: ${t.message}&quot;)&#10; } finally {&#10; latch.countDown()&#10; }&#10; }&#10; }&#10; try {&#10; latch.await()&#10; } catch (ie: InterruptedException) {&#10; Thread.currentThread().interrupt()&#10; }&#10; }&#10;&#10; fun processZipEntry(zipFile: ZipFile, _foundry: String, zipEntry: ZipEntry, passedWaitForMorpho: Boolean) {&#10; var foundry = _foundry&#10; var waitForMorpho = passedWaitForMorpho&#10; LOGGER.finer(&quot;Processing ${zipEntry.name} in thread ${Thread.currentThread().threadId()}&quot;)&#10; if (taggerName != null &amp;&amp; !taggerToolBridges.containsKey(Thread.currentThread().threadId())) {&#10; val tagger = AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER) as TaggerToolBridge?&#10; taggerToolBridges[Thread.currentThread().threadId()] = tagger&#10; if (tagger != null) {&#10; foundry = tagger.foundry&#10; }&#10;&#10; }&#10; if (parserName != null &amp;&amp; !parserToolBridges.containsKey(Thread.currentThread().threadId())) {&#10; val parser = AnnotationToolBridgeFactory.getAnnotationToolBridge(parserName!!, parserModel!!, LOGGER) as ParserToolBridge?&#10; parserToolBridges[Thread.currentThread().threadId()] = parser&#10; if (parser != null) {&#10; foundry = &quot;$foundry dependency:${parser.foundry}&quot;&#10; LOGGER.fine(&quot;Initialized parser ${parserName} with foundry $foundry in thread ${Thread.currentThread().threadId()}&quot;)&#10; }&#10; }&#10;&#10; try {&#10; if (zipEntry.name.matches(Regex(&quot;.*(data|tokens|structure|morpho)\\.xml$&quot;))) {&#10; // Ensure the entry stream and reader are closed to avoid native memory buildup&#10; val dbFactory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance()&#10; val dBuilder: DocumentBuilder = dbFactory.newDocumentBuilder()&#10; // In lemma-only mode, skip parsing data.xml entirely to reduce memory pressure&#10; if (lemmaOnly &amp;&amp; zipEntry.name.endsWith(&quot;data.xml&quot;)) {&#10; return&#10; }&#10; val doc: Document = try {&#10; zipFile.getInputStream(zipEntry).use { inputStream -&gt;&#10; XMLCommentFilterReader(inputStream, &quot;UTF-8&quot;).use { reader -&gt;&#10; dBuilder.parse(InputSource(reader))&#10; }&#10; }&#10; } catch (e: SAXParseException) {&#10; LOGGER.warning(&quot;Error parsing file: &quot; + zipEntry.name + &quot; &quot; + e.message)&#10; return&#10; }&#10;&#10; doc.documentElement.normalize()&#10; val docId: String = doc.documentElement.getAttribute(&quot;docid&quot;)&#10; if (siglePattern != null &amp;&amp; !Regex(siglePattern!!).containsMatchIn(docId)) {&#10; return&#10; }&#10; // LOGGER.info(&quot;Processing file: &quot; + zipEntry.getName())&#10; val fileName = zipEntry.name.replace(Regex(&quot;.*?/([^/]+\\.xml)$&quot;), &quot;$1&quot;)&#10; when (fileName) {&#10; &quot;data.xml&quot; -&gt; {&#10; if (!lemmaOnly) {&#10; val textsList: NodeList = doc.getElementsByTagName(&quot;text&quot;)&#10; if (textsList.length &gt; 0) {&#10; texts[docId] = NonBmpString(textsList.item(0).textContent)&#10; }&#10; }&#10; }&#10;&#10; &quot;structure.xml&quot; -&gt; {&#10; val spans: NodeList = doc.getElementsByTagName(&quot;span&quot;)&#10; if (extractAttributesRegex.isNotEmpty())&#10; extraFeatures[docId] = extractMiscSpans(spans)&#10; sentences[docId] = extractSentenceSpans(spans)&#10;&#10; }&#10;&#10; &quot;tokens.xml&quot; -&gt; {&#10; if (!fnames.contains(docId)) {&#10; fnames[docId] = zipEntry.name&#10; }&#10; val tokenSpans: NodeList = doc.getElementsByTagName(&quot;span&quot;)&#10; tokens[docId] = extractSpans(tokenSpans)&#10; }&#10;&#10; &quot;morpho.xml&quot; -&gt; {&#10; waitForMorpho = true&#10; fnames[docId] = zipEntry.name&#10; val fsSpans: NodeList = doc.getElementsByTagName(&quot;span&quot;)&#10; morpho[docId] = extractMorphoSpans(fsSpans)&#10; tokens[docId] = extractSpans(fsSpans)&#10; }&#10; }&#10;&#10; val morphoRequired = waitForMorpho || useLemma || taggerName != null || parserName != null || outputFormat == OutputFormat.KORAPXML&#10; // For lemma-only/lemma-based word2vec/now, we can proceed without full text&#10; val textRequired = when (outputFormat) {&#10; OutputFormat.WORD2VEC, OutputFormat.NOW -&gt; !(useLemma || lemmaOnly)&#10; else -&gt; true&#10; }&#10; if ((texts[docId] != null || !textRequired) &amp;&amp; sentences[docId] != null &amp;&amp; tokens[docId] != null&#10; &amp;&amp; (!morphoRequired || morpho[docId] != null)&#10; &amp;&amp; (extractMetadataRegex.isEmpty() || metadata[docId] != null)&#10; ) {&#10; // Be quiet on INFO; per-text logs only on FINE and below&#10; LOGGER.fine(&quot;Processing text: $docId in thread ${Thread.currentThread().threadId()}&quot;)&#10; processText(docId, foundry)&#10; }&#10; } else if (extractMetadataRegex.isNotEmpty() &amp;&amp; zipEntry.name.matches(Regex(&quot;.*/header\\.xml$&quot;))) {&#10; //LOGGER.info(&quot;Processing header file: &quot; + zipEntry.name)&#10; val text = zipFile.getInputStream(zipEntry).bufferedReader().use { it.readText() }&#10; val docId =&#10; Regex(&quot;&lt;textSigle&gt;([^&lt;]+)&lt;/textSigle&gt;&quot;).find(text)?.destructured?.component1()&#10; ?.replace(Regex(&quot;/&quot;), &quot;_&quot;)&#10; LOGGER.fine(&quot;Processing header file: &quot; + zipEntry.name + &quot; docId: &quot; + docId)&#10; val meta = ArrayList&lt;String&gt;()&#10; extractMetadataRegex.forEach { regex -&gt;&#10; val match = Regex(regex).find(text)&#10; if (match != null) {&#10; meta.add(match.destructured.component1())&#10; }&#10; }&#10; if (meta.isNotEmpty() &amp;&amp; docId != null) {&#10; metadata[docId] = meta.toTypedArray()&#10; val morphoRequired = waitForMorpho || useLemma || taggerName != null || parserName != null || outputFormat == OutputFormat.KORAPXML&#10; val textRequired = when (outputFormat) {&#10; OutputFormat.WORD2VEC, OutputFormat.NOW -&gt; !(useLemma || lemmaOnly)&#10; else -&gt; true&#10; }&#10; if ((texts[docId] != null || !textRequired) &amp;&amp; sentences[docId] != null &amp;&amp; tokens[docId] != null&#10; &amp;&amp; (!morphoRequired || morpho[docId] != null)&#10; ) {&#10; // Be quiet on INFO; per-text logs only on FINE and below&#10; LOGGER.fine(&quot;Processing text (meta-ready): $docId in thread ${Thread.currentThread().threadId()}&quot;)&#10; processText(docId, foundry)&#10; }&#10; }&#10; }&#10; } catch (e: Exception) {&#10; e.printStackTrace()&#10; }&#10; }&#10;&#10; private fun processText(&#10; docId: String,&#10; foundry: String,&#10; ) {&#10; LOGGER.fine(&quot;Processing text: $docId in thread ${Thread.currentThread().threadId()}&quot;)&#10; var morphoFoundry = getMorphoFoundry()&#10; val output =&#10; if (outputFormat == OutputFormat.WORD2VEC) {&#10; lmTrainingOutput(docId)&#10; } else if (outputFormat == OutputFormat.NOW) {&#10; nowOutput(docId)&#10; } else {&#10; if (taggerToolBridges[Thread.currentThread().threadId()] != null) {&#10; morpho[docId] = taggerToolBridges[Thread.currentThread().threadId()]!!.tagText(&#10; tokens[docId]!!,&#10; sentences[docId],&#10; texts[docId]!!&#10; )&#10;&#10; }&#10; if (parserToolBridges[Thread.currentThread().threadId()] != null) {&#10; if (morpho[docId] == null) {&#10; LOGGER.severe(&quot;No morpho data for $docId&quot;)&#10; //exitProcess(1)&#10; }&#10; LOGGER.finer(&quot;Parsing text: $docId in thread ${Thread.currentThread().threadId()}&quot;)&#10; morpho[docId] = parserToolBridges[Thread.currentThread().threadId()]!!.parseText(&#10; tokens[docId]!!,&#10; morpho[docId],&#10; sentences[docId],&#10; texts[docId]!!&#10; )&#10; LOGGER.finer(&quot;Parsed text: $docId in thread ${Thread.currentThread().threadId()}&quot;)&#10; }&#10; if (outputFormat == OutputFormat.KORAPXML &amp;&amp; annotationWorkerPool == null) {&#10; korapXmlOutput(getMorphoFoundry(), docId)&#10; } else {&#10; conlluOutput(foundry, docId)&#10; }&#10; }&#10;&#10; if (annotationWorkerPool != null) {&#10; annotationWorkerPool?.pushToQueue(output.append(&quot;\n# eot\n&quot;).toString())&#10; // Release internal char[] early&#10; output.setLength(0)&#10; } else if (outputFormat != OutputFormat.KORAPXML) {&#10; synchronized(System.out) {&#10; println(output.toString())&#10; }&#10; // Release internal char[] early&#10; output.setLength(0)&#10; } else {&#10; korapXmlOutput(foundry, docId)&#10; }&#10;&#10;&#10; arrayOf(tokens, texts, sentences, morpho, fnames, metadata, extraFeatures).forEach { map -&gt;&#10; if (map === morpho) {&#10; // Clear inner map to release references early&#10; morpho[docId]?.clear()&#10; }&#10; map.remove(docId)&#10; }&#10;&#10; // Periodic GC hint after processing many docs (lightweight safeguard)&#10; if ((processedDocs.incrementAndGet() % 2000) == 0) {&#10; LOGGER.fine(&quot;Processed ${processedDocs.get()} docs – requesting GC hint&quot;)&#10; System.gc()&#10; }&#10; // Memory / cache statistics logging&#10; if (memStatsInterval &gt; 0) {&#10; val count = processedDocs.get()&#10; if (count % memStatsInterval == 0) {&#10; logMemoryStats(count)&#10; }&#10; }&#10;&#10; if (outputFormat == OutputFormat.KORAPXML) {&#10; val entryPath = if (parserName != null) docId.replace(Regex(&quot;[_.]&quot;), &quot;/&quot;).plus(&quot;/$parserName/&quot;).plus(&quot;dependency.xml&quot;)&#10; else&#10; docId.replace(Regex(&quot;[_.]&quot;), &quot;/&quot;).plus(&quot;/$morphoFoundry/&quot;).plus(&quot;morpho.xml&quot;)&#10; val zipEntry = ZipArchiveEntry(entryPath)&#10; zipEntry.unixMode = ZIP_ENTRY_UNIX_MODE&#10; synchronized(morphoZipOutputStream!!) {&#10; morphoZipOutputStream!!.putArchiveEntry(zipEntry)&#10; morphoZipOutputStream!!.write(output.toString().toByteArray())&#10; morphoZipOutputStream!!.closeArchiveEntry()&#10; }&#10; output.clear()&#10; }&#10; }&#10;&#10; private fun getMorphoFoundry() = taggerToolBridges[Thread.currentThread().threadId()]?.foundry ?: &quot;base&quot;&#10;&#10; private fun logMemoryStats(count: Int) {&#10; try {&#10; val rt = Runtime.getRuntime()&#10; val used = (rt.totalMemory() - rt.freeMemory()) / (1024 * 1024)&#10; val total = rt.totalMemory() / (1024 * 1024)&#10; val max = rt.maxMemory() / (1024 * 1024)&#10; LOGGER.info(&#10; &quot;MEM-STATS docs=${count} usedMB=${used} totalMB=${total} maxMB=${max} &quot; +&#10; &quot;maps{texts=${texts.size},tokens=${tokens.size},sentences=${sentences.size},morpho=${morpho.size}}&quot;&#10; )&#10; } catch (e: Exception) {&#10; LOGGER.warning(&quot;Failed to log memory stats: ${e.message}&quot;)&#10; }&#10; }&#10;&#10; private fun korapXmlDependencyOutput(foundry: String, docId: String): StringBuilder {&#10; val doc: Document = dBuilder!!.newDocument()&#10;&#10; // Root element&#10; val layer = doc.createElement(&quot;layer&quot;)&#10; layer.setAttribute(&quot;xmlns&quot;, &quot;http://ids-mannheim.de/ns/KorAP&quot;)&#10; layer.setAttribute(&quot;version&quot;, &quot;KorAP-0.4&quot;)&#10; layer.setAttribute(&quot;docid&quot;, docId)&#10; doc.appendChild(layer)&#10;&#10; val spanList = doc.createElement(&quot;spanList&quot;)&#10; layer.appendChild(spanList)&#10;&#10; var i = 0&#10; var s = 0&#10; var n = 0&#10; val sortedKeys = morpho[docId]?.keys?.sortedBy { it.split(&quot;-&quot;)[0].toInt() }&#10;&#10; sortedKeys?.forEach { spanString -&gt;&#10; val mfs = morpho[docId]?.get(spanString)&#10; val offsets = spanString.split(&quot;-&quot;)&#10; if(offsets.size != 2) {&#10; LOGGER.warning(&quot;Invalid span: $spanString in $docId&quot;)&#10; return@forEach&#10; }&#10; if (offsets[0].toInt() &gt; sentences[docId]!!.elementAt(s).to) {&#10; s++&#10; n = i&#10; }&#10; i++&#10; if (mfs!!.deprel == &quot;_&quot;) {&#10; return@forEach&#10; }&#10;&#10; val spanNode = doc.createElement(&quot;span&quot;)&#10; spanNode.setAttribute(&quot;id&quot;, &quot;s${s + 1}_n${i - n}&quot;)&#10; spanNode.setAttribute(&quot;from&quot;, offsets[0])&#10; spanNode.setAttribute(&quot;to&quot;, offsets[1])&#10;&#10; // rel element&#10; val rel = doc.createElement(&quot;rel&quot;)&#10; rel.setAttribute(&quot;label&quot;, mfs.deprel)&#10;&#10; // inner span element&#10; val innerSpan = doc.createElement(&quot;span&quot;)&#10; val headInt = if(mfs.head == &quot;_&quot;) 0 else parseInt(mfs.head) - 1&#10; if (headInt &lt; 0) {&#10; innerSpan.setAttribute(&quot;from&quot;, sentences[docId]!!.elementAt(s).from.toString())&#10; innerSpan.setAttribute(&quot;to&quot;, sentences[docId]!!.elementAt(s).to.toString())&#10; } else {&#10; if (headInt + n &gt;= morpho[docId]!!.size) {&#10; LOGGER.warning(&quot;Head index out of bounds: ${headInt+n} &gt;= ${morpho[docId]!!.size} in $docId&quot;)&#10; return@forEach&#10; } else {&#10; val destSpanString = sortedKeys.elementAt(headInt + n)&#10; val destOffsets = destSpanString.split(&quot;-&quot;)&#10; innerSpan.setAttribute(&quot;from&quot;, destOffsets[0])&#10; innerSpan.setAttribute(&quot;to&quot;, destOffsets[1])&#10; }&#10; }&#10; rel.appendChild(innerSpan)&#10; spanNode.appendChild(rel)&#10; spanList.appendChild(spanNode)&#10; }&#10; val transformerFactory = TransformerFactory.newInstance()&#10; val transformer = transformerFactory.newTransformer()&#10; transformer.setOutputProperty(OutputKeys.INDENT, &quot;yes&quot;)&#10; transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, &quot;no&quot;)&#10; transformer.setOutputProperty(&quot;{http://xml.apache.org/xslt}indent-amount&quot;, &quot;1&quot;)&#10; val domSource = DOMSource(doc)&#10; val streamResult = StreamResult(StringWriter())&#10; transformer.transform(domSource, streamResult)&#10;&#10; return StringBuilder(streamResult.writer.toString())&#10; }&#10;&#10; private fun korapXmlOutput(foundry: String, docId: String): StringBuilder {&#10; return if (parserName != null) {&#10; korapXmlDependencyOutput(foundry, docId)&#10; } else {&#10; korapXmlMorphoOutput(foundry, docId)&#10; }&#10; }&#10;&#10; private fun korapXmlMorphoOutput(foundry: String, docId: String): StringBuilder {&#10; val doc: Document = dBuilder!!.newDocument()&#10;&#10; // Root element&#10; val layer = doc.createElement(&quot;layer&quot;)&#10; layer.setAttribute(&quot;xmlns&quot;, &quot;http://ids-mannheim.de/ns/KorAP&quot;)&#10; layer.setAttribute(&quot;version&quot;, &quot;KorAP-0.4&quot;)&#10; layer.setAttribute(&quot;docid&quot;, docId)&#10; doc.appendChild(layer)&#10;&#10; val spanList = doc.createElement(&quot;spanList&quot;)&#10; layer.appendChild(spanList)&#10;&#10; var i = 0&#10; morpho[docId]?.forEach { (spanString, mfs) -&gt;&#10; i++&#10; val offsets = spanString.split(&quot;-&quot;)&#10; val spanNode = doc.createElement(&quot;span&quot;)&#10; spanNode.setAttribute(&quot;id&quot;, &quot;t_$i&quot;)&#10; spanNode.setAttribute(&quot;from&quot;, offsets[0])&#10; spanNode.setAttribute(&quot;to&quot;, offsets[1])&#10;&#10; // fs element&#10; val fs = doc.createElement(&quot;fs&quot;)&#10; fs.setAttribute(&quot;type&quot;, &quot;lex&quot;)&#10; fs.setAttribute(&quot;xmlns&quot;, &quot;http://www.tei-c.org/ns/1.0&quot;)&#10; spanNode.appendChild(fs)&#10; val f = doc.createElement(&quot;f&quot;)&#10; f.setAttribute(&quot;name&quot;, &quot;lex&quot;)&#10; fs.appendChild(f)&#10;&#10; // Inner fs element&#10; val innerFs = doc.createElement(&quot;fs&quot;)&#10; f.appendChild(innerFs)&#10;&#10; if (mfs.lemma != &quot;_&quot;) {&#10; val innerF = doc.createElement(&quot;f&quot;)&#10; innerF.setAttribute(&quot;name&quot;, &quot;lemma&quot;)&#10; innerF.textContent = mfs.lemma&#10; innerFs.appendChild(innerF)&#10; }&#10; if (mfs.upos != &quot;_&quot;) {&#10; val innerF = doc.createElement(&quot;f&quot;)&#10; innerF.setAttribute(&quot;name&quot;, &quot;upos&quot;)&#10; innerF.textContent = mfs.upos&#10; innerFs.appendChild(innerF)&#10; }&#10; if (mfs.xpos != &quot;_&quot;) {&#10; val innerF = doc.createElement(&quot;f&quot;)&#10; innerF.setAttribute(&quot;name&quot;, &quot;pos&quot;)&#10; innerF.textContent = mfs.xpos&#10; innerFs.appendChild(innerF)&#10; }&#10; if (mfs.feats != &quot;_&quot;) {&#10; val innerF = doc.createElement(&quot;f&quot;)&#10; innerF.setAttribute(&quot;name&quot;, &quot;msd&quot;)&#10; innerF.textContent = mfs.feats&#10; innerFs.appendChild(innerF)&#10; }&#10; if (mfs.misc != &quot;_&quot; &amp;&amp; mfs.misc!!.matches(Regex(&quot;^[0-9.]+$&quot;))) {&#10; val innerF = doc.createElement(&quot;f&quot;)&#10; innerF.setAttribute(&quot;name&quot;, &quot;certainty&quot;)&#10; innerF.textContent = mfs.misc&#10; innerFs.appendChild(innerF)&#10; }&#10;&#10; spanList.appendChild(spanNode)&#10; }&#10; val transformerFactory = TransformerFactory.newInstance()&#10; val transformer = transformerFactory.newTransformer()&#10; transformer.setOutputProperty(OutputKeys.INDENT, &quot;yes&quot;)&#10; transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, &quot;no&quot;)&#10; transformer.setOutputProperty(&quot;{http://xml.apache.org/xslt}indent-amount&quot;, &quot;1&quot;)&#10; val domSource = DOMSource(doc)&#10; val streamResult = StreamResult(StringWriter())&#10; transformer.transform(domSource, streamResult)&#10;&#10; return StringBuilder(streamResult.writer.toString())&#10;&#10; }&#10;&#10; private fun conlluOutput(foundry: String, docId: String): StringBuilder {&#10; var token_index = 0&#10; var real_token_index = 0&#10; var sentence_index = 0&#10; val output: StringBuilder&#10; output =&#10; StringBuilder(&quot;# foundry = $foundry\n# filename = ${fnames[docId]}\n# text_id = $docId\n&quot;).append(&#10; tokenOffsetsInSentence(&#10; sentences, docId, sentence_index, real_token_index, tokens&#10; )&#10; )&#10; if (extractMetadataRegex.isNotEmpty()) {&#10; output.append(metadata[docId]?.joinToString(&quot;\t&quot;, prefix = &quot;# metadata=&quot;, postfix = &quot;\n&quot;) ?: &quot;&quot;)&#10; }&#10; var previousSpanStart = 0&#10; tokens[docId]?.forEach { span -&gt;&#10; token_index++&#10; if (sentence_index &gt;= sentences[docId]!!.size || span.from &gt;= sentences[docId]!![sentence_index].to) {&#10; output.append(&quot;\n&quot;)&#10; sentence_index++&#10; token_index = 1&#10; output.append(&#10; tokenOffsetsInSentence(&#10; sentences, docId, sentence_index, real_token_index, tokens&#10; )&#10; )&#10; }&#10; if (extractAttributesRegex.isNotEmpty() &amp;&amp; extraFeatures[docId] != null) {&#10; for (i in previousSpanStart until span.from + 1) {&#10; if (extraFeatures[docId]?.containsKey(&quot;$i&quot;) == true) {&#10; output.append(extraFeatures[docId]!![&quot;$i&quot;])&#10; extraFeatures[docId]!!.remove(&quot;$i&quot;)&#10; }&#10; }&#10; previousSpanStart = span.from + 1&#10; }&#10; if (morpho[docId]?.containsKey(&quot;${span.from}-${span.to}&quot;) == true) {&#10; val mfs = morpho[docId]!![&quot;${span.from}-${span.to}&quot;]&#10; if (span.to &gt; texts[docId]!!.length) {&#10; span.to = texts[docId]!!.length&#10; LOGGER.warning(&#10; &quot;Offset error: could not retrieve token at ${span.from}-${span.to} – ending with: ${&#10; texts[docId]!!.substring(&#10; span.from,&#10; span.to&#10; )&#10; }&quot;&#10; )&#10; }&#10; output.append(&#10; printConlluToken(&#10; token_index,&#10; texts[docId]!!.substring(span.from, span.to),&#10; mfs!!.lemma!!,&#10; mfs.upos!!,&#10; mfs.xpos!!,&#10; mfs.feats!!,&#10; mfs.head!!,&#10; mfs.deprel!!,&#10; mfs.deps!!,&#10; mfs.misc!!,&#10; columns&#10; )&#10; )&#10; } else {&#10; output.append(&#10; printConlluToken(&#10; token_index, texts[docId]!!.substring(span.from, span.to), columns = columns&#10; )&#10; )&#10; }&#10; real_token_index++&#10; }&#10; return output&#10; }&#10;&#10; private fun lmTrainingOutput(docId: String): StringBuilder {&#10; var token_index = 0&#10; var real_token_index = 0&#10; var sentence_index = 0&#10; val output: StringBuilder&#10; output = StringBuilder()&#10; if (extractMetadataRegex.isNotEmpty()) {&#10; output.append(metadata[docId]?.joinToString(&quot;\t&quot;, postfix = &quot;\t&quot;) ?: &quot;&quot;)&#10; }&#10; // If no text is available (e.g., lemma-only mode), emit lemmas&#10; if (texts[docId] == null) {&#10; tokens[docId]?.forEach { span -&gt;&#10; val key = &quot;${span.from}-${span.to}&quot;&#10; val lemmaVal = morpho[docId]?.get(key)?.lemma&#10; output.append((lemmaVal?.takeIf { it != &quot;_&quot; } ?: &quot;_&quot;), &quot; &quot;)&#10; }&#10; if (output.isNotEmpty()) output.deleteCharAt(output.length - 1)&#10; return output&#10; }&#10; tokens[docId]?.forEach { span -&gt;&#10; token_index++&#10; if (sentences[docId] != null &amp;&amp; (sentence_index &gt;= sentences[docId]!!.size || span.from &gt;= sentences[docId]!![sentence_index].to)) {&#10; if (output.isNotEmpty()) {&#10; output.setCharAt(output.length - 1, '\n')&#10; } else {&#10; output.append(&quot;\n&quot;)&#10; }&#10; if (extractMetadataRegex.isNotEmpty() &amp;&amp; real_token_index &lt; tokens[docId]!!.size - 1) {&#10; output.append(metadata[docId]?.joinToString(&quot;\t&quot;, postfix = &quot;\t&quot;) ?: &quot;&quot;)&#10; }&#10; sentence_index++&#10; }&#10; // Bounds safety&#10; val safeFrom = span.from.coerceIn(0, texts[docId]!!.length)&#10; val safeTo = span.to.coerceIn(safeFrom, texts[docId]!!.length)&#10; if (useLemma &amp;&amp; morpho[docId] != null) {&#10; val key = &quot;${span.from}-${span.to}&quot;&#10; val lemmaVal = morpho[docId]!![key]?.lemma&#10; if (lemmaVal != null &amp;&amp; lemmaVal != &quot;_&quot;) {&#10; output.append(lemmaVal)&#10; output.append(' ')&#10; } else {&#10; texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)&#10; output.append(' ')&#10; }&#10; } else {&#10; texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)&#10; output.append(' ')&#10; }&#10; real_token_index++&#10; }&#10; if (output.isNotEmpty()) {&#10; output.deleteCharAt(output.length - 1)&#10; }&#10; return output&#10; }&#10;&#10; private fun nowOutput(docId: String): StringBuilder {&#10; var token_index = 0&#10; var real_token_index = 0&#10; var sentence_index = 0&#10; val output: StringBuilder = StringBuilder()&#10; &#10; // Add the text sigle prefix&#10; output.append(&quot;@@$docId &quot;)&#10; &#10; if (texts[docId] == null) {&#10; // Lemma-only fallback when original text is not loaded&#10; tokens[docId]?.forEach { span -&gt;&#10; if (sentences[docId] != null &amp;&amp; (sentence_index &gt;= sentences[docId]!!.size || span.from &gt;= sentences[docId]!![sentence_index].to)) {&#10; if (output.isNotEmpty() &amp;&amp; !output.endsWith(&quot;@@$docId &quot;)) {&#10; output.append(&quot; &lt;p&gt; &quot;)&#10; }&#10; sentence_index++&#10; }&#10; val key = &quot;${span.from}-${span.to}&quot;&#10; val lemmaVal = morpho[docId]?.get(key)?.lemma&#10; output.append((lemmaVal?.takeIf { it != &quot;_&quot; } ?: &quot;_&quot;), &quot; &quot;)&#10; }&#10; if (output.isNotEmpty() &amp;&amp; output.endsWith(&quot; &quot;)) {&#10; output.deleteCharAt(output.length - 1)&#10; }&#10; return output&#10; }&#10; &#10; tokens[docId]?.forEach { span -&gt;&#10; token_index++&#10; if (sentences[docId] != null &amp;&amp; (sentence_index &gt;= sentences[docId]!!.size || span.from &gt;= sentences[docId]!![sentence_index].to)) {&#10; // Replace sentence end with &lt;p&gt; tag instead of newline&#10; if (output.isNotEmpty() &amp;&amp; !output.endsWith(&quot;@@$docId &quot;)) {&#10; output.append(&quot; &lt;p&gt; &quot;)&#10; }&#10; sentence_index++&#10; }&#10; // Bounds safety&#10; val safeFrom = span.from.coerceIn(0, texts[docId]!!.length)&#10; val safeTo = span.to.coerceIn(safeFrom, texts[docId]!!.length)&#10; if (useLemma &amp;&amp; morpho[docId] != null) {&#10; val key = &quot;${span.from}-${span.to}&quot;&#10; val lemmaVal = morpho[docId]!![key]?.lemma&#10; if (lemmaVal != null &amp;&amp; lemmaVal != &quot;_&quot;) {&#10; output.append(lemmaVal)&#10; output.append(' ')&#10; } else {&#10; texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)&#10; output.append(' ')&#10; }&#10; } else {&#10; texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)&#10; output.append(' ')&#10; }&#10; real_token_index++&#10; }&#10; &#10; // Remove trailing space and add final newline&#10; if (output.isNotEmpty() &amp;&amp; output.endsWith(&quot; &quot;)) {&#10; output.deleteCharAt(output.length - 1)&#10; }&#10; &#10; return output&#10; }&#10;&#10;&#10; private fun printConlluToken(&#10; token_index: Int,&#10; token: String,&#10; lemma: String = &quot;_&quot;,&#10; upos: String = &quot;_&quot;,&#10; xpos: String = &quot;_&quot;,&#10; feats: String = &quot;_&quot;,&#10; head: String = &quot;_&quot;,&#10; deprel: String = &quot;_&quot;,&#10; deps: String = &quot;_&quot;,&#10; misc: String = &quot;_&quot;,&#10; columns: Int = 10&#10; ): String {&#10; val myUpos = if (COMPATIBILITY_MODE &amp;&amp; upos == &quot;_&quot;) xpos else upos&#10; return when (columns) {&#10; 1 -&gt; (&quot;$token\n&quot;)&#10; 10 -&gt; (&quot;$token_index\t$token\t$lemma\t$myUpos\t$xpos\t$feats\t$head\t$deprel\t$deps\t$misc$tokenSeparator&quot;)&#10; else -&gt; {&#10; val fields = listOf(&#10; token_index.toString(), token, lemma, myUpos, xpos, feats, head, deprel, deps, misc&#10; )&#10; fields.subList(0, min(columns, 10)).joinToString(&quot;\t&quot;, postfix = tokenSeparator)&#10; }&#10; }&#10; }&#10;&#10; private fun tokenOffsetsInSentence(&#10; sentences: ConcurrentHashMap&lt;String, Array&lt;Span&gt;&gt;,&#10; docId: String,&#10; sentence_index: Int,&#10; token_index: Int,&#10; tokens: ConcurrentHashMap&lt;String, Array&lt;Span&gt;&gt;&#10; ): String {&#10; if (sentences[docId] == null || sentences[docId]!!.size &lt;= sentence_index) {&#10; return &quot;&quot;&#10; }&#10; val sentenceEndOffset = sentences[docId]!![sentence_index].to&#10; var i = token_index&#10; val start_offsets_string = StringBuilder()&#10; val end_offsets_string = StringBuilder()&#10; while (tokens[docId] != null &amp;&amp; i &lt; tokens[docId]!!.size &amp;&amp; tokens[docId]!![i].to &lt;= sentenceEndOffset) {&#10; start_offsets_string.append(&quot; &quot;, tokens[docId]!![i].from)&#10; end_offsets_string.append(&quot; &quot;, tokens[docId]!![i].to)&#10; i++&#10; }&#10; return (&#10; StringBuilder() .append(&#10; &quot;# start_offsets = &quot;, tokens[docId]!![token_index].from, start_offsets_string, &quot;\n&quot;,&#10; &quot;# end_offsets = &quot;, sentenceEndOffset, end_offsets_string, &quot;\n&quot;&#10; ).toString())&#10; }&#10;&#10; private fun extractSpans(spans: NodeList): Array&lt;Span&gt; {&#10; val list = ArrayList&lt;Span&gt;()&#10; IntStream.range(0, spans.length).forEach { idx -&gt;&#10; val node = spans.item(idx)&#10; if (node is Element) {&#10; val fromAttr = node.getAttribute(&quot;from&quot;)&#10; val toAttr = node.getAttribute(&quot;to&quot;)&#10; if (fromAttr.isNullOrEmpty() || toAttr.isNullOrEmpty()) {&#10; LOGGER.warning(&quot;Skipping span with empty from/to attribute: from='$fromAttr' to='$toAttr'&quot;)&#10; } else {&#10; try {&#10; val from = Integer.parseInt(fromAttr)&#10; val to = Integer.parseInt(toAttr)&#10; list.add(Span(from, to))&#10; } catch (e: NumberFormatException) {&#10; LOGGER.warning(&quot;Skipping span with invalid numeric offsets: from='$fromAttr' to='$toAttr' : ${e.message}&quot;)&#10; }&#10; }&#10; }&#10; }&#10; return list.toTypedArray()&#10; }&#10;&#10; private fun extractMorphoSpans(&#10; fsSpans: NodeList&#10; ): MutableMap&lt;String, MorphoSpan&gt; {&#10; val UNKNOWN = Regex(&quot;(UNKNOWN|&lt;unknown&gt;)&quot;)&#10; val res: MutableMap&lt;String, MorphoSpan&gt; = HashMap()&#10; IntStream.range(0, fsSpans.length).mapToObj(fsSpans::item).filter { node -&gt; node is Element &amp;&amp; node.getAttribute(&quot;type&quot;) != &quot;alt&quot; }.forEach { node -&gt;&#10; val features = (node as Element).getElementsByTagName(&quot;f&quot;)&#10; val fs = MorphoSpan()&#10; val fromTo = &quot;${node.getAttribute(&quot;from&quot;)}-${node.getAttribute(&quot;to&quot;)}&quot;&#10; IntStream.range(0, features.length).mapToObj(features::item).forEach { feature -&gt;&#10; val attr = (feature as Element).getAttribute(&quot;name&quot;)&#10; val value = feature.textContent.trim()&#10; if (value.isEmpty()) return@forEach&#10; when (attr) {&#10; &quot;lemma&quot; -&gt; if(fs.lemma == &quot;_&quot;) fs.lemma = value.replace(UNKNOWN, &quot;--&quot;)&#10; &quot;upos&quot; -&gt; fs.upos = value&#10; &quot;xpos&quot;, &quot;ctag&quot;, &quot;pos&quot; -&gt; if(fs.xpos == &quot;_&quot;) fs.xpos = value.replace(UNKNOWN, &quot;--&quot;)&#10; &quot;feats&quot;, &quot;msd&quot; -&gt; if(fs.feats == &quot;_&quot; ) fs.feats = value&#10; &quot;type&quot; -&gt; if(fs.feats == &quot;_&quot;) fs.feats = feature.getElementsByTagName(&quot;symbol&quot;).item(0).attributes.getNamedItem(&quot;value&quot;).textContent.trim()&#10; // &quot;subtype&quot; -&gt; if(fs.feats == &quot;_&quot;) fs.feats += &quot;:&quot; + feature.getElementsByTagName(&quot;symbol&quot;).item(0).attributes.getNamedItem(&quot;value&quot;).textContent&#10; &quot;certainty&quot; -&gt; if(fs.misc == &quot;_&quot;) fs.misc = value&#10; }&#10; }&#10; res[fromTo] = fs&#10; }&#10; return res&#10; }&#10;&#10; private fun extractSentenceSpans(spans: NodeList): Array&lt;Span&gt; {&#10; return IntStream.range(0, spans.length).mapToObj(spans::item)&#10; .filter { node -&gt; node is Element &amp;&amp; node.getElementsByTagName(&quot;f&quot;).item(0).textContent.equals(&quot;s&quot;) }&#10; .map { node -&gt;&#10; Span(&#10; Integer.parseInt((node as Element).getAttribute(&quot;from&quot;)), Integer.parseInt(node.getAttribute(&quot;to&quot;))&#10; )&#10; }.toArray { size -&gt; arrayOfNulls(size) }&#10; }&#10;&#10; /*&#10; &lt;span id=&quot;s15&quot; from=&quot;370&quot; to=&quot;394&quot; l=&quot;5&quot;&gt;&#10; &lt;fs type=&quot;struct&quot; xmlns=&quot;http://www.tei-c.org/ns/1.0&quot;&gt;&#10; &lt;f name=&quot;name&quot;&gt;posting&lt;/f&gt;&#10; &lt;f name=&quot;attr&quot;&gt;&#10; &lt;fs type=&quot;attr&quot;&gt;&#10; &lt;f name=&quot;id&quot;&gt;i.10894_1_3&lt;/f&gt;&#10; &lt;f name=&quot;indentLevel&quot;&gt;0&lt;/f&gt;&#10; &lt;f name=&quot;who&quot;&gt;WU00000000&lt;/f&gt;&#10; &lt;/fs&gt;&#10; &lt;/f&gt;&#10; &lt;/fs&gt;&#10; &lt;/span&gt;&#10;&#10; */&#10; private fun extractMiscSpans(spans: NodeList): MutableMap&lt;String, String&gt; {&#10; val miscLocal: MutableMap&lt;String, String&gt; = HashMap()&#10;&#10; IntStream.range(0, spans.length).mapToObj(spans::item)&#10; .filter { node -&gt;&#10; node is Element&#10; &amp;&amp; node.getElementsByTagName(&quot;f&quot;).length &gt; 1&#10; &amp;&amp; (node.getElementsByTagName(&quot;f&quot;).item(0) as Element).getAttribute(&quot;name&quot;).equals(&quot;name&quot;)&#10; &amp;&amp; (node.getElementsByTagName(&quot;f&quot;).item(1) as Element).getAttribute(&quot;name&quot;).equals(&quot;attr&quot;)&#10; }&#10; .forEach { node -&gt;&#10; if (node == null) return@forEach&#10; val elementName = (node as Element).getElementsByTagName(&quot;f&quot;).item(0).textContent.trim()&#10; val from = node.getAttribute(&quot;from&quot;)&#10; val attributes = (node.getElementsByTagName(&quot;f&quot;).item(1) as Element).getElementsByTagName(&quot;f&quot;)&#10; val res = StringBuilder()&#10; IntStream.range(0, attributes.length).mapToObj(attributes::item).forEach { attr -&gt;&#10; val attrName = &quot;$elementName/${(attr as Element).getAttribute(&quot;name&quot;)}&quot;&#10; if (attrName.matches(Regex(extractAttributesRegex))) {&#10; res.append(&quot;# $attrName = ${attr.textContent}\n&quot;)&#10; //LOGGER.info(&quot;&quot; + from + &quot;: $attrName = &quot; + attr.textContent)&#10; }&#10;&#10; }&#10; if (res.isNotEmpty()) {&#10; if (miscLocal.containsKey(from)) {&#10; // LOGGER.info(&quot;ADDING TO $from: ${miscLocal[from]}&quot;)&#10; miscLocal[from] += res.toString()&#10; } else {&#10; miscLocal[from] = res.toString()&#10; }&#10; }&#10; }&#10; return miscLocal&#10; }&#10;&#10;&#10; class Span(var from: Int, var to: Int)&#10;&#10; class MorphoSpan(&#10; var lemma: String? = &quot;_&quot;,&#10; var upos: String? = &quot;_&quot;,&#10; var xpos: String? = &quot;_&quot;,&#10; var feats: String? = &quot;_&quot;,&#10; var head: String? = &quot;_&quot;,&#10; var deprel: String? = &quot;_&quot;,&#10; var deps: String? = &quot;_&quot;,&#10; var misc: String? = &quot;_&quot;&#10; )&#10;&#10;}&#10;&#10;fun main(args: Array&lt;String&gt;): Unit = exitProcess(CommandLine(KorapXmlTool()).execute(*args))&#10;&#10;fun debug(args: Array&lt;String&gt;): Int {&#10; return (CommandLine(KorapXmlTool()).execute(*args))&#10;}&#10;&#10;enum class OutputFormat {&#10; CONLLU, WORD2VEC, KORAPXML, NOW&#10;}&#10;&#10;object ConlluOutputFormat {&#10; const val NAME = &quot;conllu&quot;&#10;}&#10;&#10;object Word2VecOutputFormat {&#10; const val NAME = &quot;word2vec&quot;&#10;}&#10;&#10;object KorapXmlOutputFormat {&#10; const val NAME = &quot;korapxml&quot;&#10;}&#10;&#10;object NowOutputFormat {&#10; const val NAME = &quot;now&quot;&#10;}&#10;" />
<option name="updatedContent" value="package de.ids_mannheim.korapxmltools&#10;&#10;import de.ids_mannheim.korapxmltools.AnnotationToolBridgeFactory.Companion.parserFoundries&#10;import de.ids_mannheim.korapxmltools.AnnotationToolBridgeFactory.Companion.taggerFoundries&#10;import org.apache.commons.compress.archivers.zip.Zip64Mode&#10;import org.apache.commons.compress.archivers.zip.ZipArchiveEntry&#10;import org.w3c.dom.Document&#10;import org.w3c.dom.Element&#10;import org.w3c.dom.NodeList&#10;import org.xml.sax.InputSource&#10;import org.xml.sax.SAXParseException&#10;import picocli.CommandLine&#10;import picocli.CommandLine.*&#10;import java.io.File&#10;import java.io.FileOutputStream&#10;import java.io.InputStream&#10;import java.io.StringWriter&#10;import java.lang.Integer.parseInt&#10;import java.util.*&#10;import java.util.concurrent.Callable&#10;import java.util.concurrent.ConcurrentHashMap&#10;import java.util.concurrent.Executors&#10;import java.util.concurrent.atomic.AtomicLong&#10;import java.util.logging.ConsoleHandler&#10;import java.util.logging.Level&#10;import java.util.logging.LogManager&#10;import java.util.logging.Logger&#10;import java.util.regex.Matcher&#10;import java.util.regex.Pattern&#10;import java.util.stream.IntStream&#10;import java.util.zip.ZipEntry&#10;&#10;import java.util.zip.ZipFile&#10;import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream&#10;import javax.xml.parsers.DocumentBuilder&#10;import javax.xml.parsers.DocumentBuilderFactory&#10;import javax.xml.transform.OutputKeys&#10;import javax.xml.transform.TransformerFactory&#10;import javax.xml.transform.dom.DOMSource&#10;import javax.xml.transform.stream.StreamResult&#10;import kotlin.math.min&#10;import kotlin.system.exitProcess&#10;&#10;val ZIP_ENTRY_UNIX_MODE = parseInt(&quot;644&quot;, 8)&#10;&#10;@Command(&#10; name = &quot;KorapXmlTool&quot;,&#10; mixinStandardHelpOptions = true,&#10; version = [&quot;KorapXmlTool 2.0-beta-02&quot;],&#10; description = [&quot;Converts KorAP-XML &lt;https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml&gt; base or &quot; +&#10; &quot;morpho zips to (annotated) CoNLL(-U) format with all information necessary for &quot; +&#10; &quot;reconstruction in comment lines.&quot;]&#10;)&#10;&#10;class KorapXmlTool : Callable&lt;Int&gt; {&#10; val COMPATIBILITY_MODE = System.getenv(&quot;COMPATIBILITY_MODE&quot;) != null&#10;&#10; @Spec lateinit var spec : Model.CommandSpec&#10;&#10; @Parameters(arity = &quot;1..*&quot;, description = [&quot;At least one zip file name&quot;])&#10; var zipFileNames: Array&lt;String&gt;? = null&#10;&#10; @Option(&#10; names = [&quot;-f&quot;, &quot;--output-format&quot;],&#10; description = [&quot;Output format: ${ConlluOutputFormat.NAME}, ${Word2VecOutputFormat.NAME}, ${KorapXmlOutputFormat.NAME}, ${NowOutputFormat.NAME}&quot;,&#10; &quot;conllu: CoNLL-U format&quot;,&#10; &quot;korapxml, xml, zip: KorAP-XML format zip&quot;,&#10; &quot;word2vec, w2v: Print text in LM training format: tokens separated by space, sentences separated by newlines&quot;,&#10; &quot;now, NOW: NOW corpus export format: w2v-like format with &lt;p&gt; tags for sentence ends and @@&lt;text-sigle&gt; prefix&quot;,&#10; ],&#10; converter = [OutputFormatConverter::class]&#10; )&#10; var outputFormat: OutputFormat = OutputFormat.CONLLU&#10; class OutputFormatConverter : ITypeConverter&lt;OutputFormat&gt; {&#10; override fun convert(value: String?): OutputFormat {&#10; return when (value?.lowercase(Locale.getDefault())) {&#10; &quot;conllu&quot;, &quot;conll&quot; -&gt; OutputFormat.CONLLU&#10; &quot;word2vec&quot;, &quot;w2v&quot; -&gt; OutputFormat.WORD2VEC&#10; &quot;korapxml&quot;, &quot;korap&quot;, &quot;xml&quot;, &quot;zip&quot; -&gt; OutputFormat.KORAPXML&#10; &quot;now&quot;, &quot;NOW&quot; -&gt; OutputFormat.NOW&#10; else -&gt; throw IllegalArgumentException(&quot;Unknown output format: `$value'. Use one of: ${OutputFormat.entries.joinToString(&quot;, &quot;) { it.name }}&quot;)&#10; }&#10; }&#10; }&#10;&#10; @Option(&#10; names = [&quot;--sigle-pattern&quot;, &quot;-p&quot;],&#10; paramLabel = &quot;PATTERN&quot;,&#10; description = [&quot;Extract only documents with sigle matching the pattern (regex)&quot;]&#10; )&#10; var siglePattern: String? = null&#10;&#10; @Option(&#10; names = [&quot;--extract-attributes-regex&quot;, &quot;-e&quot;],&#10; paramLabel = &quot;REGEX&quot;,&#10; description = [&quot;Extract additional attribute values from structure.xml and writes them as comment line in front of the first covered token.&quot;,&#10; &quot;Example: -e '(posting/id|div/id)'&quot;]&#10; )&#10; var extractAttributesRegex: String = &quot;&quot;&#10;&#10; @Option(&#10; names = [&quot;--s-bounds-from-morpho&quot;], description = [&quot;Not yet implemented: s bounds from morpho&quot;]&#10; )&#10; var sBoundsFromMorpho: Boolean = false&#10;&#10; @Option(&#10; names = [&quot;--log&quot;, &quot;-l&quot;],&#10; paramLabel = &quot;LEVEL&quot;,&#10; description = [&quot;Log level: one of SEVERE, WARNING, INFO, FINE, FINER, FINEST. Default: ${&quot;$&quot;}{DEFAULT-VALUE}])&quot;]&#10; )&#10; var logLevel: String = &quot;WARNING&quot;&#10;&#10; @Option(&#10; names = [&quot;--columns&quot;, &quot;-c&quot;],&#10; paramLabel = &quot;NUMBER&quot;,&#10; description = [&quot;Number of columns. 1 means just the token. Default: ${&quot;$&quot;}{DEFAULT-VALUE}&quot;, &quot;Possible values: 1-10&quot;]&#10; )&#10; var columns: Int = 10&#10;&#10; @Option(&#10; names = [&quot;--word2vec&quot;, &quot;-w&quot;],&#10; description = [&quot;Print text in LM training format: tokens separated by space, sentences separated by newline&quot;,&#10; &quot;Deprecated: use -f word2vec&quot;]&#10; )&#10; fun setWord2Vec(word2vec: Boolean) {&#10; if (word2vec) {&#10; outputFormat = OutputFormat.WORD2VEC&#10; }&#10; }&#10;&#10; @Option(&#10; names = [&quot;--exclude-zip-glob&quot;],&#10; paramLabel = &quot;GLOB&quot;,&#10; description = [&#10; &quot;Exclude zip files whose basename matches the glob (e.g., 'w?d24.tree_tagger.zip').&quot;,&#10; &quot;May be repeated. Applied to basenames, not full paths.&quot;&#10; ]&#10; )&#10; var excludeZipGlobs: MutableList&lt;String&gt; = mutableListOf()&#10;&#10; @Option(&#10; names = [&quot;--token-separator&quot;, &quot;-s&quot;],&#10; paramLabel = &quot;STRING&quot;,&#10; defaultValue = &quot;\n&quot;,&#10; description = [&quot;Token separator. Default: new-line for CoNLL-U, space for word2vec format.&quot;]&#10; )&#10; var tokenSeparator: String = if (outputFormat == OutputFormat.WORD2VEC || outputFormat == OutputFormat.NOW) &quot; &quot; else &quot;\n&quot;&#10;&#10; @Option(names = [&quot;--offsets&quot;], description = [&quot;Not yet implemented: offsets&quot;])&#10; var offsets: Boolean = false&#10;&#10; @Option(names = [&quot;--comments&quot;, &quot;-C&quot;], description = [&quot;Not yet implemented: comments&quot;])&#10; var comments: Boolean = false&#10;&#10; @Option(&#10; names = [&quot;--extract-metadata-regex&quot;, &quot;-m&quot;],&#10; paramLabel = &quot;REGEX&quot;,&#10; description = [&quot;Extract metadata regexes.\nExample: -m '&lt;textSigle&gt;([^&lt;]+)' -m '&lt;creatDate&gt;([^&lt;]+)'&quot;]&#10; )&#10; var extractMetadataRegex: MutableList&lt;String&gt; = mutableListOf()&#10;&#10; @Option(&#10; names = [&quot;--annotate-with&quot;, &quot;-A&quot;],&#10; paramLabel = &quot;COMMAND&quot;,&#10; description = [&quot;Pipe output through command&quot;]&#10; )&#10; var annotateWith: String = &quot;&quot;&#10;&#10; @Option(&#10; names = [&quot;--threads&quot;, &quot;-T&quot;],&#10; paramLabel = &quot;THREADS&quot;,&#10; description = [&quot;Maximum number of threads to use. Default: ${&quot;$&quot;}{DEFAULT-VALUE}&quot;]&#10; )&#10; var maxThreads: Int = Runtime.getRuntime().availableProcessors() / 2&#10; fun setThreads(threads: Int) {&#10; if (threads &lt; 1) {&#10; throw ParameterException(spec.commandLine(), String.format(&quot;Invalid value `%d' for option '--threads': must be at least 1&quot;, threads))&#10; }&#10; this.maxThreads = threads&#10; System.setProperty(&quot;java.util.concurrent.ForkJoinPool.common.parallelism&quot;, threads.toString())&#10; }&#10;&#10; @Option(&#10; names = [&quot;--zip-parallelism&quot;],&#10; paramLabel = &quot;N&quot;,&#10; description = [&quot;Maximum number of zip files to process concurrently. Defaults to --threads.&quot;]&#10; )&#10; var zipParallelism: Int? = null&#10;&#10; @Option(&#10; names = [&quot;--sequential&quot;],&#10; description = [&#10; &quot;Process entries inside each zip sequentially; zips processed in parallel (only for word2vec/now).&quot;&#10; ]&#10; )&#10; var sequentialInZip: Boolean = false&#10;&#10; @Option(&#10; names = [&quot;--overwrite&quot;, &quot;-o&quot;],&#10; description = [&quot;Overwrite existing files&quot;]&#10; )&#10; var overwrite: Boolean = false&#10;&#10; @Option(&#10; names = [&quot;--mem-stats-interval&quot;],&#10; paramLabel = &quot;N&quot;,&#10; description = [&quot;Log memory and cache statistics every N processed documents (0 disables; default: 0)&quot;]&#10; )&#10; var memStatsInterval: Int = 0&#10;&#10; @Option(&#10; names = [&quot;--lemma&quot;],&#10; description = [&quot;In word2vec/now output modes, output lemmas instead of surface tokens when lemma annotations are available (requires corresponding morpho annotation XML)&quot;]&#10; )&#10; var useLemma: Boolean = false&#10;&#10; @Option(&#10; names = [&quot;--lemma-only&quot;],&#10; description = [&#10; &quot;Do not load texts from data.xml and output only lemmas (requires morpho.xml).&quot;,&#10; &quot;Only valid with -f word2vec or -f now; implies --lemma.&quot;&#10; ]&#10; )&#10; var lemmaOnly: Boolean = false&#10;&#10; private var taggerName: String? = null&#10; private var taggerModel: String? = null&#10; @Option(&#10; names = [&quot;--tag-with&quot;, &quot;-t&quot;],&#10; paramLabel = &quot;TAGGER:MODEL&quot;,&#10; description = [&quot;Specify a tagger and a model: ${taggerFoundries}:&lt;path/to/model&gt;.&quot;]&#10; )&#10; fun setTagWith(tagWith: String) {&#10; val pattern: Pattern = Pattern.compile(&quot;(${taggerFoundries}):(.+)&quot;)&#10; val matcher: Matcher = pattern.matcher(tagWith)&#10; if (!matcher.matches()) {&#10; throw ParameterException(spec.commandLine(),&#10; String.format(&quot;Invalid value `%s' for option '--tag-with': &quot;+&#10; &quot;value does not match the expected pattern ${taggerFoundries}:&lt;path/to/model&gt;&quot;, tagWith))&#10; } else {&#10; taggerName = matcher.group(1)&#10; taggerModel = matcher.group(2)&#10; if (!File(taggerModel).exists()) {&#10; throw ParameterException(spec.commandLine(),&#10; String.format(&quot;Invalid value for option '--tag-with':&quot;+&#10; &quot;model file '%s' does not exist&quot;, taggerModel, taggerModel))&#10; }&#10; }&#10; }&#10;&#10; private var parserName: String? = null&#10; private var parserModel: String? = null&#10; @Option(&#10; names = [&quot;--parse-with&quot;, &quot;-P&quot;],&#10; paramLabel = &quot;parser:MODEL&quot;,&#10; description = [&quot;Specify a parser and a model: ${parserFoundries}:&lt;path/to/model&gt;.&quot;]&#10; )&#10; fun setParseWith(parseWith: String) {&#10; val pattern: Pattern = Pattern.compile(&quot;(${parserFoundries}):(.+)&quot;)&#10; val matcher: Matcher = pattern.matcher(parseWith)&#10; if (!matcher.matches()) {&#10; throw ParameterException(spec.commandLine(),&#10; String.format(&quot;Invalid value `%s' for option '--parse-with': &quot;+&#10; &quot;value does not match the expected pattern (${parserFoundries}):&lt;path/to/model&gt;&quot;, parseWith))&#10; } else {&#10; parserName = matcher.group(1)&#10; parserModel = matcher.group(2)&#10; if (!File(parserModel).exists()) {&#10; throw ParameterException(spec.commandLine(),&#10; String.format(&quot;Invalid value for option '--parse-with':&quot;+&#10; &quot;model file '%s' does not exist&quot;, parserModel, parserModel))&#10; }&#10; }&#10; }&#10;&#10;&#10; override fun call(): Int {&#10; val handler = ConsoleHandler()&#10; LogManager.getLogManager().reset()&#10; handler.formatter = ColoredFormatter()&#10;&#10; for (handler in LOGGER.handlers) {&#10; LOGGER.removeHandler(handler)&#10; }&#10; LOGGER.addHandler(handler)&#10; LOGGER.level = try {&#10; Level.parse(logLevel.uppercase(Locale.getDefault()))&#10; } catch (e: IllegalArgumentException) {&#10; LOGGER.warning(&quot;Invalid log level: $logLevel. Defaulting to WARNING.&quot;)&#10; Level.WARNING&#10; }&#10;&#10; if (lemmaOnly) {&#10; useLemma = true&#10; if (outputFormat != OutputFormat.WORD2VEC &amp;&amp; outputFormat != OutputFormat.NOW) {&#10; throw ParameterException(spec.commandLine(), &quot;--lemma-only is supported only with -f word2vec or -f now&quot;)&#10; }&#10; }&#10;&#10; LOGGER.info(&quot;Processing zip files: &quot; + zipFileNames!!.joinToString(&quot;, &quot;))&#10;&#10; korapxml2conllu(zipFileNames!!)&#10; return 0&#10; }&#10;&#10; private val LOGGER: Logger = Logger.getLogger(KorapXmlTool::class.java.name)&#10;&#10; private var annotationWorkerPool : AnnotationWorkerPool? = null&#10; // Shared executor for entry-level parallelism across all zips&#10; private var entryExecutor: java.util.concurrent.ExecutorService? = null&#10;&#10; val texts: ConcurrentHashMap&lt;String, NonBmpString&gt; = ConcurrentHashMap()&#10; val sentences: ConcurrentHashMap&lt;String, Array&lt;Span&gt;&gt; = ConcurrentHashMap()&#10; val tokens: ConcurrentHashMap&lt;String, Array&lt;Span&gt;&gt; = ConcurrentHashMap()&#10; val morpho: ConcurrentHashMap&lt;String, MutableMap&lt;String, MorphoSpan&gt;&gt; = ConcurrentHashMap()&#10; val fnames: ConcurrentHashMap&lt;String, String&gt; = ConcurrentHashMap()&#10; val metadata: ConcurrentHashMap&lt;String, Array&lt;String&gt;&gt; = ConcurrentHashMap()&#10; val extraFeatures: ConcurrentHashMap&lt;String, MutableMap&lt;String, String&gt;&gt; = ConcurrentHashMap()&#10; private val processedDocs = java.util.concurrent.atomic.AtomicInteger(0)&#10; var taggerToolBridges: ConcurrentHashMap&lt;Long, TaggerToolBridge?&gt; = ConcurrentHashMap()&#10; var parserToolBridges: ConcurrentHashMap&lt;Long, ParserToolBridge?&gt; = ConcurrentHashMap()&#10;&#10; // Zip progress tracking for logging (zipNumber/zipTotal)&#10; private val zipOrdinals: ConcurrentHashMap&lt;String, Int&gt; = ConcurrentHashMap()&#10; private var totalZips: Int = 0&#10; private val zipSizes: ConcurrentHashMap&lt;String, Long&gt; = ConcurrentHashMap()&#10; private val processedZipBytes: AtomicLong = AtomicLong(0)&#10; private var totalZipBytes: Long = 0&#10; private var startTimeMillis: Long = 0&#10;&#10; var dbFactory: DocumentBuilderFactory? = null&#10; var dBuilder: DocumentBuilder? = null&#10; var morphoZipOutputStream: ZipArchiveOutputStream? = null&#10;&#10; fun String.hasCorrespondingBaseZip(): Boolean {&#10; if (!this.matches(Regex(&quot;.*\\.([^/.]+)\\.zip$&quot;))) return false&#10; val baseZip = this.replace(Regex(&quot;\\.([^/.]+)\\.zip$&quot;), &quot;.zip&quot;)&#10; return File(baseZip).exists()&#10; }&#10;&#10; fun String.correspondingBaseZip(): String? {&#10; if (!this.matches(Regex(&quot;.*\\.([^/.]+)\\.zip$&quot;))) return null&#10; val baseZip = this.replace(Regex(&quot;\\.([^/.]+)\\.zip$&quot;), &quot;.zip&quot;)&#10; return if (File(baseZip).exists()) baseZip else null&#10; }&#10;&#10; fun korapxml2conllu(args: Array&lt;String&gt;) {&#10; if (outputFormat == OutputFormat.KORAPXML &amp;&amp; annotateWith.isNotEmpty()) {&#10; LOGGER.severe(&quot;Shell command annotation is not yet supported with output format $outputFormat&quot;)&#10; exitProcess(1)&#10; }&#10; // Initialize shared entry executor (used inside each zip)&#10; entryExecutor = Executors.newFixedThreadPool(maxThreads)&#10;&#10; if (annotateWith.isNotEmpty()) {&#10; annotationWorkerPool = AnnotationWorkerPool(annotateWith, maxThreads, LOGGER)&#10; }&#10;&#10; var zips: Array&lt;String&gt; = args&#10; if (excludeZipGlobs.isNotEmpty()) {&#10; val before = zips.size&#10; val patterns = excludeZipGlobs.map { globToRegex(it) }&#10; zips = zips.filter { zipPath -&gt;&#10; val base = File(zipPath).name&#10; patterns.none { rx -&gt; rx.matches(base) }&#10; }.toTypedArray()&#10; val excluded = before - zips.size&#10; if (excluded &gt; 0) {&#10; LOGGER.info(&quot;Excluded $excluded of $before zip(s) by glob(s): ${excludeZipGlobs.joinToString(&quot;, &quot;)}&quot;)&#10; }&#10; }&#10; // Initialize zip progress tracking and sizes&#10; startTimeMillis = System.currentTimeMillis()&#10; processedZipBytes.set(0)&#10; totalZips = zips.size&#10; zipOrdinals.clear()&#10; zipSizes.clear()&#10; zips.forEach { zip -&gt; zipSizes[zip] = try { File(zip).length() } catch (_: Exception) { 0L } }&#10; totalZipBytes = zipSizes.values.sum()&#10; // In lemma-only mode, process largest zips first&#10; if (lemmaOnly) {&#10; zips = zips.sortedByDescending { zipSizes[it] ?: 0L }.toTypedArray()&#10; }&#10; zips.forEachIndexed { index, zip -&gt; zipOrdinals[zip] = index + 1 }&#10;&#10; // Log zip order with sizes so the user can verify sorting&#10; val totalHuman = humanBytes(totalZipBytes)&#10; LOGGER.info(&quot;Zip processing order (${zips.size} file(s), total ${totalHuman}):&quot;)&#10; zips.forEachIndexed { idx, zip -&gt;&#10; val size = zipSizes[zip] ?: 0L&#10; LOGGER.info(String.format(Locale.ROOT, &quot;%d/%d: %s (%s)&quot;, idx + 1, zips.size, zip, humanBytes(size)))&#10; }&#10;&#10; if (sequentialInZip) {&#10; if (outputFormat != OutputFormat.WORD2VEC &amp;&amp; outputFormat != OutputFormat.NOW) {&#10; throw ParameterException(spec.commandLine(), &quot;--sequential is supported only with -f word2vec or -f now&quot;)&#10; }&#10; }&#10;&#10; if (maxThreads &gt; 1) {&#10; val foundry = getFoundryFromZipFileNames(zips)&#10; val parallelism = (zipParallelism ?: maxThreads).coerceAtLeast(1)&#10; LOGGER.info(&quot;Processing zips with ordered queue; parallelism=$parallelism; entries ${if (sequentialInZip) &quot;sequential&quot; else &quot;parallel&quot;}&quot;)&#10; processZipsWithQueue(zips, foundry, parallelism)&#10; } else {&#10; LOGGER.info(&quot;Processing zip files sequentially&quot;)&#10; Arrays.stream(zips).forEachOrdered { zipFilePath -&gt;&#10; processZipFileSequentially((zipFilePath ?: &quot;&quot;).toString(), getFoundryFromZipFileNames(zips))&#10; }&#10; }&#10;&#10; if (annotationWorkerPool != null) {&#10; LOGGER.info(&quot;closing worker pool&quot;)&#10; annotationWorkerPool?.close()&#10; }&#10; // Shutdown entry executor&#10; entryExecutor?.shutdown()&#10; }&#10;&#10; private fun processZipsWithQueue(zips: Array&lt;String&gt;, foundry: String, parallelism: Int) {&#10; val queue: java.util.concurrent.BlockingQueue&lt;String&gt; = java.util.concurrent.LinkedBlockingQueue()&#10; zips.forEach { queue.put(it) }&#10; val executor = Executors.newFixedThreadPool(parallelism)&#10; val active = java.util.concurrent.atomic.AtomicInteger(0)&#10; repeat(parallelism) {&#10; executor.submit {&#10; active.incrementAndGet()&#10; try {&#10; while (true) {&#10; val zipPath = queue.poll(100, java.util.concurrent.TimeUnit.MILLISECONDS)&#10; if (zipPath == null) {&#10; if (queue.isEmpty()) break else continue&#10; }&#10; if (sequentialInZip) {&#10; processZipFileSequentially(zipPath, foundry)&#10; } else {&#10; processZipFile(zipPath, foundry)&#10; }&#10; }&#10; } finally {&#10; active.decrementAndGet()&#10; }&#10; }&#10; }&#10; executor.shutdown()&#10; try {&#10; executor.awaitTermination(7, java.util.concurrent.TimeUnit.DAYS)&#10; } catch (ie: InterruptedException) {&#10; Thread.currentThread().interrupt()&#10; }&#10; }&#10;&#10; // Convert a shell-like glob to a Regex: '*' -&gt; &quot;.*&quot;, '?' -&gt; '.', anchored full match&#10; private fun globToRegex(glob: String): Regex {&#10; val sb = StringBuilder(&quot;^&quot;)&#10; glob.forEach { ch -&gt;&#10; when (ch) {&#10; '*' -&gt; sb.append(&quot;.*&quot;)&#10; '?' -&gt; sb.append('.')&#10; '.', '(', ')', '+', '|', '^', '$', '@', '%', '{', '}', '[', ']', '\\' -&gt; sb.append('\\').append(ch)&#10; else -&gt; sb.append(ch)&#10; }&#10; }&#10; sb.append('$')&#10; return Regex(sb.toString())&#10; }&#10;&#10;&#10; private fun getTokenSpansFromMorho(morpho: MutableMap&lt;String, MorphoSpan&gt;): Array&lt;Span&gt; {&#10; return morpho.keys.map { key -&gt;&#10; val fromTo = key.split(&quot;-&quot;)&#10; Span(fromTo[0].toInt(), fromTo[1].toInt())&#10; }.sortedBy {&#10; it.from&#10; }.toTypedArray()&#10; }&#10;&#10; private fun getFoundryFromZipFileName(zipFileName: String): String {&#10; if (!zipFileName.matches(Regex(&quot;.*\\.([^/.]+)\\.zip$&quot;))) {&#10; return &quot;base&quot;&#10; }&#10; return zipFileName.replace(Regex(&quot;.*\\.([^/.]+)\\.zip$&quot;), &quot;$1&quot;)&#10; }&#10;&#10; private fun getFoundryFromZipFileNames(zipFileNames: Array&lt;String&gt;): String {&#10; for (zipFileName in zipFileNames) {&#10; val foundry = getFoundryFromZipFileName(zipFileName)&#10; if (foundry != &quot;base&quot;) {&#10; return foundry&#10; }&#10; }&#10; return &quot;base&quot;&#10; }&#10;&#10; private fun processZipFile(zipFilePath: String, foundry: String = &quot;base&quot;) {&#10; val ord = zipOrdinals[zipFilePath] ?: 0&#10; val size = zipSizes[zipFilePath] ?: 0L&#10; LOGGER.info(&quot;Processing zip ${if (ord&gt;0) ord else &quot;?&quot;}/$totalZips: ${zipFilePath} (${humanBytes(size)}) in thread ${Thread.currentThread().threadId()}&quot;)&#10; LOGGER.info(&quot;Foundry: $foundry $dbFactory&quot;)&#10; if (outputFormat == OutputFormat.KORAPXML &amp;&amp; dbFactory == null) {&#10; var targetFoundry = &quot;base&quot;&#10; if (taggerName != null) {&#10; val tagger = AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER) as TaggerToolBridge?&#10; if (tagger != null) {&#10; targetFoundry = tagger.foundry&#10; }&#10; } else if (parserName != null) {&#10; targetFoundry = parserName!!&#10; }&#10; dbFactory = DocumentBuilderFactory.newInstance()&#10; dBuilder = dbFactory!!.newDocumentBuilder()&#10; val outputMorphoZipFileName =&#10; if (parserName != null)&#10; zipFilePath.replace(Regex(&quot;(\\.(opennlp|marmot|tree_tagger|corenlp|spacy))?\\.zip$&quot;), &quot;.&quot;.plus(parserName).plus(&quot;.zip&quot;))&#10; else&#10; zipFilePath.replace(Regex(&quot;\\.zip$&quot;), &quot;.&quot;.plus(targetFoundry).plus(&quot;.zip&quot;))&#10; if (File(outputMorphoZipFileName).exists() &amp;&amp; !overwrite) {&#10; LOGGER.severe(&quot;Output file $outputMorphoZipFileName already exists. Use --overwrite to overwrite.&quot;)&#10; exitProcess(1)&#10; }&#10; val fileOutputStream = FileOutputStream(outputMorphoZipFileName)&#10; morphoZipOutputStream = ZipArchiveOutputStream(fileOutputStream).apply {&#10; setUseZip64(Zip64Mode.Always)&#10; }&#10; }&#10; if (zipFilePath.hasCorrespondingBaseZip()) {&#10; val relatedZips = arrayOf(zipFilePath, zipFilePath.correspondingBaseZip()!!)&#10; // Process related zips one after another to keep the ZipFile lifetime strictly bounded&#10; relatedZips.forEach { zip -&gt;&#10; ZipFile(zip).use { zipFile -&gt;&#10; processZipEntriesWithPool(zipFile, foundry, true)&#10; }&#10; }&#10; } else {&#10; ZipFile(zipFilePath).use { zipFile -&gt;&#10; processZipEntriesWithPool(zipFile, foundry, false)&#10; }&#10; }&#10; if (outputFormat == OutputFormat.KORAPXML) {&#10; morphoZipOutputStream!!.close()&#10; }&#10; logZipProgress(zipFilePath)&#10; }&#10;&#10; private fun processZipFileSequentially(zipFilePath: String, foundry: String = &quot;base&quot;) {&#10; val ord = zipOrdinals[zipFilePath] ?: 0&#10; val size = zipSizes[zipFilePath] ?: 0L&#10; LOGGER.info(&quot;Processing zip ${if (ord&gt;0) ord else &quot;?&quot;}/$totalZips: ${zipFilePath} (${humanBytes(size)}) in thread ${Thread.currentThread().threadId()}&quot;)&#10; if (zipFilePath.hasCorrespondingBaseZip()) {&#10; // Process the two related zips strictly sequentially to limit memory growth&#10; val zips = arrayOf(zipFilePath, zipFilePath.correspondingBaseZip()!!)&#10; zips.forEach { zip -&gt;&#10; ZipFile(zip).use { zipFile -&gt;&#10; // Iterate entries in a deterministic order to keep related files close together&#10; zipFile.stream()&#10; .filter { extractMetadataRegex.isNotEmpty() || !it.name.contains(&quot;header.xml&quot;) }&#10; .sorted(Comparator.comparing&lt;ZipEntry, String&gt; { it.name })&#10; .forEachOrdered { zipEntry -&gt;&#10; processZipEntry(zipFile, foundry, zipEntry, true)&#10; }&#10; }&#10; }&#10; } else {&#10; ZipFile(zipFilePath).use { zipFile -&gt;&#10; zipFile.stream()&#10; .filter { extractMetadataRegex.isNotEmpty() || !it.name.contains(&quot;header.xml&quot;) }&#10; .sorted(Comparator.comparing&lt;ZipEntry, String&gt; { it.name })&#10; .forEachOrdered { zipEntry -&gt;&#10; processZipEntry(zipFile, foundry, zipEntry, false)&#10; }&#10; }&#10; }&#10; logZipProgress(zipFilePath)&#10; }&#10;&#10; private fun logZipProgress(zipFilePath: String) {&#10; try {&#10; val size = zipSizes[zipFilePath] ?: 0L&#10; val done = processedZipBytes.addAndGet(size)&#10; val total = if (totalZipBytes &gt; 0) totalZipBytes else 1L&#10; val elapsedMs = (System.currentTimeMillis() - startTimeMillis).coerceAtLeast(1)&#10; val speedBytesPerSec = (done * 1000.0) / elapsedMs&#10; val remaining = (total - done).coerceAtLeast(0)&#10; val etaSeconds = if (speedBytesPerSec &gt; 0.0) (remaining / speedBytesPerSec).toLong() else -1L&#10; val ord = zipOrdinals[zipFilePath] ?: 0&#10; val pct = (done * 100.0 / total).coerceIn(0.0, 100.0)&#10; val humanSpeed = String.format(Locale.ROOT, &quot;%.2f MB/s&quot;, speedBytesPerSec / (1024.0 * 1024.0))&#10; val etaStr = if (etaSeconds &gt;= 0) formatDuration(etaSeconds) else &quot;unknown&quot;&#10; LOGGER.info(&#10; &quot;Finished zip ${if (ord&gt;0) ord else &quot;?&quot;}/$totalZips: ${zipFilePath} &quot; +&#10; &quot;(${humanBytes(size)}). Progress: ${String.format(Locale.ROOT, &quot;%.1f&quot;, pct)}%%, &quot; +&#10; &quot;ETA ${etaStr} at ${humanSpeed}&quot;&#10; )&#10; } catch (e: Exception) {&#10; LOGGER.fine(&quot;Failed to log zip progress for $zipFilePath: ${e.message}&quot;)&#10; }&#10; }&#10;&#10; private fun humanBytes(bytes: Long): String {&#10; if (bytes &lt; 1024) return &quot;$bytes B&quot;&#10; val kb = bytes / 1024.0&#10; if (kb &lt; 1024) return String.format(Locale.ROOT, &quot;%.1f KB&quot;, kb)&#10; val mb = kb / 1024.0&#10; if (mb &lt; 1024) return String.format(Locale.ROOT, &quot;%.1f MB&quot;, mb)&#10; val gb = mb / 1024.0&#10; return String.format(Locale.ROOT, &quot;%.1f GB&quot;, gb)&#10; }&#10;&#10; private fun formatDuration(seconds: Long): String {&#10; var s = seconds&#10; val h = s / 3600; s %= 3600&#10; val m = s / 60; val sec = s % 60&#10; return String.format(Locale.ROOT, &quot;%02d:%02d:%02d&quot;, h, m, sec)&#10; }&#10;&#10; private fun processZipEntriesWithPool(zipFile: ZipFile, foundry: String, waitForMorpho: Boolean) {&#10; // Collect entries first to avoid lazy evaluation surprises, filter header.xml unless metadata extraction is requested&#10; val entries: MutableList&lt;ZipEntry&gt; = ArrayList()&#10; val enumEntries = zipFile.entries()&#10; while (enumEntries.hasMoreElements()) {&#10; val e = enumEntries.nextElement()&#10; if (extractMetadataRegex.isEmpty() &amp;&amp; e.name.contains(&quot;header.xml&quot;)) continue&#10; entries.add(e)&#10; }&#10; if (entries.isEmpty()) return&#10;&#10; // If only one thread requested, do sequential to avoid pool overhead&#10; if (maxThreads &lt;= 1) {&#10; entries.forEach { entry -&gt; processZipEntry(zipFile, foundry, entry, waitForMorpho) }&#10; return&#10; }&#10;&#10; // Submit all entry tasks to the shared executor and await completion before closing the zip&#10; val latch = java.util.concurrent.CountDownLatch(entries.size)&#10; entries.forEach { entry -&gt;&#10; entryExecutor?.execute {&#10; try {&#10; processZipEntry(zipFile, foundry, entry, waitForMorpho)&#10; } catch (t: Throwable) {&#10; LOGGER.warning(&quot;Failed to process entry ${entry.name}: ${t.message}&quot;)&#10; } finally {&#10; latch.countDown()&#10; }&#10; }&#10; }&#10; try {&#10; latch.await()&#10; } catch (ie: InterruptedException) {&#10; Thread.currentThread().interrupt()&#10; }&#10; }&#10;&#10; fun processZipEntry(zipFile: ZipFile, _foundry: String, zipEntry: ZipEntry, passedWaitForMorpho: Boolean) {&#10; var foundry = _foundry&#10; var waitForMorpho = passedWaitForMorpho&#10; LOGGER.finer(&quot;Processing ${zipEntry.name} in thread ${Thread.currentThread().threadId()}&quot;)&#10; if (taggerName != null &amp;&amp; !taggerToolBridges.containsKey(Thread.currentThread().threadId())) {&#10; val tagger = AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER) as TaggerToolBridge?&#10; taggerToolBridges[Thread.currentThread().threadId()] = tagger&#10; if (tagger != null) {&#10; foundry = tagger.foundry&#10; }&#10;&#10; }&#10; if (parserName != null &amp;&amp; !parserToolBridges.containsKey(Thread.currentThread().threadId())) {&#10; val parser = AnnotationToolBridgeFactory.getAnnotationToolBridge(parserName!!, parserModel!!, LOGGER) as ParserToolBridge?&#10; parserToolBridges[Thread.currentThread().threadId()] = parser&#10; if (parser != null) {&#10; foundry = &quot;$foundry dependency:${parser.foundry}&quot;&#10; LOGGER.fine(&quot;Initialized parser ${parserName} with foundry $foundry in thread ${Thread.currentThread().threadId()}&quot;)&#10; }&#10; }&#10;&#10; try {&#10; if (zipEntry.name.matches(Regex(&quot;.*(data|tokens|structure|morpho)\\.xml$&quot;))) {&#10; // Ensure the entry stream and reader are closed to avoid native memory buildup&#10; val dbFactory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance()&#10; val dBuilder: DocumentBuilder = dbFactory.newDocumentBuilder()&#10; // In lemma-only mode, skip parsing data.xml entirely to reduce memory pressure&#10; if (lemmaOnly &amp;&amp; zipEntry.name.endsWith(&quot;data.xml&quot;)) {&#10; return&#10; }&#10; val doc: Document = try {&#10; zipFile.getInputStream(zipEntry).use { inputStream -&gt;&#10; XMLCommentFilterReader(inputStream, &quot;UTF-8&quot;).use { reader -&gt;&#10; dBuilder.parse(InputSource(reader))&#10; }&#10; }&#10; } catch (e: SAXParseException) {&#10; LOGGER.warning(&quot;Error parsing file: &quot; + zipEntry.name + &quot; &quot; + e.message)&#10; return&#10; }&#10;&#10; doc.documentElement.normalize()&#10; val docId: String = doc.documentElement.getAttribute(&quot;docid&quot;)&#10; if (siglePattern != null &amp;&amp; !Regex(siglePattern!!).containsMatchIn(docId)) {&#10; return&#10; }&#10; // LOGGER.info(&quot;Processing file: &quot; + zipEntry.getName())&#10; val fileName = zipEntry.name.replace(Regex(&quot;.*?/([^/]+\\.xml)$&quot;), &quot;$1&quot;)&#10; when (fileName) {&#10; &quot;data.xml&quot; -&gt; {&#10; if (!lemmaOnly) {&#10; val textsList: NodeList = doc.getElementsByTagName(&quot;text&quot;)&#10; if (textsList.length &gt; 0) {&#10; texts[docId] = NonBmpString(textsList.item(0).textContent)&#10; }&#10; }&#10; }&#10;&#10; &quot;structure.xml&quot; -&gt; {&#10; val spans: NodeList = doc.getElementsByTagName(&quot;span&quot;)&#10; if (extractAttributesRegex.isNotEmpty())&#10; extraFeatures[docId] = extractMiscSpans(spans)&#10; sentences[docId] = extractSentenceSpans(spans)&#10;&#10; }&#10;&#10; &quot;tokens.xml&quot; -&gt; {&#10; if (!fnames.contains(docId)) {&#10; fnames[docId] = zipEntry.name&#10; }&#10; val tokenSpans: NodeList = doc.getElementsByTagName(&quot;span&quot;)&#10; tokens[docId] = extractSpans(tokenSpans)&#10; }&#10;&#10; &quot;morpho.xml&quot; -&gt; {&#10; waitForMorpho = true&#10; fnames[docId] = zipEntry.name&#10; val fsSpans: NodeList = doc.getElementsByTagName(&quot;span&quot;)&#10; morpho[docId] = extractMorphoSpans(fsSpans)&#10; tokens[docId] = extractSpans(fsSpans)&#10; }&#10; }&#10;&#10; val morphoRequired = waitForMorpho || useLemma || taggerName != null || parserName != null || outputFormat == OutputFormat.KORAPXML&#10; // For lemma-only/lemma-based word2vec/now, we can proceed without full text&#10; val textRequired = when (outputFormat) {&#10; OutputFormat.WORD2VEC, OutputFormat.NOW -&gt; !(useLemma || lemmaOnly)&#10; else -&gt; true&#10; }&#10; if ((texts[docId] != null || !textRequired) &amp;&amp; sentences[docId] != null &amp;&amp; tokens[docId] != null&#10; &amp;&amp; (!morphoRequired || morpho[docId] != null)&#10; &amp;&amp; (extractMetadataRegex.isEmpty() || metadata[docId] != null)&#10; ) {&#10; // Be quiet on INFO; per-text logs only on FINE and below&#10; LOGGER.fine(&quot;Processing text: $docId in thread ${Thread.currentThread().threadId()}&quot;)&#10; processText(docId, foundry)&#10; }&#10; } else if (extractMetadataRegex.isNotEmpty() &amp;&amp; zipEntry.name.matches(Regex(&quot;.*/header\\.xml$&quot;))) {&#10; //LOGGER.info(&quot;Processing header file: &quot; + zipEntry.name)&#10; val text = zipFile.getInputStream(zipEntry).bufferedReader().use { it.readText() }&#10; val docId =&#10; Regex(&quot;&lt;textSigle&gt;([^&lt;]+)&lt;/textSigle&gt;&quot;).find(text)?.destructured?.component1()&#10; ?.replace(Regex(&quot;/&quot;), &quot;_&quot;)&#10; LOGGER.fine(&quot;Processing header file: &quot; + zipEntry.name + &quot; docId: &quot; + docId)&#10; val meta = ArrayList&lt;String&gt;()&#10; extractMetadataRegex.forEach { regex -&gt;&#10; val match = Regex(regex).find(text)&#10; if (match != null) {&#10; meta.add(match.destructured.component1())&#10; }&#10; }&#10; if (meta.isNotEmpty() &amp;&amp; docId != null) {&#10; metadata[docId] = meta.toTypedArray()&#10; val morphoRequired = waitForMorpho || useLemma || taggerName != null || parserName != null || outputFormat == OutputFormat.KORAPXML&#10; val textRequired = when (outputFormat) {&#10; OutputFormat.WORD2VEC, OutputFormat.NOW -&gt; !(useLemma || lemmaOnly)&#10; else -&gt; true&#10; }&#10; if ((texts[docId] != null || !textRequired) &amp;&amp; sentences[docId] != null &amp;&amp; tokens[docId] != null&#10; &amp;&amp; (!morphoRequired || morpho[docId] != null)&#10; ) {&#10; // Be quiet on INFO; per-text logs only on FINE and below&#10; LOGGER.fine(&quot;Processing text (meta-ready): $docId in thread ${Thread.currentThread().threadId()}&quot;)&#10; processText(docId, foundry)&#10; }&#10; }&#10; }&#10; } catch (e: Exception) {&#10; e.printStackTrace()&#10; }&#10; }&#10;&#10; private fun processText(&#10; docId: String,&#10; foundry: String,&#10; ) {&#10; LOGGER.fine(&quot;Processing text: $docId in thread ${Thread.currentThread().threadId()}&quot;)&#10; var morphoFoundry = getMorphoFoundry()&#10; val output =&#10; if (outputFormat == OutputFormat.WORD2VEC) {&#10; lmTrainingOutput(docId)&#10; } else if (outputFormat == OutputFormat.NOW) {&#10; nowOutput(docId)&#10; } else {&#10; if (taggerToolBridges[Thread.currentThread().threadId()] != null) {&#10; morpho[docId] = taggerToolBridges[Thread.currentThread().threadId()]!!.tagText(&#10; tokens[docId]!!,&#10; sentences[docId],&#10; texts[docId]!!&#10; )&#10;&#10; }&#10; if (parserToolBridges[Thread.currentThread().threadId()] != null) {&#10; if (morpho[docId] == null) {&#10; LOGGER.severe(&quot;No morpho data for $docId&quot;)&#10; //exitProcess(1)&#10; }&#10; LOGGER.finer(&quot;Parsing text: $docId in thread ${Thread.currentThread().threadId()}&quot;)&#10; morpho[docId] = parserToolBridges[Thread.currentThread().threadId()]!!.parseText(&#10; tokens[docId]!!,&#10; morpho[docId],&#10; sentences[docId],&#10; texts[docId]!!&#10; )&#10; LOGGER.finer(&quot;Parsed text: $docId in thread ${Thread.currentThread().threadId()}&quot;)&#10; }&#10; if (outputFormat == OutputFormat.KORAPXML &amp;&amp; annotationWorkerPool == null) {&#10; korapXmlOutput(getMorphoFoundry(), docId)&#10; } else {&#10; conlluOutput(foundry, docId)&#10; }&#10; }&#10;&#10; if (annotationWorkerPool != null) {&#10; annotationWorkerPool?.pushToQueue(output.append(&quot;\n# eot\n&quot;).toString())&#10; // Release internal char[] early&#10; output.setLength(0)&#10; } else if (outputFormat != OutputFormat.KORAPXML) {&#10; synchronized(System.out) {&#10; println(output.toString())&#10; }&#10; // Release internal char[] early&#10; output.setLength(0)&#10; } else {&#10; korapXmlOutput(foundry, docId)&#10; }&#10;&#10;&#10; arrayOf(tokens, texts, sentences, morpho, fnames, metadata, extraFeatures).forEach { map -&gt;&#10; if (map === morpho) {&#10; // Clear inner map to release references early&#10; morpho[docId]?.clear()&#10; }&#10; map.remove(docId)&#10; }&#10;&#10; // Periodic GC hint after processing many docs (lightweight safeguard)&#10; if ((processedDocs.incrementAndGet() % 2000) == 0) {&#10; LOGGER.fine(&quot;Processed ${processedDocs.get()} docs – requesting GC hint&quot;)&#10; System.gc()&#10; }&#10; // Memory / cache statistics logging&#10; if (memStatsInterval &gt; 0) {&#10; val count = processedDocs.get()&#10; if (count % memStatsInterval == 0) {&#10; logMemoryStats(count)&#10; }&#10; }&#10;&#10; if (outputFormat == OutputFormat.KORAPXML) {&#10; val entryPath = if (parserName != null) docId.replace(Regex(&quot;[_.]&quot;), &quot;/&quot;).plus(&quot;/$parserName/&quot;).plus(&quot;dependency.xml&quot;)&#10; else&#10; docId.replace(Regex(&quot;[_.]&quot;), &quot;/&quot;).plus(&quot;/$morphoFoundry/&quot;).plus(&quot;morpho.xml&quot;)&#10; val zipEntry = ZipArchiveEntry(entryPath)&#10; zipEntry.unixMode = ZIP_ENTRY_UNIX_MODE&#10; synchronized(morphoZipOutputStream!!) {&#10; morphoZipOutputStream!!.putArchiveEntry(zipEntry)&#10; morphoZipOutputStream!!.write(output.toString().toByteArray())&#10; morphoZipOutputStream!!.closeArchiveEntry()&#10; }&#10; output.clear()&#10; }&#10; }&#10;&#10; private fun getMorphoFoundry() = taggerToolBridges[Thread.currentThread().threadId()]?.foundry ?: &quot;base&quot;&#10;&#10; private fun logMemoryStats(count: Int) {&#10; try {&#10; val rt = Runtime.getRuntime()&#10; val used = (rt.totalMemory() - rt.freeMemory()) / (1024 * 1024)&#10; val total = rt.totalMemory() / (1024 * 1024)&#10; val max = rt.maxMemory() / (1024 * 1024)&#10; LOGGER.info(&#10; &quot;MEM-STATS docs=${count} usedMB=${used} totalMB=${total} maxMB=${max} &quot; +&#10; &quot;maps{texts=${texts.size},tokens=${tokens.size},sentences=${sentences.size},morpho=${morpho.size}}&quot;&#10; )&#10; } catch (e: Exception) {&#10; LOGGER.warning(&quot;Failed to log memory stats: ${e.message}&quot;)&#10; }&#10; }&#10;&#10; private fun korapXmlDependencyOutput(foundry: String, docId: String): StringBuilder {&#10; val doc: Document = dBuilder!!.newDocument()&#10;&#10; // Root element&#10; val layer = doc.createElement(&quot;layer&quot;)&#10; layer.setAttribute(&quot;xmlns&quot;, &quot;http://ids-mannheim.de/ns/KorAP&quot;)&#10; layer.setAttribute(&quot;version&quot;, &quot;KorAP-0.4&quot;)&#10; layer.setAttribute(&quot;docid&quot;, docId)&#10; doc.appendChild(layer)&#10;&#10; val spanList = doc.createElement(&quot;spanList&quot;)&#10; layer.appendChild(spanList)&#10;&#10; var i = 0&#10; var s = 0&#10; var n = 0&#10; val sortedKeys = morpho[docId]?.keys?.sortedBy { it.split(&quot;-&quot;)[0].toInt() }&#10;&#10; sortedKeys?.forEach { spanString -&gt;&#10; val mfs = morpho[docId]?.get(spanString)&#10; val offsets = spanString.split(&quot;-&quot;)&#10; if(offsets.size != 2) {&#10; LOGGER.warning(&quot;Invalid span: $spanString in $docId&quot;)&#10; return@forEach&#10; }&#10; if (offsets[0].toInt() &gt; sentences[docId]!!.elementAt(s).to) {&#10; s++&#10; n = i&#10; }&#10; i++&#10; if (mfs!!.deprel == &quot;_&quot;) {&#10; return@forEach&#10; }&#10;&#10; val spanNode = doc.createElement(&quot;span&quot;)&#10; spanNode.setAttribute(&quot;id&quot;, &quot;s${s + 1}_n${i - n}&quot;)&#10; spanNode.setAttribute(&quot;from&quot;, offsets[0])&#10; spanNode.setAttribute(&quot;to&quot;, offsets[1])&#10;&#10; // rel element&#10; val rel = doc.createElement(&quot;rel&quot;)&#10; rel.setAttribute(&quot;label&quot;, mfs.deprel)&#10;&#10; // inner span element&#10; val innerSpan = doc.createElement(&quot;span&quot;)&#10; val headInt = if(mfs.head == &quot;_&quot;) 0 else parseInt(mfs.head) - 1&#10; if (headInt &lt; 0) {&#10; innerSpan.setAttribute(&quot;from&quot;, sentences[docId]!!.elementAt(s).from.toString())&#10; innerSpan.setAttribute(&quot;to&quot;, sentences[docId]!!.elementAt(s).to.toString())&#10; } else {&#10; if (headInt + n &gt;= morpho[docId]!!.size) {&#10; LOGGER.warning(&quot;Head index out of bounds: ${headInt+n} &gt;= ${morpho[docId]!!.size} in $docId&quot;)&#10; return@forEach&#10; } else {&#10; val destSpanString = sortedKeys.elementAt(headInt + n)&#10; val destOffsets = destSpanString.split(&quot;-&quot;)&#10; innerSpan.setAttribute(&quot;from&quot;, destOffsets[0])&#10; innerSpan.setAttribute(&quot;to&quot;, destOffsets[1])&#10; }&#10; }&#10; rel.appendChild(innerSpan)&#10; spanNode.appendChild(rel)&#10; spanList.appendChild(spanNode)&#10; }&#10; val transformerFactory = TransformerFactory.newInstance()&#10; val transformer = transformerFactory.newTransformer()&#10; transformer.setOutputProperty(OutputKeys.INDENT, &quot;yes&quot;)&#10; transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, &quot;no&quot;)&#10; transformer.setOutputProperty(&quot;{http://xml.apache.org/xslt}indent-amount&quot;, &quot;1&quot;)&#10; val domSource = DOMSource(doc)&#10; val streamResult = StreamResult(StringWriter())&#10; transformer.transform(domSource, streamResult)&#10;&#10; return StringBuilder(streamResult.writer.toString())&#10; }&#10;&#10; private fun korapXmlOutput(foundry: String, docId: String): StringBuilder {&#10; return if (parserName != null) {&#10; korapXmlDependencyOutput(foundry, docId)&#10; } else {&#10; korapXmlMorphoOutput(foundry, docId)&#10; }&#10; }&#10;&#10; private fun korapXmlMorphoOutput(foundry: String, docId: String): StringBuilder {&#10; val doc: Document = dBuilder!!.newDocument()&#10;&#10; // Root element&#10; val layer = doc.createElement(&quot;layer&quot;)&#10; layer.setAttribute(&quot;xmlns&quot;, &quot;http://ids-mannheim.de/ns/KorAP&quot;)&#10; layer.setAttribute(&quot;version&quot;, &quot;KorAP-0.4&quot;)&#10; layer.setAttribute(&quot;docid&quot;, docId)&#10; doc.appendChild(layer)&#10;&#10; val spanList = doc.createElement(&quot;spanList&quot;)&#10; layer.appendChild(spanList)&#10;&#10; var i = 0&#10; morpho[docId]?.forEach { (spanString, mfs) -&gt;&#10; i++&#10; val offsets = spanString.split(&quot;-&quot;)&#10; val spanNode = doc.createElement(&quot;span&quot;)&#10; spanNode.setAttribute(&quot;id&quot;, &quot;t_$i&quot;)&#10; spanNode.setAttribute(&quot;from&quot;, offsets[0])&#10; spanNode.setAttribute(&quot;to&quot;, offsets[1])&#10;&#10; // fs element&#10; val fs = doc.createElement(&quot;fs&quot;)&#10; fs.setAttribute(&quot;type&quot;, &quot;lex&quot;)&#10; fs.setAttribute(&quot;xmlns&quot;, &quot;http://www.tei-c.org/ns/1.0&quot;)&#10; spanNode.appendChild(fs)&#10; val f = doc.createElement(&quot;f&quot;)&#10; f.setAttribute(&quot;name&quot;, &quot;lex&quot;)&#10; fs.appendChild(f)&#10;&#10; // Inner fs element&#10; val innerFs = doc.createElement(&quot;fs&quot;)&#10; f.appendChild(innerFs)&#10;&#10; if (mfs.lemma != &quot;_&quot;) {&#10; val innerF = doc.createElement(&quot;f&quot;)&#10; innerF.setAttribute(&quot;name&quot;, &quot;lemma&quot;)&#10; innerF.textContent = mfs.lemma&#10; innerFs.appendChild(innerF)&#10; }&#10; if (mfs.upos != &quot;_&quot;) {&#10; val innerF = doc.createElement(&quot;f&quot;)&#10; innerF.setAttribute(&quot;name&quot;, &quot;upos&quot;)&#10; innerF.textContent = mfs.upos&#10; innerFs.appendChild(innerF)&#10; }&#10; if (mfs.xpos != &quot;_&quot;) {&#10; val innerF = doc.createElement(&quot;f&quot;)&#10; innerF.setAttribute(&quot;name&quot;, &quot;pos&quot;)&#10; innerF.textContent = mfs.xpos&#10; innerFs.appendChild(innerF)&#10; }&#10; if (mfs.feats != &quot;_&quot;) {&#10; val innerF = doc.createElement(&quot;f&quot;)&#10; innerF.setAttribute(&quot;name&quot;, &quot;msd&quot;)&#10; innerF.textContent = mfs.feats&#10; innerFs.appendChild(innerF)&#10; }&#10; if (mfs.misc != &quot;_&quot; &amp;&amp; mfs.misc!!.matches(Regex(&quot;^[0-9.]+$&quot;))) {&#10; val innerF = doc.createElement(&quot;f&quot;)&#10; innerF.setAttribute(&quot;name&quot;, &quot;certainty&quot;)&#10; innerF.textContent = mfs.misc&#10; innerFs.appendChild(innerF)&#10; }&#10;&#10; spanList.appendChild(spanNode)&#10; }&#10; val transformerFactory = TransformerFactory.newInstance()&#10; val transformer = transformerFactory.newTransformer()&#10; transformer.setOutputProperty(OutputKeys.INDENT, &quot;yes&quot;)&#10; transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, &quot;no&quot;)&#10; transformer.setOutputProperty(&quot;{http://xml.apache.org/xslt}indent-amount&quot;, &quot;1&quot;)&#10; val domSource = DOMSource(doc)&#10; val streamResult = StreamResult(StringWriter())&#10; transformer.transform(domSource, streamResult)&#10;&#10; return StringBuilder(streamResult.writer.toString())&#10;&#10; }&#10;&#10; private fun conlluOutput(foundry: String, docId: String): StringBuilder {&#10; var token_index = 0&#10; var real_token_index = 0&#10; var sentence_index = 0&#10; val output: StringBuilder&#10; output =&#10; StringBuilder(&quot;# foundry = $foundry\n# filename = ${fnames[docId]}\n# text_id = $docId\n&quot;).append(&#10; tokenOffsetsInSentence(&#10; sentences, docId, sentence_index, real_token_index, tokens&#10; )&#10; )&#10; if (extractMetadataRegex.isNotEmpty()) {&#10; output.append(metadata[docId]?.joinToString(&quot;\t&quot;, prefix = &quot;# metadata=&quot;, postfix = &quot;\n&quot;) ?: &quot;&quot;)&#10; }&#10; var previousSpanStart = 0&#10; tokens[docId]?.forEach { span -&gt;&#10; token_index++&#10; if (sentence_index &gt;= sentences[docId]!!.size || span.from &gt;= sentences[docId]!![sentence_index].to) {&#10; output.append(&quot;\n&quot;)&#10; sentence_index++&#10; token_index = 1&#10; output.append(&#10; tokenOffsetsInSentence(&#10; sentences, docId, sentence_index, real_token_index, tokens&#10; )&#10; )&#10; }&#10; if (extractAttributesRegex.isNotEmpty() &amp;&amp; extraFeatures[docId] != null) {&#10; for (i in previousSpanStart until span.from + 1) {&#10; if (extraFeatures[docId]?.containsKey(&quot;$i&quot;) == true) {&#10; output.append(extraFeatures[docId]!![&quot;$i&quot;])&#10; extraFeatures[docId]!!.remove(&quot;$i&quot;)&#10; }&#10; }&#10; previousSpanStart = span.from + 1&#10; }&#10; if (morpho[docId]?.containsKey(&quot;${span.from}-${span.to}&quot;) == true) {&#10; val mfs = morpho[docId]!![&quot;${span.from}-${span.to}&quot;]&#10; if (span.to &gt; texts[docId]!!.length) {&#10; span.to = texts[docId]!!.length&#10; LOGGER.warning(&#10; &quot;Offset error: could not retrieve token at ${span.from}-${span.to} – ending with: ${&#10; texts[docId]!!.substring(&#10; span.from,&#10; span.to&#10; )&#10; }&quot;&#10; )&#10; }&#10; output.append(&#10; printConlluToken(&#10; token_index,&#10; texts[docId]!!.substring(span.from, span.to),&#10; mfs!!.lemma!!,&#10; mfs.upos!!,&#10; mfs.xpos!!,&#10; mfs.feats!!,&#10; mfs.head!!,&#10; mfs.deprel!!,&#10; mfs.deps!!,&#10; mfs.misc!!,&#10; columns&#10; )&#10; )&#10; } else {&#10; output.append(&#10; printConlluToken(&#10; token_index, texts[docId]!!.substring(span.from, span.to), columns = columns&#10; )&#10; )&#10; }&#10; real_token_index++&#10; }&#10; return output&#10; }&#10;&#10; private fun lmTrainingOutput(docId: String): StringBuilder {&#10; var token_index = 0&#10; var real_token_index = 0&#10; var sentence_index = 0&#10; val output: StringBuilder&#10; output = StringBuilder()&#10; if (extractMetadataRegex.isNotEmpty()) {&#10; output.append(metadata[docId]?.joinToString(&quot;\t&quot;, postfix = &quot;\t&quot;) ?: &quot;&quot;)&#10; }&#10; // If no text is available (e.g., lemma-only mode), emit lemmas&#10; if (texts[docId] == null) {&#10; tokens[docId]?.forEach { span -&gt;&#10; val key = &quot;${span.from}-${span.to}&quot;&#10; val lemmaVal = morpho[docId]?.get(key)?.lemma&#10; output.append((lemmaVal?.takeIf { it != &quot;_&quot; } ?: &quot;_&quot;), &quot; &quot;)&#10; }&#10; if (output.isNotEmpty()) output.deleteCharAt(output.length - 1)&#10; return output&#10; }&#10; tokens[docId]?.forEach { span -&gt;&#10; token_index++&#10; if (sentences[docId] != null &amp;&amp; (sentence_index &gt;= sentences[docId]!!.size || span.from &gt;= sentences[docId]!![sentence_index].to)) {&#10; if (output.isNotEmpty()) {&#10; output.setCharAt(output.length - 1, '\n')&#10; } else {&#10; output.append(&quot;\n&quot;)&#10; }&#10; if (extractMetadataRegex.isNotEmpty() &amp;&amp; real_token_index &lt; tokens[docId]!!.size - 1) {&#10; output.append(metadata[docId]?.joinToString(&quot;\t&quot;, postfix = &quot;\t&quot;) ?: &quot;&quot;)&#10; }&#10; sentence_index++&#10; }&#10; // Bounds safety&#10; val safeFrom = span.from.coerceIn(0, texts[docId]!!.length)&#10; val safeTo = span.to.coerceIn(safeFrom, texts[docId]!!.length)&#10; if (useLemma &amp;&amp; morpho[docId] != null) {&#10; val key = &quot;${span.from}-${span.to}&quot;&#10; val lemmaVal = morpho[docId]!![key]?.lemma&#10; if (lemmaVal != null &amp;&amp; lemmaVal != &quot;_&quot;) {&#10; output.append(lemmaVal)&#10; output.append(' ')&#10; } else {&#10; texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)&#10; output.append(' ')&#10; }&#10; } else {&#10; texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)&#10; output.append(' ')&#10; }&#10; real_token_index++&#10; }&#10; if (output.isNotEmpty()) {&#10; output.deleteCharAt(output.length - 1)&#10; }&#10; return output&#10; }&#10;&#10; private fun nowOutput(docId: String): StringBuilder {&#10; var token_index = 0&#10; var real_token_index = 0&#10; var sentence_index = 0&#10; val output: StringBuilder = StringBuilder()&#10; &#10; // Add the text sigle prefix&#10; output.append(&quot;@@$docId &quot;)&#10; &#10; if (texts[docId] == null) {&#10; // Lemma-only fallback when original text is not loaded&#10; tokens[docId]?.forEach { span -&gt;&#10; if (sentences[docId] != null &amp;&amp; (sentence_index &gt;= sentences[docId]!!.size || span.from &gt;= sentences[docId]!![sentence_index].to)) {&#10; if (output.isNotEmpty() &amp;&amp; !output.endsWith(&quot;@@$docId &quot;)) {&#10; output.append(&quot; &lt;p&gt; &quot;)&#10; }&#10; sentence_index++&#10; }&#10; val key = &quot;${span.from}-${span.to}&quot;&#10; val lemmaVal = morpho[docId]?.get(key)?.lemma&#10; output.append((lemmaVal?.takeIf { it != &quot;_&quot; } ?: &quot;_&quot;), &quot; &quot;)&#10; }&#10; if (output.isNotEmpty() &amp;&amp; output.endsWith(&quot; &quot;)) {&#10; output.deleteCharAt(output.length - 1)&#10; }&#10; return output&#10; }&#10; &#10; tokens[docId]?.forEach { span -&gt;&#10; token_index++&#10; if (sentences[docId] != null &amp;&amp; (sentence_index &gt;= sentences[docId]!!.size || span.from &gt;= sentences[docId]!![sentence_index].to)) {&#10; // Replace sentence end with &lt;p&gt; tag instead of newline&#10; if (output.isNotEmpty() &amp;&amp; !output.endsWith(&quot;@@$docId &quot;)) {&#10; output.append(&quot; &lt;p&gt; &quot;)&#10; }&#10; sentence_index++&#10; }&#10; // Bounds safety&#10; val safeFrom = span.from.coerceIn(0, texts[docId]!!.length)&#10; val safeTo = span.to.coerceIn(safeFrom, texts[docId]!!.length)&#10; if (useLemma &amp;&amp; morpho[docId] != null) {&#10; val key = &quot;${span.from}-${span.to}&quot;&#10; val lemmaVal = morpho[docId]!![key]?.lemma&#10; if (lemmaVal != null &amp;&amp; lemmaVal != &quot;_&quot;) {&#10; output.append(lemmaVal)&#10; output.append(' ')&#10; } else {&#10; texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)&#10; output.append(' ')&#10; }&#10; } else {&#10; texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)&#10; output.append(' ')&#10; }&#10; real_token_index++&#10; }&#10; &#10; // Remove trailing space and add final newline&#10; if (output.isNotEmpty() &amp;&amp; output.endsWith(&quot; &quot;)) {&#10; output.deleteCharAt(output.length - 1)&#10; }&#10; &#10; return output&#10; }&#10;&#10;&#10; private fun printConlluToken(&#10; token_index: Int,&#10; token: String,&#10; lemma: String = &quot;_&quot;,&#10; upos: String = &quot;_&quot;,&#10; xpos: String = &quot;_&quot;,&#10; feats: String = &quot;_&quot;,&#10; head: String = &quot;_&quot;,&#10; deprel: String = &quot;_&quot;,&#10; deps: String = &quot;_&quot;,&#10; misc: String = &quot;_&quot;,&#10; columns: Int = 10&#10; ): String {&#10; val myUpos = if (COMPATIBILITY_MODE &amp;&amp; upos == &quot;_&quot;) xpos else upos&#10; return when (columns) {&#10; 1 -&gt; (&quot;$token\n&quot;)&#10; 10 -&gt; (&quot;$token_index\t$token\t$lemma\t$myUpos\t$xpos\t$feats\t$head\t$deprel\t$deps\t$misc$tokenSeparator&quot;)&#10; else -&gt; {&#10; val fields = listOf(&#10; token_index.toString(), token, lemma, myUpos, xpos, feats, head, deprel, deps, misc&#10; )&#10; fields.subList(0, min(columns, 10)).joinToString(&quot;\t&quot;, postfix = tokenSeparator)&#10; }&#10; }&#10; }&#10;&#10; private fun tokenOffsetsInSentence(&#10; sentences: ConcurrentHashMap&lt;String, Array&lt;Span&gt;&gt;,&#10; docId: String,&#10; sentence_index: Int,&#10; token_index: Int,&#10; tokens: ConcurrentHashMap&lt;String, Array&lt;Span&gt;&gt;&#10; ): String {&#10; if (sentences[docId] == null || sentences[docId]!!.size &lt;= sentence_index) {&#10; return &quot;&quot;&#10; }&#10; val sentenceEndOffset = sentences[docId]!![sentence_index].to&#10; var i = token_index&#10; val start_offsets_string = StringBuilder()&#10; val end_offsets_string = StringBuilder()&#10; while (tokens[docId] != null &amp;&amp; i &lt; tokens[docId]!!.size &amp;&amp; tokens[docId]!![i].to &lt;= sentenceEndOffset) {&#10; start_offsets_string.append(&quot; &quot;, tokens[docId]!![i].from)&#10; end_offsets_string.append(&quot; &quot;, tokens[docId]!![i].to)&#10; i++&#10; }&#10; return (&#10; StringBuilder() .append(&#10; &quot;# start_offsets = &quot;, tokens[docId]!![token_index].from, start_offsets_string, &quot;\n&quot;,&#10; &quot;# end_offsets = &quot;, sentenceEndOffset, end_offsets_string, &quot;\n&quot;&#10; ).toString())&#10; }&#10;&#10; private fun extractSpans(spans: NodeList): Array&lt;Span&gt; {&#10; val list = ArrayList&lt;Span&gt;()&#10; IntStream.range(0, spans.length).forEach { idx -&gt;&#10; val node = spans.item(idx)&#10; if (node is Element) {&#10; val fromAttr = node.getAttribute(&quot;from&quot;)&#10; val toAttr = node.getAttribute(&quot;to&quot;)&#10; if (fromAttr.isNullOrEmpty() || toAttr.isNullOrEmpty()) {&#10; LOGGER.warning(&quot;Skipping span with empty from/to attribute: from='$fromAttr' to='$toAttr'&quot;)&#10; } else {&#10; try {&#10; val from = Integer.parseInt(fromAttr)&#10; val to = Integer.parseInt(toAttr)&#10; list.add(Span(from, to))&#10; } catch (e: NumberFormatException) {&#10; LOGGER.warning(&quot;Skipping span with invalid numeric offsets: from='$fromAttr' to='$toAttr' : ${e.message}&quot;)&#10; }&#10; }&#10; }&#10; }&#10; return list.toTypedArray()&#10; }&#10;&#10; private fun extractMorphoSpans(&#10; fsSpans: NodeList&#10; ): MutableMap&lt;String, MorphoSpan&gt; {&#10; val UNKNOWN = Regex(&quot;(UNKNOWN|&lt;unknown&gt;)&quot;)&#10; val res: MutableMap&lt;String, MorphoSpan&gt; = HashMap()&#10; IntStream.range(0, fsSpans.length).mapToObj(fsSpans::item).filter { node -&gt; node is Element &amp;&amp; node.getAttribute(&quot;type&quot;) != &quot;alt&quot; }.forEach { node -&gt;&#10; val features = (node as Element).getElementsByTagName(&quot;f&quot;)&#10; val fs = MorphoSpan()&#10; val fromTo = &quot;${node.getAttribute(&quot;from&quot;)}-${node.getAttribute(&quot;to&quot;)}&quot;&#10; IntStream.range(0, features.length).mapToObj(features::item).forEach { feature -&gt;&#10; val attr = (feature as Element).getAttribute(&quot;name&quot;)&#10; val value = feature.textContent.trim()&#10; if (value.isEmpty()) return@forEach&#10; when (attr) {&#10; &quot;lemma&quot; -&gt; if(fs.lemma == &quot;_&quot;) fs.lemma = value.replace(UNKNOWN, &quot;--&quot;)&#10; &quot;upos&quot; -&gt; fs.upos = value&#10; &quot;xpos&quot;, &quot;ctag&quot;, &quot;pos&quot; -&gt; if(fs.xpos == &quot;_&quot;) fs.xpos = value.replace(UNKNOWN, &quot;--&quot;)&#10; &quot;feats&quot;, &quot;msd&quot; -&gt; if(fs.feats == &quot;_&quot; ) fs.feats = value&#10; &quot;type&quot; -&gt; if(fs.feats == &quot;_&quot;) fs.feats = feature.getElementsByTagName(&quot;symbol&quot;).item(0).attributes.getNamedItem(&quot;value&quot;).textContent.trim()&#10; // &quot;subtype&quot; -&gt; if(fs.feats == &quot;_&quot;) fs.feats += &quot;:&quot; + feature.getElementsByTagName(&quot;symbol&quot;).item(0).attributes.getNamedItem(&quot;value&quot;).textContent&#10; &quot;certainty&quot; -&gt; if(fs.misc == &quot;_&quot;) fs.misc = value&#10; }&#10; }&#10; res[fromTo] = fs&#10; }&#10; return res&#10; }&#10;&#10; private fun extractSentenceSpans(spans: NodeList): Array&lt;Span&gt; {&#10; return IntStream.range(0, spans.length).mapToObj(spans::item)&#10; .filter { node -&gt; node is Element &amp;&amp; node.getElementsByTagName(&quot;f&quot;).item(0).textContent.equals(&quot;s&quot;) }&#10; .map { node -&gt;&#10; Span(&#10; Integer.parseInt((node as Element).getAttribute(&quot;from&quot;)), Integer.parseInt(node.getAttribute(&quot;to&quot;))&#10; )&#10; }.toArray { size -&gt; arrayOfNulls(size) }&#10; }&#10;&#10; /*&#10; &lt;span id=&quot;s15&quot; from=&quot;370&quot; to=&quot;394&quot; l=&quot;5&quot;&gt;&#10; &lt;fs type=&quot;struct&quot; xmlns=&quot;http://www.tei-c.org/ns/1.0&quot;&gt;&#10; &lt;f name=&quot;name&quot;&gt;posting&lt;/f&gt;&#10; &lt;f name=&quot;attr&quot;&gt;&#10; &lt;fs type=&quot;attr&quot;&gt;&#10; &lt;f name=&quot;id&quot;&gt;i.10894_1_3&lt;/f&gt;&#10; &lt;f name=&quot;indentLevel&quot;&gt;0&lt;/f&gt;&#10; &lt;f name=&quot;who&quot;&gt;WU00000000&lt;/f&gt;&#10; &lt;/fs&gt;&#10; &lt;/f&gt;&#10; &lt;/fs&gt;&#10; &lt;/span&gt;&#10;&#10; */&#10; private fun extractMiscSpans(spans: NodeList): MutableMap&lt;String, String&gt; {&#10; val miscLocal: MutableMap&lt;String, String&gt; = HashMap()&#10;&#10; IntStream.range(0, spans.length).mapToObj(spans::item)&#10; .filter { node -&gt;&#10; node is Element&#10; &amp;&amp; node.getElementsByTagName(&quot;f&quot;).length &gt; 1&#10; &amp;&amp; (node.getElementsByTagName(&quot;f&quot;).item(0) as Element).getAttribute(&quot;name&quot;).equals(&quot;name&quot;)&#10; &amp;&amp; (node.getElementsByTagName(&quot;f&quot;).item(1) as Element).getAttribute(&quot;name&quot;).equals(&quot;attr&quot;)&#10; }&#10; .forEach { node -&gt;&#10; if (node == null) return@forEach&#10; val elementName = (node as Element).getElementsByTagName(&quot;f&quot;).item(0).textContent.trim()&#10; val from = node.getAttribute(&quot;from&quot;)&#10; val attributes = (node.getElementsByTagName(&quot;f&quot;).item(1) as Element).getElementsByTagName(&quot;f&quot;)&#10; val res = StringBuilder()&#10; IntStream.range(0, attributes.length).mapToObj(attributes::item).forEach { attr -&gt;&#10; val attrName = &quot;$elementName/${(attr as Element).getAttribute(&quot;name&quot;)}&quot;&#10; if (attrName.matches(Regex(extractAttributesRegex))) {&#10; res.append(&quot;# $attrName = ${attr.textContent}\n&quot;)&#10; //LOGGER.info(&quot;&quot; + from + &quot;: $attrName = &quot; + attr.textContent)&#10; }&#10;&#10; }&#10; if (res.isNotEmpty()) {&#10; if (miscLocal.containsKey(from)) {&#10; // LOGGER.info(&quot;ADDING TO $from: ${miscLocal[from]}&quot;)&#10; miscLocal[from] += res.toString()&#10; } else {&#10; miscLocal[from] = res.toString()&#10; }&#10; }&#10; }&#10; return miscLocal&#10; }&#10;&#10;&#10; class Span(var from: Int, var to: Int)&#10;&#10; class MorphoSpan(&#10; var lemma: String? = &quot;_&quot;,&#10; var upos: String? = &quot;_&quot;,&#10; var xpos: String? = &quot;_&quot;,&#10; var feats: String? = &quot;_&quot;,&#10; var head: String? = &quot;_&quot;,&#10; var deprel: String? = &quot;_&quot;,&#10; var deps: String? = &quot;_&quot;,&#10; var misc: String? = &quot;_&quot;&#10; )&#10;&#10;}&#10;&#10;fun main(args: Array&lt;String&gt;): Unit = exitProcess(CommandLine(KorapXmlTool()).execute(*args))&#10;&#10;fun debug(args: Array&lt;String&gt;): Int {&#10; return (CommandLine(KorapXmlTool()).execute(*args))&#10;}&#10;&#10;enum class OutputFormat {&#10; CONLLU, WORD2VEC, KORAPXML, NOW&#10;}&#10;&#10;object ConlluOutputFormat {&#10; const val NAME = &quot;conllu&quot;&#10;}&#10;&#10;object Word2VecOutputFormat {&#10; const val NAME = &quot;word2vec&quot;&#10;}&#10;&#10;object KorapXmlOutputFormat {&#10; const val NAME = &quot;korapxml&quot;&#10;}&#10;&#10;object NowOutputFormat {&#10; const val NAME = &quot;now&quot;&#10;}" />
</PendingDiffInfo>
</value>
</entry>
<entry key="$PROJECT_DIR$/build.gradle">
<value>
<PendingDiffInfo>
<option name="filePath" value="$PROJECT_DIR$/build.gradle" />
<option name="originalContent" value="repositories {&#10; flatDir {&#10; dirs(&quot;libs&quot;)&#10; }&#10;}&#10;" />
<option name="updatedContent" value="repositories {&#10; flatDir {&#10; dirs(&quot;libs&quot;)&#10; }&#10;}&#10;&#10;// Zentrale Projektversion für korapxmltool&#10;version = '2.0-beta-02'" />
</PendingDiffInfo>
</value>
</entry>
</map>
</option>
</component>
</project>