Bump version to 2.01
Change-Id: If53a51815e42ddcac3d58e87e4d1ef1fbf09dbe1
diff --git a/.idea/copilotDiffState.xml b/.idea/copilotDiffState.xml
deleted file mode 100644
index f504029..0000000
--- a/.idea/copilotDiffState.xml
+++ /dev/null
@@ -1,36 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
- <component name="CopilotDiffPersistence">
- <option name="pendingDiffs">
- <map>
- <entry key="$PROJECT_DIR$/app/build.gradle">
- <value>
- <PendingDiffInfo>
- <option name="filePath" value="$PROJECT_DIR$/app/build.gradle" />
- <option name="originalContent" value="plugins { // Apply the org.jetbrains.kotlin.jvm Plugin to add support for Kotlin. id 'org.jetbrains.kotlin.jvm' version '2.2.21' // Apply the application plugin to add support for building a CLI application in Java. id 'application' id 'com.github.johnrengelman.shadow' version '8.1.1' } repositories { mavenCentral() maven { url 'https://jitpack.io' } } test { minHeapSize = "512m" maxHeapSize = "4096m" jvmArgs '-XX:MaxMetaspaceSize=1024m' } dependencies { // Align versions of all Kotlin components implementation platform('org.jetbrains.kotlin:kotlin-bom') // Use the Kotlin JDK 8 standard library. implementation 'org.jetbrains.kotlin:kotlin-stdlib' implementation 'org.jetbrains.kotlinx:kotlinx-coroutines-core:1.10.2' // This dependency is used by the application. implementation 'com.google.guava:guava:33.5.0-jre' implementation ("info.picocli:picocli:4.7.7") // Use the Kotlin test library. testImplementation 'org.jetbrains.kotlin:kotlin-test' // Use the Kotlin JUnit integration. testImplementation 'org.jetbrains.kotlin:kotlin-test-junit' testImplementation "org.jetbrains.kotlin:kotlin-test:2.2.21" implementation 'com.github.kupietz:cistern:v1.0.4' implementation 'org.maltparser:maltparser:1.9.2' implementation 'org.apache.opennlp:opennlp-tools:2.5.6' implementation 'org.slf4j:slf4j-simple:2.0.17' implementation 'org.apache.ant:ant:1.10.15' implementation 'org.apache.commons:commons-compress:1.28.0' } // Erzwinge JDK 21 Toolchain und Bytecode-Level 21 java { toolchain { languageVersion = JavaLanguageVersion.of(21) } } kotlin { jvmToolchain(21) } // Für evtl. vorhandenen Java-Quellcode tasks.withType(JavaCompile).configureEach { options.release = 21 } tasks.withType(org.jetbrains.kotlin.gradle.tasks.KotlinCompile).configureEach { kotlinOptions { jvmTarget = "21" // Falls verfügbar, sorgt dies für konsistente API-Targets ähnlich zu Java --release // freeCompilerArgs += ["-Xjdk-release=21"] } } application { // Define the main class for the application. mainClass = 'de.ids_mannheim.korapxmltools.KorapXmlToolKt' } jar { // Will include every single one of your dependencies, project or not // def lowerCasedName = baseName.toLowerCase() // def normalizedName = lowerCasedName.substring(0,1).toUpperCase() + lowerCasedName.substring(1) manifest.attributes( 'Class-Path': configurations.compileClasspath.collect { it.getName() }.join(' '), 'Main-Class': "de.ids_mannheim.korapxmltools.KorapXmlToolKt" ) shadowJar { archiveBaseName.set('korapxmltool') archiveClassifier.set('') archiveVersion.set('') } } configurations { runtimeLib.extendsFrom implementation } " />
- <option name="updatedContent" value="plugins { // Apply the org.jetbrains.kotlin.jvm Plugin to add support for Kotlin. id 'org.jetbrains.kotlin.jvm' version '2.2.21' // Apply the application plugin to add support for building a CLI application in Java. id 'application' id 'com.github.johnrengelman.shadow' version '8.1.1' } repositories { mavenCentral() maven { url 'https://jitpack.io' } } test { minHeapSize = "512m" maxHeapSize = "4096m" jvmArgs '-XX:MaxMetaspaceSize=1024m' } dependencies { // Align versions of all Kotlin components implementation platform('org.jetbrains.kotlin:kotlin-bom') // Use the Kotlin JDK 8 standard library. implementation 'org.jetbrains.kotlin:kotlin-stdlib' implementation 'org.jetbrains.kotlinx:kotlinx-coroutines-core:1.10.2' // This dependency is used by the application. implementation 'com.google.guava:guava:33.5.0-jre' implementation ("info.picocli:picocli:4.7.7") // Use the Kotlin test library. testImplementation 'org.jetbrains.kotlin:kotlin-test' // Use the Kotlin JUnit integration. testImplementation 'org.jetbrains.kotlin:kotlin-test-junit' testImplementation "org.jetbrains.kotlin:kotlin-test:2.2.21" implementation 'com.github.kupietz:cistern:v1.0.4' implementation 'org.maltparser:maltparser:1.9.2' implementation 'org.apache.opennlp:opennlp-tools:2.5.6' implementation 'org.slf4j:slf4j-simple:2.0.17' implementation 'org.apache.ant:ant:1.10.15' implementation 'org.apache.commons:commons-compress:1.28.0' } // Erzwinge JDK 21 Toolchain und Bytecode-Level 21 java { toolchain { languageVersion = JavaLanguageVersion.of(21) } } kotlin { jvmToolchain(21) } // Für evtl. vorhandenen Java-Quellcode tasks.withType(JavaCompile).configureEach { options.release = 21 } tasks.withType(org.jetbrains.kotlin.gradle.tasks.KotlinCompile).configureEach { kotlinOptions { jvmTarget = "21" // Falls verfügbar, sorgt dies für konsistente API-Targets ähnlich zu Java --release // freeCompilerArgs += ["-Xjdk-release=21"] } } application { // Define the main class for the application. mainClass = 'de.ids_mannheim.korapxmltools.KorapXmlToolKt' } jar { // Will include every single one of your dependencies, project or not // def lowerCasedName = baseName.toLowerCase() // def normalizedName = lowerCasedName.substring(0,1).toUpperCase() + lowerCasedName.substring(1) manifest.attributes( 'Class-Path': configurations.compileClasspath.collect { it.getName() }.join(' '), 'Main-Class': "de.ids_mannheim.korapxmltools.KorapXmlToolKt", 'Implementation-Title': rootProject.name, 'Implementation-Version': project.version ) shadowJar { archiveBaseName.set('korapxmltool') archiveClassifier.set('') // Version ins Dateinamen aufnehmen archiveVersion.set(project.version.toString()) manifest.attributes( 'Main-Class': "de.ids_mannheim.korapxmltools.KorapXmlToolKt", 'Implementation-Title': rootProject.name, 'Implementation-Version': project.version ) } } configurations { runtimeLib.extendsFrom implementation }" />
- </PendingDiffInfo>
- </value>
- </entry>
- <entry key="$PROJECT_DIR$/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt">
- <value>
- <PendingDiffInfo>
- <option name="filePath" value="$PROJECT_DIR$/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt" />
- <option name="originalContent" value="package de.ids_mannheim.korapxmltools import de.ids_mannheim.korapxmltools.AnnotationToolBridgeFactory.Companion.parserFoundries import de.ids_mannheim.korapxmltools.AnnotationToolBridgeFactory.Companion.taggerFoundries import org.apache.commons.compress.archivers.zip.Zip64Mode import org.apache.commons.compress.archivers.zip.ZipArchiveEntry import org.w3c.dom.Document import org.w3c.dom.Element import org.w3c.dom.NodeList import org.xml.sax.InputSource import org.xml.sax.SAXParseException import picocli.CommandLine import picocli.CommandLine.* import java.io.File import java.io.FileOutputStream import java.io.InputStream import java.io.StringWriter import java.lang.Integer.parseInt import java.util.* import java.util.concurrent.Callable import java.util.concurrent.ConcurrentHashMap import java.util.concurrent.Executors import java.util.concurrent.atomic.AtomicLong import java.util.logging.ConsoleHandler import java.util.logging.Level import java.util.logging.LogManager import java.util.logging.Logger import java.util.regex.Matcher import java.util.regex.Pattern import java.util.stream.IntStream import java.util.zip.ZipEntry import java.util.zip.ZipFile import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream import javax.xml.parsers.DocumentBuilder import javax.xml.parsers.DocumentBuilderFactory import javax.xml.transform.OutputKeys import javax.xml.transform.TransformerFactory import javax.xml.transform.dom.DOMSource import javax.xml.transform.stream.StreamResult import kotlin.math.min import kotlin.system.exitProcess val ZIP_ENTRY_UNIX_MODE = parseInt("644", 8) @Command( name = "KorapXmlTool", mixinStandardHelpOptions = true, version = ["KorapXmlTool 2.0-beta-01"], description = ["Converts KorAP-XML <https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml> base or " + "morpho zips to (annotated) CoNLL(-U) format with all information necessary for " + "reconstruction in comment lines."] ) class KorapXmlTool : Callable<Int> { val COMPATIBILITY_MODE = System.getenv("COMPATIBILITY_MODE") != null @Spec lateinit var spec : Model.CommandSpec @Parameters(arity = "1..*", description = ["At least one zip file name"]) var zipFileNames: Array<String>? = null @Option( names = ["-f", "--output-format"], description = ["Output format: ${ConlluOutputFormat.NAME}, ${Word2VecOutputFormat.NAME}, ${KorapXmlOutputFormat.NAME}, ${NowOutputFormat.NAME}", "conllu: CoNLL-U format", "korapxml, xml, zip: KorAP-XML format zip", "word2vec, w2v: Print text in LM training format: tokens separated by space, sentences separated by newlines", "now, NOW: NOW corpus export format: w2v-like format with <p> tags for sentence ends and @@<text-sigle> prefix", ], converter = [OutputFormatConverter::class] ) var outputFormat: OutputFormat = OutputFormat.CONLLU class OutputFormatConverter : ITypeConverter<OutputFormat> { override fun convert(value: String?): OutputFormat { return when (value?.lowercase(Locale.getDefault())) { "conllu", "conll" -> OutputFormat.CONLLU "word2vec", "w2v" -> OutputFormat.WORD2VEC "korapxml", "korap", "xml", "zip" -> OutputFormat.KORAPXML "now", "NOW" -> OutputFormat.NOW else -> throw IllegalArgumentException("Unknown output format: `$value'. Use one of: ${OutputFormat.entries.joinToString(", ") { it.name }}") } } } @Option( names = ["--sigle-pattern", "-p"], paramLabel = "PATTERN", description = ["Extract only documents with sigle matching the pattern (regex)"] ) var siglePattern: String? 
= null @Option( names = ["--extract-attributes-regex", "-e"], paramLabel = "REGEX", description = ["Extract additional attribute values from structure.xml and writes them as comment line in front of the first covered token.", "Example: -e '(posting/id|div/id)'"] ) var extractAttributesRegex: String = "" @Option( names = ["--s-bounds-from-morpho"], description = ["Not yet implemented: s bounds from morpho"] ) var sBoundsFromMorpho: Boolean = false @Option( names = ["--log", "-l"], paramLabel = "LEVEL", description = ["Log level: one of SEVERE, WARNING, INFO, FINE, FINER, FINEST. Default: ${"$"}{DEFAULT-VALUE}])"] ) var logLevel: String = "WARNING" @Option( names = ["--columns", "-c"], paramLabel = "NUMBER", description = ["Number of columns. 1 means just the token. Default: ${"$"}{DEFAULT-VALUE}", "Possible values: 1-10"] ) var columns: Int = 10 @Option( names = ["--word2vec", "-w"], description = ["Print text in LM training format: tokens separated by space, sentences separated by newline", "Deprecated: use -f word2vec"] ) fun setWord2Vec(word2vec: Boolean) { if (word2vec) { outputFormat = OutputFormat.WORD2VEC } } @Option( names = ["--exclude-zip-glob"], paramLabel = "GLOB", description = [ "Exclude zip files whose basename matches the glob (e.g., 'w?d24.tree_tagger.zip').", "May be repeated. Applied to basenames, not full paths." ] ) var excludeZipGlobs: MutableList<String> = mutableListOf() @Option( names = ["--token-separator", "-s"], paramLabel = "STRING", defaultValue = "\n", description = ["Token separator. Default: new-line for CoNLL-U, space for word2vec format."] ) var tokenSeparator: String = if (outputFormat == OutputFormat.WORD2VEC || outputFormat == OutputFormat.NOW) " " else "\n" @Option(names = ["--offsets"], description = ["Not yet implemented: offsets"]) var offsets: Boolean = false @Option(names = ["--comments", "-C"], description = ["Not yet implemented: comments"]) var comments: Boolean = false @Option( names = ["--extract-metadata-regex", "-m"], paramLabel = "REGEX", description = ["Extract metadata regexes.\nExample: -m '<textSigle>([^<]+)' -m '<creatDate>([^<]+)'"] ) var extractMetadataRegex: MutableList<String> = mutableListOf() @Option( names = ["--annotate-with", "-A"], paramLabel = "COMMAND", description = ["Pipe output through command"] ) var annotateWith: String = "" @Option( names = ["--threads", "-T"], paramLabel = "THREADS", description = ["Maximum number of threads to use. Default: ${"$"}{DEFAULT-VALUE}"] ) var maxThreads: Int = Runtime.getRuntime().availableProcessors() / 2 fun setThreads(threads: Int) { if (threads < 1) { throw ParameterException(spec.commandLine(), String.format("Invalid value `%d' for option '--threads': must be at least 1", threads)) } this.maxThreads = threads System.setProperty("java.util.concurrent.ForkJoinPool.common.parallelism", threads.toString()) } @Option( names = ["--zip-parallelism"], paramLabel = "N", description = ["Maximum number of zip files to process concurrently. Defaults to --threads."] ) var zipParallelism: Int? = null @Option( names = ["--sequential"], description = [ "Process entries inside each zip sequentially; zips processed in parallel (only for word2vec/now)." 
] ) var sequentialInZip: Boolean = false @Option( names = ["--overwrite", "-o"], description = ["Overwrite existing files"] ) var overwrite: Boolean = false @Option( names = ["--mem-stats-interval"], paramLabel = "N", description = ["Log memory and cache statistics every N processed documents (0 disables; default: 0)"] ) var memStatsInterval: Int = 0 @Option( names = ["--lemma"], description = ["In word2vec/now output modes, output lemmas instead of surface tokens when lemma annotations are available (requires corresponding morpho annotation XML)"] ) var useLemma: Boolean = false @Option( names = ["--lemma-only"], description = [ "Do not load texts from data.xml and output only lemmas (requires morpho.xml).", "Only valid with -f word2vec or -f now; implies --lemma." ] ) var lemmaOnly: Boolean = false private var taggerName: String? = null private var taggerModel: String? = null @Option( names = ["--tag-with", "-t"], paramLabel = "TAGGER:MODEL", description = ["Specify a tagger and a model: ${taggerFoundries}:<path/to/model>."] ) fun setTagWith(tagWith: String) { val pattern: Pattern = Pattern.compile("(${taggerFoundries}):(.+)") val matcher: Matcher = pattern.matcher(tagWith) if (!matcher.matches()) { throw ParameterException(spec.commandLine(), String.format("Invalid value `%s' for option '--tag-with': "+ "value does not match the expected pattern ${taggerFoundries}:<path/to/model>", tagWith)) } else { taggerName = matcher.group(1) taggerModel = matcher.group(2) if (!File(taggerModel).exists()) { throw ParameterException(spec.commandLine(), String.format("Invalid value for option '--tag-with':"+ "model file '%s' does not exist", taggerModel, taggerModel)) } } } private var parserName: String? = null private var parserModel: String? = null @Option( names = ["--parse-with", "-P"], paramLabel = "parser:MODEL", description = ["Specify a parser and a model: ${parserFoundries}:<path/to/model>."] ) fun setParseWith(parseWith: String) { val pattern: Pattern = Pattern.compile("(${parserFoundries}):(.+)") val matcher: Matcher = pattern.matcher(parseWith) if (!matcher.matches()) { throw ParameterException(spec.commandLine(), String.format("Invalid value `%s' for option '--parse-with': "+ "value does not match the expected pattern (${parserFoundries}):<path/to/model>", parseWith)) } else { parserName = matcher.group(1) parserModel = matcher.group(2) if (!File(parserModel).exists()) { throw ParameterException(spec.commandLine(), String.format("Invalid value for option '--parse-with':"+ "model file '%s' does not exist", parserModel, parserModel)) } } } override fun call(): Int { val handler = ConsoleHandler() LogManager.getLogManager().reset() handler.formatter = ColoredFormatter() for (handler in LOGGER.handlers) { LOGGER.removeHandler(handler) } LOGGER.addHandler(handler) LOGGER.level = try { Level.parse(logLevel.uppercase(Locale.getDefault())) } catch (e: IllegalArgumentException) { LOGGER.warning("Invalid log level: $logLevel. Defaulting to WARNING.") Level.WARNING } if (lemmaOnly) { useLemma = true if (outputFormat != OutputFormat.WORD2VEC && outputFormat != OutputFormat.NOW) { throw ParameterException(spec.commandLine(), "--lemma-only is supported only with -f word2vec or -f now") } } LOGGER.info("Processing zip files: " + zipFileNames!!.joinToString(", ")) korapxml2conllu(zipFileNames!!) return 0 } private val LOGGER: Logger = Logger.getLogger(KorapXmlTool::class.java.name) private var annotationWorkerPool : AnnotationWorkerPool? 
= null // Shared executor for entry-level parallelism across all zips private var entryExecutor: java.util.concurrent.ExecutorService? = null val texts: ConcurrentHashMap<String, NonBmpString> = ConcurrentHashMap() val sentences: ConcurrentHashMap<String, Array<Span>> = ConcurrentHashMap() val tokens: ConcurrentHashMap<String, Array<Span>> = ConcurrentHashMap() val morpho: ConcurrentHashMap<String, MutableMap<String, MorphoSpan>> = ConcurrentHashMap() val fnames: ConcurrentHashMap<String, String> = ConcurrentHashMap() val metadata: ConcurrentHashMap<String, Array<String>> = ConcurrentHashMap() val extraFeatures: ConcurrentHashMap<String, MutableMap<String, String>> = ConcurrentHashMap() private val processedDocs = java.util.concurrent.atomic.AtomicInteger(0) var taggerToolBridges: ConcurrentHashMap<Long, TaggerToolBridge?> = ConcurrentHashMap() var parserToolBridges: ConcurrentHashMap<Long, ParserToolBridge?> = ConcurrentHashMap() // Zip progress tracking for logging (zipNumber/zipTotal) private val zipOrdinals: ConcurrentHashMap<String, Int> = ConcurrentHashMap() private var totalZips: Int = 0 private val zipSizes: ConcurrentHashMap<String, Long> = ConcurrentHashMap() private val processedZipBytes: AtomicLong = AtomicLong(0) private var totalZipBytes: Long = 0 private var startTimeMillis: Long = 0 var dbFactory: DocumentBuilderFactory? = null var dBuilder: DocumentBuilder? = null var morphoZipOutputStream: ZipArchiveOutputStream? = null fun String.hasCorrespondingBaseZip(): Boolean { if (!this.matches(Regex(".*\\.([^/.]+)\\.zip$"))) return false val baseZip = this.replace(Regex("\\.([^/.]+)\\.zip$"), ".zip") return File(baseZip).exists() } fun String.correspondingBaseZip(): String? { if (!this.matches(Regex(".*\\.([^/.]+)\\.zip$"))) return null val baseZip = this.replace(Regex("\\.([^/.]+)\\.zip$"), ".zip") return if (File(baseZip).exists()) baseZip else null } fun korapxml2conllu(args: Array<String>) { if (outputFormat == OutputFormat.KORAPXML && annotateWith.isNotEmpty()) { LOGGER.severe("Shell command annotation is not yet supported with output format $outputFormat") exitProcess(1) } // Initialize shared entry executor (used inside each zip) entryExecutor = Executors.newFixedThreadPool(maxThreads) if (annotateWith.isNotEmpty()) { annotationWorkerPool = AnnotationWorkerPool(annotateWith, maxThreads, LOGGER) } var zips: Array<String> = args if (excludeZipGlobs.isNotEmpty()) { val before = zips.size val patterns = excludeZipGlobs.map { globToRegex(it) } zips = zips.filter { zipPath -> val base = File(zipPath).name patterns.none { rx -> rx.matches(base) } }.toTypedArray() val excluded = before - zips.size if (excluded > 0) { LOGGER.info("Excluded $excluded of $before zip(s) by glob(s): ${excludeZipGlobs.joinToString(", ")}") } } // Initialize zip progress tracking and sizes startTimeMillis = System.currentTimeMillis() processedZipBytes.set(0) totalZips = zips.size zipOrdinals.clear() zipSizes.clear() zips.forEach { zip -> zipSizes[zip] = try { File(zip).length() } catch (_: Exception) { 0L } } totalZipBytes = zipSizes.values.sum() // In lemma-only mode, process largest zips first if (lemmaOnly) { zips = zips.sortedByDescending { zipSizes[it] ?: 0L }.toTypedArray() } zips.forEachIndexed { index, zip -> zipOrdinals[zip] = index + 1 } // Log zip order with sizes so the user can verify sorting val totalHuman = humanBytes(totalZipBytes) LOGGER.info("Zip processing order (${zips.size} file(s), total ${totalHuman}):") zips.forEachIndexed { idx, zip -> val size = zipSizes[zip] ?: 0L 
LOGGER.info(String.format(Locale.ROOT, "%d/%d: %s (%s)", idx + 1, zips.size, zip, humanBytes(size))) } if (sequentialInZip) { if (outputFormat != OutputFormat.WORD2VEC && outputFormat != OutputFormat.NOW) { throw ParameterException(spec.commandLine(), "--sequential is supported only with -f word2vec or -f now") } } if (maxThreads > 1) { val foundry = getFoundryFromZipFileNames(zips) val parallelism = (zipParallelism ?: maxThreads).coerceAtLeast(1) LOGGER.info("Processing zips with ordered queue; parallelism=$parallelism; entries ${if (sequentialInZip) "sequential" else "parallel"}") processZipsWithQueue(zips, foundry, parallelism) } else { LOGGER.info("Processing zip files sequentially") Arrays.stream(zips).forEachOrdered { zipFilePath -> processZipFileSequentially((zipFilePath ?: "").toString(), getFoundryFromZipFileNames(zips)) } } if (annotationWorkerPool != null) { LOGGER.info("closing worker pool") annotationWorkerPool?.close() } // Shutdown entry executor entryExecutor?.shutdown() } private fun processZipsWithQueue(zips: Array<String>, foundry: String, parallelism: Int) { val queue: java.util.concurrent.BlockingQueue<String> = java.util.concurrent.LinkedBlockingQueue() zips.forEach { queue.put(it) } val executor = Executors.newFixedThreadPool(parallelism) val active = java.util.concurrent.atomic.AtomicInteger(0) repeat(parallelism) { executor.submit { active.incrementAndGet() try { while (true) { val zipPath = queue.poll(100, java.util.concurrent.TimeUnit.MILLISECONDS) if (zipPath == null) { if (queue.isEmpty()) break else continue } if (sequentialInZip) { processZipFileSequentially(zipPath, foundry) } else { processZipFile(zipPath, foundry) } } } finally { active.decrementAndGet() } } } executor.shutdown() try { executor.awaitTermination(7, java.util.concurrent.TimeUnit.DAYS) } catch (ie: InterruptedException) { Thread.currentThread().interrupt() } } // Convert a shell-like glob to a Regex: '*' -> ".*", '?' -> '.', anchored full match private fun globToRegex(glob: String): Regex { val sb = StringBuilder("^") glob.forEach { ch -> when (ch) { '*' -> sb.append(".*") '?' 
-> sb.append('.') '.', '(', ')', '+', '|', '^', '$', '@', '%', '{', '}', '[', ']', '\\' -> sb.append('\\').append(ch) else -> sb.append(ch) } } sb.append('$') return Regex(sb.toString()) } private fun getTokenSpansFromMorho(morpho: MutableMap<String, MorphoSpan>): Array<Span> { return morpho.keys.map { key -> val fromTo = key.split("-") Span(fromTo[0].toInt(), fromTo[1].toInt()) }.sortedBy { it.from }.toTypedArray() } private fun getFoundryFromZipFileName(zipFileName: String): String { if (!zipFileName.matches(Regex(".*\\.([^/.]+)\\.zip$"))) { return "base" } return zipFileName.replace(Regex(".*\\.([^/.]+)\\.zip$"), "$1") } private fun getFoundryFromZipFileNames(zipFileNames: Array<String>): String { for (zipFileName in zipFileNames) { val foundry = getFoundryFromZipFileName(zipFileName) if (foundry != "base") { return foundry } } return "base" } private fun processZipFile(zipFilePath: String, foundry: String = "base") { val ord = zipOrdinals[zipFilePath] ?: 0 val size = zipSizes[zipFilePath] ?: 0L LOGGER.info("Processing zip ${if (ord>0) ord else "?"}/$totalZips: ${zipFilePath} (${humanBytes(size)}) in thread ${Thread.currentThread().threadId()}") LOGGER.info("Foundry: $foundry $dbFactory") if (outputFormat == OutputFormat.KORAPXML && dbFactory == null) { var targetFoundry = "base" if (taggerName != null) { val tagger = AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER) as TaggerToolBridge? if (tagger != null) { targetFoundry = tagger.foundry } } else if (parserName != null) { targetFoundry = parserName!! } dbFactory = DocumentBuilderFactory.newInstance() dBuilder = dbFactory!!.newDocumentBuilder() val outputMorphoZipFileName = if (parserName != null) zipFilePath.replace(Regex("(\\.(opennlp|marmot|tree_tagger|corenlp|spacy))?\\.zip$"), ".".plus(parserName).plus(".zip")) else zipFilePath.replace(Regex("\\.zip$"), ".".plus(targetFoundry).plus(".zip")) if (File(outputMorphoZipFileName).exists() && !overwrite) { LOGGER.severe("Output file $outputMorphoZipFileName already exists. Use --overwrite to overwrite.") exitProcess(1) } val fileOutputStream = FileOutputStream(outputMorphoZipFileName) morphoZipOutputStream = ZipArchiveOutputStream(fileOutputStream).apply { setUseZip64(Zip64Mode.Always) } } if (zipFilePath.hasCorrespondingBaseZip()) { val relatedZips = arrayOf(zipFilePath, zipFilePath.correspondingBaseZip()!!) // Process related zips one after another to keep the ZipFile lifetime strictly bounded relatedZips.forEach { zip -> ZipFile(zip).use { zipFile -> processZipEntriesWithPool(zipFile, foundry, true) } } } else { ZipFile(zipFilePath).use { zipFile -> processZipEntriesWithPool(zipFile, foundry, false) } } if (outputFormat == OutputFormat.KORAPXML) { morphoZipOutputStream!!.close() } logZipProgress(zipFilePath) } private fun processZipFileSequentially(zipFilePath: String, foundry: String = "base") { val ord = zipOrdinals[zipFilePath] ?: 0 val size = zipSizes[zipFilePath] ?: 0L LOGGER.info("Processing zip ${if (ord>0) ord else "?"}/$totalZips: ${zipFilePath} (${humanBytes(size)}) in thread ${Thread.currentThread().threadId()}") if (zipFilePath.hasCorrespondingBaseZip()) { // Process the two related zips strictly sequentially to limit memory growth val zips = arrayOf(zipFilePath, zipFilePath.correspondingBaseZip()!!) 
zips.forEach { zip -> ZipFile(zip).use { zipFile -> // Iterate entries in a deterministic order to keep related files close together zipFile.stream() .filter { extractMetadataRegex.isNotEmpty() || !it.name.contains("header.xml") } .sorted(Comparator.comparing<ZipEntry, String> { it.name }) .forEachOrdered { zipEntry -> processZipEntry(zipFile, foundry, zipEntry, true) } } } } else { ZipFile(zipFilePath).use { zipFile -> zipFile.stream() .filter { extractMetadataRegex.isNotEmpty() || !it.name.contains("header.xml") } .sorted(Comparator.comparing<ZipEntry, String> { it.name }) .forEachOrdered { zipEntry -> processZipEntry(zipFile, foundry, zipEntry, false) } } } logZipProgress(zipFilePath) } private fun logZipProgress(zipFilePath: String) { try { val size = zipSizes[zipFilePath] ?: 0L val done = processedZipBytes.addAndGet(size) val total = if (totalZipBytes > 0) totalZipBytes else 1L val elapsedMs = (System.currentTimeMillis() - startTimeMillis).coerceAtLeast(1) val speedBytesPerSec = (done * 1000.0) / elapsedMs val remaining = (total - done).coerceAtLeast(0) val etaSeconds = if (speedBytesPerSec > 0.0) (remaining / speedBytesPerSec).toLong() else -1L val ord = zipOrdinals[zipFilePath] ?: 0 val pct = (done * 100.0 / total).coerceIn(0.0, 100.0) val humanSpeed = String.format(Locale.ROOT, "%.2f MB/s", speedBytesPerSec / (1024.0 * 1024.0)) val etaStr = if (etaSeconds >= 0) formatDuration(etaSeconds) else "unknown" LOGGER.info( "Finished zip ${if (ord>0) ord else "?"}/$totalZips: ${zipFilePath} " + "(${humanBytes(size)}). Progress: ${String.format(Locale.ROOT, "%.1f", pct)}%%, " + "ETA ${etaStr} at ${humanSpeed}" ) } catch (e: Exception) { LOGGER.fine("Failed to log zip progress for $zipFilePath: ${e.message}") } } private fun humanBytes(bytes: Long): String { if (bytes < 1024) return "$bytes B" val kb = bytes / 1024.0 if (kb < 1024) return String.format(Locale.ROOT, "%.1f KB", kb) val mb = kb / 1024.0 if (mb < 1024) return String.format(Locale.ROOT, "%.1f MB", mb) val gb = mb / 1024.0 return String.format(Locale.ROOT, "%.1f GB", gb) } private fun formatDuration(seconds: Long): String { var s = seconds val h = s / 3600; s %= 3600 val m = s / 60; val sec = s % 60 return String.format(Locale.ROOT, "%02d:%02d:%02d", h, m, sec) } private fun processZipEntriesWithPool(zipFile: ZipFile, foundry: String, waitForMorpho: Boolean) { // Collect entries first to avoid lazy evaluation surprises, filter header.xml unless metadata extraction is requested val entries: MutableList<ZipEntry> = ArrayList() val enumEntries = zipFile.entries() while (enumEntries.hasMoreElements()) { val e = enumEntries.nextElement() if (extractMetadataRegex.isEmpty() && e.name.contains("header.xml")) continue entries.add(e) } if (entries.isEmpty()) return // If only one thread requested, do sequential to avoid pool overhead if (maxThreads <= 1) { entries.forEach { entry -> processZipEntry(zipFile, foundry, entry, waitForMorpho) } return } // Submit all entry tasks to the shared executor and await completion before closing the zip val latch = java.util.concurrent.CountDownLatch(entries.size) entries.forEach { entry -> entryExecutor?.execute { try { processZipEntry(zipFile, foundry, entry, waitForMorpho) } catch (t: Throwable) { LOGGER.warning("Failed to process entry ${entry.name}: ${t.message}") } finally { latch.countDown() } } } try { latch.await() } catch (ie: InterruptedException) { Thread.currentThread().interrupt() } } fun processZipEntry(zipFile: ZipFile, _foundry: String, zipEntry: ZipEntry, passedWaitForMorpho: Boolean) { 
var foundry = _foundry var waitForMorpho = passedWaitForMorpho LOGGER.finer("Processing ${zipEntry.name} in thread ${Thread.currentThread().threadId()}") if (taggerName != null && !taggerToolBridges.containsKey(Thread.currentThread().threadId())) { val tagger = AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER) as TaggerToolBridge? taggerToolBridges[Thread.currentThread().threadId()] = tagger if (tagger != null) { foundry = tagger.foundry } } if (parserName != null && !parserToolBridges.containsKey(Thread.currentThread().threadId())) { val parser = AnnotationToolBridgeFactory.getAnnotationToolBridge(parserName!!, parserModel!!, LOGGER) as ParserToolBridge? parserToolBridges[Thread.currentThread().threadId()] = parser if (parser != null) { foundry = "$foundry dependency:${parser.foundry}" LOGGER.fine("Initialized parser ${parserName} with foundry $foundry in thread ${Thread.currentThread().threadId()}") } } try { if (zipEntry.name.matches(Regex(".*(data|tokens|structure|morpho)\\.xml$"))) { // Ensure the entry stream and reader are closed to avoid native memory buildup val dbFactory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance() val dBuilder: DocumentBuilder = dbFactory.newDocumentBuilder() // In lemma-only mode, skip parsing data.xml entirely to reduce memory pressure if (lemmaOnly && zipEntry.name.endsWith("data.xml")) { return } val doc: Document = try { zipFile.getInputStream(zipEntry).use { inputStream -> XMLCommentFilterReader(inputStream, "UTF-8").use { reader -> dBuilder.parse(InputSource(reader)) } } } catch (e: SAXParseException) { LOGGER.warning("Error parsing file: " + zipEntry.name + " " + e.message) return } doc.documentElement.normalize() val docId: String = doc.documentElement.getAttribute("docid") if (siglePattern != null && !Regex(siglePattern!!).containsMatchIn(docId)) { return } // LOGGER.info("Processing file: " + zipEntry.getName()) val fileName = zipEntry.name.replace(Regex(".*?/([^/]+\\.xml)$"), "$1") when (fileName) { "data.xml" -> { if (!lemmaOnly) { val textsList: NodeList = doc.getElementsByTagName("text") if (textsList.length > 0) { texts[docId] = NonBmpString(textsList.item(0).textContent) } } } "structure.xml" -> { val spans: NodeList = doc.getElementsByTagName("span") if (extractAttributesRegex.isNotEmpty()) extraFeatures[docId] = extractMiscSpans(spans) sentences[docId] = extractSentenceSpans(spans) } "tokens.xml" -> { if (!fnames.contains(docId)) { fnames[docId] = zipEntry.name } val tokenSpans: NodeList = doc.getElementsByTagName("span") tokens[docId] = extractSpans(tokenSpans) } "morpho.xml" -> { waitForMorpho = true fnames[docId] = zipEntry.name val fsSpans: NodeList = doc.getElementsByTagName("span") morpho[docId] = extractMorphoSpans(fsSpans) tokens[docId] = extractSpans(fsSpans) } } val morphoRequired = waitForMorpho || useLemma || taggerName != null || parserName != null || outputFormat == OutputFormat.KORAPXML // For lemma-only/lemma-based word2vec/now, we can proceed without full text val textRequired = when (outputFormat) { OutputFormat.WORD2VEC, OutputFormat.NOW -> !(useLemma || lemmaOnly) else -> true } if ((texts[docId] != null || !textRequired) && sentences[docId] != null && tokens[docId] != null && (!morphoRequired || morpho[docId] != null) && (extractMetadataRegex.isEmpty() || metadata[docId] != null) ) { // Be quiet on INFO; per-text logs only on FINE and below LOGGER.fine("Processing text: $docId in thread ${Thread.currentThread().threadId()}") processText(docId, foundry) } } else if 
(extractMetadataRegex.isNotEmpty() && zipEntry.name.matches(Regex(".*/header\\.xml$"))) { //LOGGER.info("Processing header file: " + zipEntry.name) val text = zipFile.getInputStream(zipEntry).bufferedReader().use { it.readText() } val docId = Regex("<textSigle>([^<]+)</textSigle>").find(text)?.destructured?.component1() ?.replace(Regex("/"), "_") LOGGER.fine("Processing header file: " + zipEntry.name + " docId: " + docId) val meta = ArrayList<String>() extractMetadataRegex.forEach { regex -> val match = Regex(regex).find(text) if (match != null) { meta.add(match.destructured.component1()) } } if (meta.isNotEmpty() && docId != null) { metadata[docId] = meta.toTypedArray() val morphoRequired = waitForMorpho || useLemma || taggerName != null || parserName != null || outputFormat == OutputFormat.KORAPXML val textRequired = when (outputFormat) { OutputFormat.WORD2VEC, OutputFormat.NOW -> !(useLemma || lemmaOnly) else -> true } if ((texts[docId] != null || !textRequired) && sentences[docId] != null && tokens[docId] != null && (!morphoRequired || morpho[docId] != null) ) { // Be quiet on INFO; per-text logs only on FINE and below LOGGER.fine("Processing text (meta-ready): $docId in thread ${Thread.currentThread().threadId()}") processText(docId, foundry) } } } } catch (e: Exception) { e.printStackTrace() } } private fun processText( docId: String, foundry: String, ) { LOGGER.fine("Processing text: $docId in thread ${Thread.currentThread().threadId()}") var morphoFoundry = getMorphoFoundry() val output = if (outputFormat == OutputFormat.WORD2VEC) { lmTrainingOutput(docId) } else if (outputFormat == OutputFormat.NOW) { nowOutput(docId) } else { if (taggerToolBridges[Thread.currentThread().threadId()] != null) { morpho[docId] = taggerToolBridges[Thread.currentThread().threadId()]!!.tagText( tokens[docId]!!, sentences[docId], texts[docId]!! ) } if (parserToolBridges[Thread.currentThread().threadId()] != null) { if (morpho[docId] == null) { LOGGER.severe("No morpho data for $docId") //exitProcess(1) } LOGGER.finer("Parsing text: $docId in thread ${Thread.currentThread().threadId()}") morpho[docId] = parserToolBridges[Thread.currentThread().threadId()]!!.parseText( tokens[docId]!!, morpho[docId], sentences[docId], texts[docId]!! 
) LOGGER.finer("Parsed text: $docId in thread ${Thread.currentThread().threadId()}") } if (outputFormat == OutputFormat.KORAPXML && annotationWorkerPool == null) { korapXmlOutput(getMorphoFoundry(), docId) } else { conlluOutput(foundry, docId) } } if (annotationWorkerPool != null) { annotationWorkerPool?.pushToQueue(output.append("\n# eot\n").toString()) // Release internal char[] early output.setLength(0) } else if (outputFormat != OutputFormat.KORAPXML) { synchronized(System.out) { println(output.toString()) } // Release internal char[] early output.setLength(0) } else { korapXmlOutput(foundry, docId) } arrayOf(tokens, texts, sentences, morpho, fnames, metadata, extraFeatures).forEach { map -> if (map === morpho) { // Clear inner map to release references early morpho[docId]?.clear() } map.remove(docId) } // Periodic GC hint after processing many docs (lightweight safeguard) if ((processedDocs.incrementAndGet() % 2000) == 0) { LOGGER.fine("Processed ${processedDocs.get()} docs – requesting GC hint") System.gc() } // Memory / cache statistics logging if (memStatsInterval > 0) { val count = processedDocs.get() if (count % memStatsInterval == 0) { logMemoryStats(count) } } if (outputFormat == OutputFormat.KORAPXML) { val entryPath = if (parserName != null) docId.replace(Regex("[_.]"), "/").plus("/$parserName/").plus("dependency.xml") else docId.replace(Regex("[_.]"), "/").plus("/$morphoFoundry/").plus("morpho.xml") val zipEntry = ZipArchiveEntry(entryPath) zipEntry.unixMode = ZIP_ENTRY_UNIX_MODE synchronized(morphoZipOutputStream!!) { morphoZipOutputStream!!.putArchiveEntry(zipEntry) morphoZipOutputStream!!.write(output.toString().toByteArray()) morphoZipOutputStream!!.closeArchiveEntry() } output.clear() } } private fun getMorphoFoundry() = taggerToolBridges[Thread.currentThread().threadId()]?.foundry ?: "base" private fun logMemoryStats(count: Int) { try { val rt = Runtime.getRuntime() val used = (rt.totalMemory() - rt.freeMemory()) / (1024 * 1024) val total = rt.totalMemory() / (1024 * 1024) val max = rt.maxMemory() / (1024 * 1024) LOGGER.info( "MEM-STATS docs=${count} usedMB=${used} totalMB=${total} maxMB=${max} " + "maps{texts=${texts.size},tokens=${tokens.size},sentences=${sentences.size},morpho=${morpho.size}}" ) } catch (e: Exception) { LOGGER.warning("Failed to log memory stats: ${e.message}") } } private fun korapXmlDependencyOutput(foundry: String, docId: String): StringBuilder { val doc: Document = dBuilder!!.newDocument() // Root element val layer = doc.createElement("layer") layer.setAttribute("xmlns", "http://ids-mannheim.de/ns/KorAP") layer.setAttribute("version", "KorAP-0.4") layer.setAttribute("docid", docId) doc.appendChild(layer) val spanList = doc.createElement("spanList") layer.appendChild(spanList) var i = 0 var s = 0 var n = 0 val sortedKeys = morpho[docId]?.keys?.sortedBy { it.split("-")[0].toInt() } sortedKeys?.forEach { spanString -> val mfs = morpho[docId]?.get(spanString) val offsets = spanString.split("-") if(offsets.size != 2) { LOGGER.warning("Invalid span: $spanString in $docId") return@forEach } if (offsets[0].toInt() > sentences[docId]!!.elementAt(s).to) { s++ n = i } i++ if (mfs!!.deprel == "_") { return@forEach } val spanNode = doc.createElement("span") spanNode.setAttribute("id", "s${s + 1}_n${i - n}") spanNode.setAttribute("from", offsets[0]) spanNode.setAttribute("to", offsets[1]) // rel element val rel = doc.createElement("rel") rel.setAttribute("label", mfs.deprel) // inner span element val innerSpan = doc.createElement("span") val headInt = 
if(mfs.head == "_") 0 else parseInt(mfs.head) - 1 if (headInt < 0) { innerSpan.setAttribute("from", sentences[docId]!!.elementAt(s).from.toString()) innerSpan.setAttribute("to", sentences[docId]!!.elementAt(s).to.toString()) } else { if (headInt + n >= morpho[docId]!!.size) { LOGGER.warning("Head index out of bounds: ${headInt+n} >= ${morpho[docId]!!.size} in $docId") return@forEach } else { val destSpanString = sortedKeys.elementAt(headInt + n) val destOffsets = destSpanString.split("-") innerSpan.setAttribute("from", destOffsets[0]) innerSpan.setAttribute("to", destOffsets[1]) } } rel.appendChild(innerSpan) spanNode.appendChild(rel) spanList.appendChild(spanNode) } val transformerFactory = TransformerFactory.newInstance() val transformer = transformerFactory.newTransformer() transformer.setOutputProperty(OutputKeys.INDENT, "yes") transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no") transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "1") val domSource = DOMSource(doc) val streamResult = StreamResult(StringWriter()) transformer.transform(domSource, streamResult) return StringBuilder(streamResult.writer.toString()) } private fun korapXmlOutput(foundry: String, docId: String): StringBuilder { return if (parserName != null) { korapXmlDependencyOutput(foundry, docId) } else { korapXmlMorphoOutput(foundry, docId) } } private fun korapXmlMorphoOutput(foundry: String, docId: String): StringBuilder { val doc: Document = dBuilder!!.newDocument() // Root element val layer = doc.createElement("layer") layer.setAttribute("xmlns", "http://ids-mannheim.de/ns/KorAP") layer.setAttribute("version", "KorAP-0.4") layer.setAttribute("docid", docId) doc.appendChild(layer) val spanList = doc.createElement("spanList") layer.appendChild(spanList) var i = 0 morpho[docId]?.forEach { (spanString, mfs) -> i++ val offsets = spanString.split("-") val spanNode = doc.createElement("span") spanNode.setAttribute("id", "t_$i") spanNode.setAttribute("from", offsets[0]) spanNode.setAttribute("to", offsets[1]) // fs element val fs = doc.createElement("fs") fs.setAttribute("type", "lex") fs.setAttribute("xmlns", "http://www.tei-c.org/ns/1.0") spanNode.appendChild(fs) val f = doc.createElement("f") f.setAttribute("name", "lex") fs.appendChild(f) // Inner fs element val innerFs = doc.createElement("fs") f.appendChild(innerFs) if (mfs.lemma != "_") { val innerF = doc.createElement("f") innerF.setAttribute("name", "lemma") innerF.textContent = mfs.lemma innerFs.appendChild(innerF) } if (mfs.upos != "_") { val innerF = doc.createElement("f") innerF.setAttribute("name", "upos") innerF.textContent = mfs.upos innerFs.appendChild(innerF) } if (mfs.xpos != "_") { val innerF = doc.createElement("f") innerF.setAttribute("name", "pos") innerF.textContent = mfs.xpos innerFs.appendChild(innerF) } if (mfs.feats != "_") { val innerF = doc.createElement("f") innerF.setAttribute("name", "msd") innerF.textContent = mfs.feats innerFs.appendChild(innerF) } if (mfs.misc != "_" && mfs.misc!!.matches(Regex("^[0-9.]+$"))) { val innerF = doc.createElement("f") innerF.setAttribute("name", "certainty") innerF.textContent = mfs.misc innerFs.appendChild(innerF) } spanList.appendChild(spanNode) } val transformerFactory = TransformerFactory.newInstance() val transformer = transformerFactory.newTransformer() transformer.setOutputProperty(OutputKeys.INDENT, "yes") transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no") transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "1") val 
domSource = DOMSource(doc) val streamResult = StreamResult(StringWriter()) transformer.transform(domSource, streamResult) return StringBuilder(streamResult.writer.toString()) } private fun conlluOutput(foundry: String, docId: String): StringBuilder { var token_index = 0 var real_token_index = 0 var sentence_index = 0 val output: StringBuilder output = StringBuilder("# foundry = $foundry\n# filename = ${fnames[docId]}\n# text_id = $docId\n").append( tokenOffsetsInSentence( sentences, docId, sentence_index, real_token_index, tokens ) ) if (extractMetadataRegex.isNotEmpty()) { output.append(metadata[docId]?.joinToString("\t", prefix = "# metadata=", postfix = "\n") ?: "") } var previousSpanStart = 0 tokens[docId]?.forEach { span -> token_index++ if (sentence_index >= sentences[docId]!!.size || span.from >= sentences[docId]!![sentence_index].to) { output.append("\n") sentence_index++ token_index = 1 output.append( tokenOffsetsInSentence( sentences, docId, sentence_index, real_token_index, tokens ) ) } if (extractAttributesRegex.isNotEmpty() && extraFeatures[docId] != null) { for (i in previousSpanStart until span.from + 1) { if (extraFeatures[docId]?.containsKey("$i") == true) { output.append(extraFeatures[docId]!!["$i"]) extraFeatures[docId]!!.remove("$i") } } previousSpanStart = span.from + 1 } if (morpho[docId]?.containsKey("${span.from}-${span.to}") == true) { val mfs = morpho[docId]!!["${span.from}-${span.to}"] if (span.to > texts[docId]!!.length) { span.to = texts[docId]!!.length LOGGER.warning( "Offset error: could not retrieve token at ${span.from}-${span.to} – ending with: ${ texts[docId]!!.substring( span.from, span.to ) }" ) } output.append( printConlluToken( token_index, texts[docId]!!.substring(span.from, span.to), mfs!!.lemma!!, mfs.upos!!, mfs.xpos!!, mfs.feats!!, mfs.head!!, mfs.deprel!!, mfs.deps!!, mfs.misc!!, columns ) ) } else { output.append( printConlluToken( token_index, texts[docId]!!.substring(span.from, span.to), columns = columns ) ) } real_token_index++ } return output } private fun lmTrainingOutput(docId: String): StringBuilder { var token_index = 0 var real_token_index = 0 var sentence_index = 0 val output: StringBuilder output = StringBuilder() if (extractMetadataRegex.isNotEmpty()) { output.append(metadata[docId]?.joinToString("\t", postfix = "\t") ?: "") } // If no text is available (e.g., lemma-only mode), emit lemmas if (texts[docId] == null) { tokens[docId]?.forEach { span -> val key = "${span.from}-${span.to}" val lemmaVal = morpho[docId]?.get(key)?.lemma output.append((lemmaVal?.takeIf { it != "_" } ?: "_"), " ") } if (output.isNotEmpty()) output.deleteCharAt(output.length - 1) return output } tokens[docId]?.forEach { span -> token_index++ if (sentences[docId] != null && (sentence_index >= sentences[docId]!!.size || span.from >= sentences[docId]!![sentence_index].to)) { if (output.isNotEmpty()) { output.setCharAt(output.length - 1, '\n') } else { output.append("\n") } if (extractMetadataRegex.isNotEmpty() && real_token_index < tokens[docId]!!.size - 1) { output.append(metadata[docId]?.joinToString("\t", postfix = "\t") ?: "") } sentence_index++ } // Bounds safety val safeFrom = span.from.coerceIn(0, texts[docId]!!.length) val safeTo = span.to.coerceIn(safeFrom, texts[docId]!!.length) if (useLemma && morpho[docId] != null) { val key = "${span.from}-${span.to}" val lemmaVal = morpho[docId]!![key]?.lemma if (lemmaVal != null && lemmaVal != "_") { output.append(lemmaVal) output.append(' ') } else { texts[docId]!!.appendRangeTo(output, safeFrom, safeTo) 
output.append(' ') } } else { texts[docId]!!.appendRangeTo(output, safeFrom, safeTo) output.append(' ') } real_token_index++ } if (output.isNotEmpty()) { output.deleteCharAt(output.length - 1) } return output } private fun nowOutput(docId: String): StringBuilder { var token_index = 0 var real_token_index = 0 var sentence_index = 0 val output: StringBuilder = StringBuilder() // Add the text sigle prefix output.append("@@$docId ") if (texts[docId] == null) { // Lemma-only fallback when original text is not loaded tokens[docId]?.forEach { span -> if (sentences[docId] != null && (sentence_index >= sentences[docId]!!.size || span.from >= sentences[docId]!![sentence_index].to)) { if (output.isNotEmpty() && !output.endsWith("@@$docId ")) { output.append(" <p> ") } sentence_index++ } val key = "${span.from}-${span.to}" val lemmaVal = morpho[docId]?.get(key)?.lemma output.append((lemmaVal?.takeIf { it != "_" } ?: "_"), " ") } if (output.isNotEmpty() && output.endsWith(" ")) { output.deleteCharAt(output.length - 1) } return output } tokens[docId]?.forEach { span -> token_index++ if (sentences[docId] != null && (sentence_index >= sentences[docId]!!.size || span.from >= sentences[docId]!![sentence_index].to)) { // Replace sentence end with <p> tag instead of newline if (output.isNotEmpty() && !output.endsWith("@@$docId ")) { output.append(" <p> ") } sentence_index++ } // Bounds safety val safeFrom = span.from.coerceIn(0, texts[docId]!!.length) val safeTo = span.to.coerceIn(safeFrom, texts[docId]!!.length) if (useLemma && morpho[docId] != null) { val key = "${span.from}-${span.to}" val lemmaVal = morpho[docId]!![key]?.lemma if (lemmaVal != null && lemmaVal != "_") { output.append(lemmaVal) output.append(' ') } else { texts[docId]!!.appendRangeTo(output, safeFrom, safeTo) output.append(' ') } } else { texts[docId]!!.appendRangeTo(output, safeFrom, safeTo) output.append(' ') } real_token_index++ } // Remove trailing space and add final newline if (output.isNotEmpty() && output.endsWith(" ")) { output.deleteCharAt(output.length - 1) } return output } private fun printConlluToken( token_index: Int, token: String, lemma: String = "_", upos: String = "_", xpos: String = "_", feats: String = "_", head: String = "_", deprel: String = "_", deps: String = "_", misc: String = "_", columns: Int = 10 ): String { val myUpos = if (COMPATIBILITY_MODE && upos == "_") xpos else upos return when (columns) { 1 -> ("$token\n") 10 -> ("$token_index\t$token\t$lemma\t$myUpos\t$xpos\t$feats\t$head\t$deprel\t$deps\t$misc$tokenSeparator") else -> { val fields = listOf( token_index.toString(), token, lemma, myUpos, xpos, feats, head, deprel, deps, misc ) fields.subList(0, min(columns, 10)).joinToString("\t", postfix = tokenSeparator) } } } private fun tokenOffsetsInSentence( sentences: ConcurrentHashMap<String, Array<Span>>, docId: String, sentence_index: Int, token_index: Int, tokens: ConcurrentHashMap<String, Array<Span>> ): String { if (sentences[docId] == null || sentences[docId]!!.size <= sentence_index) { return "" } val sentenceEndOffset = sentences[docId]!![sentence_index].to var i = token_index val start_offsets_string = StringBuilder() val end_offsets_string = StringBuilder() while (tokens[docId] != null && i < tokens[docId]!!.size && tokens[docId]!![i].to <= sentenceEndOffset) { start_offsets_string.append(" ", tokens[docId]!![i].from) end_offsets_string.append(" ", tokens[docId]!![i].to) i++ } return ( StringBuilder() .append( "# start_offsets = ", tokens[docId]!![token_index].from, start_offsets_string, "\n", "# 
end_offsets = ", sentenceEndOffset, end_offsets_string, "\n" ).toString()) } private fun extractSpans(spans: NodeList): Array<Span> { val list = ArrayList<Span>() IntStream.range(0, spans.length).forEach { idx -> val node = spans.item(idx) if (node is Element) { val fromAttr = node.getAttribute("from") val toAttr = node.getAttribute("to") if (fromAttr.isNullOrEmpty() || toAttr.isNullOrEmpty()) { LOGGER.warning("Skipping span with empty from/to attribute: from='$fromAttr' to='$toAttr'") } else { try { val from = Integer.parseInt(fromAttr) val to = Integer.parseInt(toAttr) list.add(Span(from, to)) } catch (e: NumberFormatException) { LOGGER.warning("Skipping span with invalid numeric offsets: from='$fromAttr' to='$toAttr' : ${e.message}") } } } } return list.toTypedArray() } private fun extractMorphoSpans( fsSpans: NodeList ): MutableMap<String, MorphoSpan> { val UNKNOWN = Regex("(UNKNOWN|<unknown>)") val res: MutableMap<String, MorphoSpan> = HashMap() IntStream.range(0, fsSpans.length).mapToObj(fsSpans::item).filter { node -> node is Element && node.getAttribute("type") != "alt" }.forEach { node -> val features = (node as Element).getElementsByTagName("f") val fs = MorphoSpan() val fromTo = "${node.getAttribute("from")}-${node.getAttribute("to")}" IntStream.range(0, features.length).mapToObj(features::item).forEach { feature -> val attr = (feature as Element).getAttribute("name") val value = feature.textContent.trim() if (value.isEmpty()) return@forEach when (attr) { "lemma" -> if(fs.lemma == "_") fs.lemma = value.replace(UNKNOWN, "--") "upos" -> fs.upos = value "xpos", "ctag", "pos" -> if(fs.xpos == "_") fs.xpos = value.replace(UNKNOWN, "--") "feats", "msd" -> if(fs.feats == "_" ) fs.feats = value "type" -> if(fs.feats == "_") fs.feats = feature.getElementsByTagName("symbol").item(0).attributes.getNamedItem("value").textContent.trim() // "subtype" -> if(fs.feats == "_") fs.feats += ":" + feature.getElementsByTagName("symbol").item(0).attributes.getNamedItem("value").textContent "certainty" -> if(fs.misc == "_") fs.misc = value } } res[fromTo] = fs } return res } private fun extractSentenceSpans(spans: NodeList): Array<Span> { return IntStream.range(0, spans.length).mapToObj(spans::item) .filter { node -> node is Element && node.getElementsByTagName("f").item(0).textContent.equals("s") } .map { node -> Span( Integer.parseInt((node as Element).getAttribute("from")), Integer.parseInt(node.getAttribute("to")) ) }.toArray { size -> arrayOfNulls(size) } } /* <span id="s15" from="370" to="394" l="5"> <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0"> <f name="name">posting</f> <f name="attr"> <fs type="attr"> <f name="id">i.10894_1_3</f> <f name="indentLevel">0</f> <f name="who">WU00000000</f> </fs> </f> </fs> </span> */ private fun extractMiscSpans(spans: NodeList): MutableMap<String, String> { val miscLocal: MutableMap<String, String> = HashMap() IntStream.range(0, spans.length).mapToObj(spans::item) .filter { node -> node is Element && node.getElementsByTagName("f").length > 1 && (node.getElementsByTagName("f").item(0) as Element).getAttribute("name").equals("name") && (node.getElementsByTagName("f").item(1) as Element).getAttribute("name").equals("attr") } .forEach { node -> if (node == null) return@forEach val elementName = (node as Element).getElementsByTagName("f").item(0).textContent.trim() val from = node.getAttribute("from") val attributes = (node.getElementsByTagName("f").item(1) as Element).getElementsByTagName("f") val res = StringBuilder() IntStream.range(0, 
attributes.length).mapToObj(attributes::item).forEach { attr -> val attrName = "$elementName/${(attr as Element).getAttribute("name")}" if (attrName.matches(Regex(extractAttributesRegex))) { res.append("# $attrName = ${attr.textContent}\n") //LOGGER.info("" + from + ": $attrName = " + attr.textContent) } } if (res.isNotEmpty()) { if (miscLocal.containsKey(from)) { // LOGGER.info("ADDING TO $from: ${miscLocal[from]}") miscLocal[from] += res.toString() } else { miscLocal[from] = res.toString() } } } return miscLocal } class Span(var from: Int, var to: Int) class MorphoSpan( var lemma: String? = "_", var upos: String? = "_", var xpos: String? = "_", var feats: String? = "_", var head: String? = "_", var deprel: String? = "_", var deps: String? = "_", var misc: String? = "_" ) } fun main(args: Array<String>): Unit = exitProcess(CommandLine(KorapXmlTool()).execute(*args)) fun debug(args: Array<String>): Int { return (CommandLine(KorapXmlTool()).execute(*args)) } enum class OutputFormat { CONLLU, WORD2VEC, KORAPXML, NOW } object ConlluOutputFormat { const val NAME = "conllu" } object Word2VecOutputFormat { const val NAME = "word2vec" } object KorapXmlOutputFormat { const val NAME = "korapxml" } object NowOutputFormat { const val NAME = "now" } " />
- <option name="updatedContent" value="package de.ids_mannheim.korapxmltools import de.ids_mannheim.korapxmltools.AnnotationToolBridgeFactory.Companion.parserFoundries import de.ids_mannheim.korapxmltools.AnnotationToolBridgeFactory.Companion.taggerFoundries import org.apache.commons.compress.archivers.zip.Zip64Mode import org.apache.commons.compress.archivers.zip.ZipArchiveEntry import org.w3c.dom.Document import org.w3c.dom.Element import org.w3c.dom.NodeList import org.xml.sax.InputSource import org.xml.sax.SAXParseException import picocli.CommandLine import picocli.CommandLine.* import java.io.File import java.io.FileOutputStream import java.io.InputStream import java.io.StringWriter import java.lang.Integer.parseInt import java.util.* import java.util.concurrent.Callable import java.util.concurrent.ConcurrentHashMap import java.util.concurrent.Executors import java.util.concurrent.atomic.AtomicLong import java.util.logging.ConsoleHandler import java.util.logging.Level import java.util.logging.LogManager import java.util.logging.Logger import java.util.regex.Matcher import java.util.regex.Pattern import java.util.stream.IntStream import java.util.zip.ZipEntry import java.util.zip.ZipFile import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream import javax.xml.parsers.DocumentBuilder import javax.xml.parsers.DocumentBuilderFactory import javax.xml.transform.OutputKeys import javax.xml.transform.TransformerFactory import javax.xml.transform.dom.DOMSource import javax.xml.transform.stream.StreamResult import kotlin.math.min import kotlin.system.exitProcess val ZIP_ENTRY_UNIX_MODE = parseInt("644", 8) @Command( name = "KorapXmlTool", mixinStandardHelpOptions = true, version = ["KorapXmlTool 2.0-beta-02"], description = ["Converts KorAP-XML <https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml> base or " + "morpho zips to (annotated) CoNLL(-U) format with all information necessary for " + "reconstruction in comment lines."] ) class KorapXmlTool : Callable<Int> { val COMPATIBILITY_MODE = System.getenv("COMPATIBILITY_MODE") != null @Spec lateinit var spec : Model.CommandSpec @Parameters(arity = "1..*", description = ["At least one zip file name"]) var zipFileNames: Array<String>? = null @Option( names = ["-f", "--output-format"], description = ["Output format: ${ConlluOutputFormat.NAME}, ${Word2VecOutputFormat.NAME}, ${KorapXmlOutputFormat.NAME}, ${NowOutputFormat.NAME}", "conllu: CoNLL-U format", "korapxml, xml, zip: KorAP-XML format zip", "word2vec, w2v: Print text in LM training format: tokens separated by space, sentences separated by newlines", "now, NOW: NOW corpus export format: w2v-like format with <p> tags for sentence ends and @@<text-sigle> prefix", ], converter = [OutputFormatConverter::class] ) var outputFormat: OutputFormat = OutputFormat.CONLLU class OutputFormatConverter : ITypeConverter<OutputFormat> { override fun convert(value: String?): OutputFormat { return when (value?.lowercase(Locale.getDefault())) { "conllu", "conll" -> OutputFormat.CONLLU "word2vec", "w2v" -> OutputFormat.WORD2VEC "korapxml", "korap", "xml", "zip" -> OutputFormat.KORAPXML "now", "NOW" -> OutputFormat.NOW else -> throw IllegalArgumentException("Unknown output format: `$value'. Use one of: ${OutputFormat.entries.joinToString(", ") { it.name }}") } } } @Option( names = ["--sigle-pattern", "-p"], paramLabel = "PATTERN", description = ["Extract only documents with sigle matching the pattern (regex)"] ) var siglePattern: String? 
= null @Option( names = ["--extract-attributes-regex", "-e"], paramLabel = "REGEX", description = ["Extract additional attribute values from structure.xml and writes them as comment line in front of the first covered token.", "Example: -e '(posting/id|div/id)'"] ) var extractAttributesRegex: String = "" @Option( names = ["--s-bounds-from-morpho"], description = ["Not yet implemented: s bounds from morpho"] ) var sBoundsFromMorpho: Boolean = false @Option( names = ["--log", "-l"], paramLabel = "LEVEL", description = ["Log level: one of SEVERE, WARNING, INFO, FINE, FINER, FINEST. Default: ${"$"}{DEFAULT-VALUE}])"] ) var logLevel: String = "WARNING" @Option( names = ["--columns", "-c"], paramLabel = "NUMBER", description = ["Number of columns. 1 means just the token. Default: ${"$"}{DEFAULT-VALUE}", "Possible values: 1-10"] ) var columns: Int = 10 @Option( names = ["--word2vec", "-w"], description = ["Print text in LM training format: tokens separated by space, sentences separated by newline", "Deprecated: use -f word2vec"] ) fun setWord2Vec(word2vec: Boolean) { if (word2vec) { outputFormat = OutputFormat.WORD2VEC } } @Option( names = ["--exclude-zip-glob"], paramLabel = "GLOB", description = [ "Exclude zip files whose basename matches the glob (e.g., 'w?d24.tree_tagger.zip').", "May be repeated. Applied to basenames, not full paths." ] ) var excludeZipGlobs: MutableList<String> = mutableListOf() @Option( names = ["--token-separator", "-s"], paramLabel = "STRING", defaultValue = "\n", description = ["Token separator. Default: new-line for CoNLL-U, space for word2vec format."] ) var tokenSeparator: String = if (outputFormat == OutputFormat.WORD2VEC || outputFormat == OutputFormat.NOW) " " else "\n" @Option(names = ["--offsets"], description = ["Not yet implemented: offsets"]) var offsets: Boolean = false @Option(names = ["--comments", "-C"], description = ["Not yet implemented: comments"]) var comments: Boolean = false @Option( names = ["--extract-metadata-regex", "-m"], paramLabel = "REGEX", description = ["Extract metadata regexes.\nExample: -m '<textSigle>([^<]+)' -m '<creatDate>([^<]+)'"] ) var extractMetadataRegex: MutableList<String> = mutableListOf() @Option( names = ["--annotate-with", "-A"], paramLabel = "COMMAND", description = ["Pipe output through command"] ) var annotateWith: String = "" @Option( names = ["--threads", "-T"], paramLabel = "THREADS", description = ["Maximum number of threads to use. Default: ${"$"}{DEFAULT-VALUE}"] ) var maxThreads: Int = Runtime.getRuntime().availableProcessors() / 2 fun setThreads(threads: Int) { if (threads < 1) { throw ParameterException(spec.commandLine(), String.format("Invalid value `%d' for option '--threads': must be at least 1", threads)) } this.maxThreads = threads System.setProperty("java.util.concurrent.ForkJoinPool.common.parallelism", threads.toString()) } @Option( names = ["--zip-parallelism"], paramLabel = "N", description = ["Maximum number of zip files to process concurrently. Defaults to --threads."] ) var zipParallelism: Int? = null @Option( names = ["--sequential"], description = [ "Process entries inside each zip sequentially; zips processed in parallel (only for word2vec/now)." 
] ) var sequentialInZip: Boolean = false @Option( names = ["--overwrite", "-o"], description = ["Overwrite existing files"] ) var overwrite: Boolean = false @Option( names = ["--mem-stats-interval"], paramLabel = "N", description = ["Log memory and cache statistics every N processed documents (0 disables; default: 0)"] ) var memStatsInterval: Int = 0 @Option( names = ["--lemma"], description = ["In word2vec/now output modes, output lemmas instead of surface tokens when lemma annotations are available (requires corresponding morpho annotation XML)"] ) var useLemma: Boolean = false @Option( names = ["--lemma-only"], description = [ "Do not load texts from data.xml and output only lemmas (requires morpho.xml).", "Only valid with -f word2vec or -f now; implies --lemma." ] ) var lemmaOnly: Boolean = false private var taggerName: String? = null private var taggerModel: String? = null @Option( names = ["--tag-with", "-t"], paramLabel = "TAGGER:MODEL", description = ["Specify a tagger and a model: ${taggerFoundries}:<path/to/model>."] ) fun setTagWith(tagWith: String) { val pattern: Pattern = Pattern.compile("(${taggerFoundries}):(.+)") val matcher: Matcher = pattern.matcher(tagWith) if (!matcher.matches()) { throw ParameterException(spec.commandLine(), String.format("Invalid value `%s' for option '--tag-with': "+ "value does not match the expected pattern ${taggerFoundries}:<path/to/model>", tagWith)) } else { taggerName = matcher.group(1) taggerModel = matcher.group(2) if (!File(taggerModel).exists()) { throw ParameterException(spec.commandLine(), String.format("Invalid value for option '--tag-with':"+ "model file '%s' does not exist", taggerModel, taggerModel)) } } } private var parserName: String? = null private var parserModel: String? = null @Option( names = ["--parse-with", "-P"], paramLabel = "parser:MODEL", description = ["Specify a parser and a model: ${parserFoundries}:<path/to/model>."] ) fun setParseWith(parseWith: String) { val pattern: Pattern = Pattern.compile("(${parserFoundries}):(.+)") val matcher: Matcher = pattern.matcher(parseWith) if (!matcher.matches()) { throw ParameterException(spec.commandLine(), String.format("Invalid value `%s' for option '--parse-with': "+ "value does not match the expected pattern (${parserFoundries}):<path/to/model>", parseWith)) } else { parserName = matcher.group(1) parserModel = matcher.group(2) if (!File(parserModel).exists()) { throw ParameterException(spec.commandLine(), String.format("Invalid value for option '--parse-with':"+ "model file '%s' does not exist", parserModel, parserModel)) } } } override fun call(): Int { val handler = ConsoleHandler() LogManager.getLogManager().reset() handler.formatter = ColoredFormatter() for (handler in LOGGER.handlers) { LOGGER.removeHandler(handler) } LOGGER.addHandler(handler) LOGGER.level = try { Level.parse(logLevel.uppercase(Locale.getDefault())) } catch (e: IllegalArgumentException) { LOGGER.warning("Invalid log level: $logLevel. Defaulting to WARNING.") Level.WARNING } if (lemmaOnly) { useLemma = true if (outputFormat != OutputFormat.WORD2VEC && outputFormat != OutputFormat.NOW) { throw ParameterException(spec.commandLine(), "--lemma-only is supported only with -f word2vec or -f now") } } LOGGER.info("Processing zip files: " + zipFileNames!!.joinToString(", ")) korapxml2conllu(zipFileNames!!) return 0 } private val LOGGER: Logger = Logger.getLogger(KorapXmlTool::class.java.name) private var annotationWorkerPool : AnnotationWorkerPool? 
= null // Shared executor for entry-level parallelism across all zips private var entryExecutor: java.util.concurrent.ExecutorService? = null val texts: ConcurrentHashMap<String, NonBmpString> = ConcurrentHashMap() val sentences: ConcurrentHashMap<String, Array<Span>> = ConcurrentHashMap() val tokens: ConcurrentHashMap<String, Array<Span>> = ConcurrentHashMap() val morpho: ConcurrentHashMap<String, MutableMap<String, MorphoSpan>> = ConcurrentHashMap() val fnames: ConcurrentHashMap<String, String> = ConcurrentHashMap() val metadata: ConcurrentHashMap<String, Array<String>> = ConcurrentHashMap() val extraFeatures: ConcurrentHashMap<String, MutableMap<String, String>> = ConcurrentHashMap() private val processedDocs = java.util.concurrent.atomic.AtomicInteger(0) var taggerToolBridges: ConcurrentHashMap<Long, TaggerToolBridge?> = ConcurrentHashMap() var parserToolBridges: ConcurrentHashMap<Long, ParserToolBridge?> = ConcurrentHashMap() // Zip progress tracking for logging (zipNumber/zipTotal) private val zipOrdinals: ConcurrentHashMap<String, Int> = ConcurrentHashMap() private var totalZips: Int = 0 private val zipSizes: ConcurrentHashMap<String, Long> = ConcurrentHashMap() private val processedZipBytes: AtomicLong = AtomicLong(0) private var totalZipBytes: Long = 0 private var startTimeMillis: Long = 0 var dbFactory: DocumentBuilderFactory? = null var dBuilder: DocumentBuilder? = null var morphoZipOutputStream: ZipArchiveOutputStream? = null fun String.hasCorrespondingBaseZip(): Boolean { if (!this.matches(Regex(".*\\.([^/.]+)\\.zip$"))) return false val baseZip = this.replace(Regex("\\.([^/.]+)\\.zip$"), ".zip") return File(baseZip).exists() } fun String.correspondingBaseZip(): String? { if (!this.matches(Regex(".*\\.([^/.]+)\\.zip$"))) return null val baseZip = this.replace(Regex("\\.([^/.]+)\\.zip$"), ".zip") return if (File(baseZip).exists()) baseZip else null } fun korapxml2conllu(args: Array<String>) { if (outputFormat == OutputFormat.KORAPXML && annotateWith.isNotEmpty()) { LOGGER.severe("Shell command annotation is not yet supported with output format $outputFormat") exitProcess(1) } // Initialize shared entry executor (used inside each zip) entryExecutor = Executors.newFixedThreadPool(maxThreads) if (annotateWith.isNotEmpty()) { annotationWorkerPool = AnnotationWorkerPool(annotateWith, maxThreads, LOGGER) } var zips: Array<String> = args if (excludeZipGlobs.isNotEmpty()) { val before = zips.size val patterns = excludeZipGlobs.map { globToRegex(it) } zips = zips.filter { zipPath -> val base = File(zipPath).name patterns.none { rx -> rx.matches(base) } }.toTypedArray() val excluded = before - zips.size if (excluded > 0) { LOGGER.info("Excluded $excluded of $before zip(s) by glob(s): ${excludeZipGlobs.joinToString(", ")}") } } // Initialize zip progress tracking and sizes startTimeMillis = System.currentTimeMillis() processedZipBytes.set(0) totalZips = zips.size zipOrdinals.clear() zipSizes.clear() zips.forEach { zip -> zipSizes[zip] = try { File(zip).length() } catch (_: Exception) { 0L } } totalZipBytes = zipSizes.values.sum() // In lemma-only mode, process largest zips first if (lemmaOnly) { zips = zips.sortedByDescending { zipSizes[it] ?: 0L }.toTypedArray() } zips.forEachIndexed { index, zip -> zipOrdinals[zip] = index + 1 } // Log zip order with sizes so the user can verify sorting val totalHuman = humanBytes(totalZipBytes) LOGGER.info("Zip processing order (${zips.size} file(s), total ${totalHuman}):") zips.forEachIndexed { idx, zip -> val size = zipSizes[zip] ?: 0L 
LOGGER.info(String.format(Locale.ROOT, "%d/%d: %s (%s)", idx + 1, zips.size, zip, humanBytes(size))) } if (sequentialInZip) { if (outputFormat != OutputFormat.WORD2VEC && outputFormat != OutputFormat.NOW) { throw ParameterException(spec.commandLine(), "--sequential is supported only with -f word2vec or -f now") } } if (maxThreads > 1) { val foundry = getFoundryFromZipFileNames(zips) val parallelism = (zipParallelism ?: maxThreads).coerceAtLeast(1) LOGGER.info("Processing zips with ordered queue; parallelism=$parallelism; entries ${if (sequentialInZip) "sequential" else "parallel"}") processZipsWithQueue(zips, foundry, parallelism) } else { LOGGER.info("Processing zip files sequentially") Arrays.stream(zips).forEachOrdered { zipFilePath -> processZipFileSequentially((zipFilePath ?: "").toString(), getFoundryFromZipFileNames(zips)) } } if (annotationWorkerPool != null) { LOGGER.info("closing worker pool") annotationWorkerPool?.close() } // Shutdown entry executor entryExecutor?.shutdown() } private fun processZipsWithQueue(zips: Array<String>, foundry: String, parallelism: Int) { val queue: java.util.concurrent.BlockingQueue<String> = java.util.concurrent.LinkedBlockingQueue() zips.forEach { queue.put(it) } val executor = Executors.newFixedThreadPool(parallelism) val active = java.util.concurrent.atomic.AtomicInteger(0) repeat(parallelism) { executor.submit { active.incrementAndGet() try { while (true) { val zipPath = queue.poll(100, java.util.concurrent.TimeUnit.MILLISECONDS) if (zipPath == null) { if (queue.isEmpty()) break else continue } if (sequentialInZip) { processZipFileSequentially(zipPath, foundry) } else { processZipFile(zipPath, foundry) } } } finally { active.decrementAndGet() } } } executor.shutdown() try { executor.awaitTermination(7, java.util.concurrent.TimeUnit.DAYS) } catch (ie: InterruptedException) { Thread.currentThread().interrupt() } } // Convert a shell-like glob to a Regex: '*' -> ".*", '?' -> '.', anchored full match private fun globToRegex(glob: String): Regex { val sb = StringBuilder("^") glob.forEach { ch -> when (ch) { '*' -> sb.append(".*") '?' 
-> sb.append('.') '.', '(', ')', '+', '|', '^', '$', '@', '%', '{', '}', '[', ']', '\\' -> sb.append('\\').append(ch) else -> sb.append(ch) } } sb.append('$') return Regex(sb.toString()) } private fun getTokenSpansFromMorho(morpho: MutableMap<String, MorphoSpan>): Array<Span> { return morpho.keys.map { key -> val fromTo = key.split("-") Span(fromTo[0].toInt(), fromTo[1].toInt()) }.sortedBy { it.from }.toTypedArray() } private fun getFoundryFromZipFileName(zipFileName: String): String { if (!zipFileName.matches(Regex(".*\\.([^/.]+)\\.zip$"))) { return "base" } return zipFileName.replace(Regex(".*\\.([^/.]+)\\.zip$"), "$1") } private fun getFoundryFromZipFileNames(zipFileNames: Array<String>): String { for (zipFileName in zipFileNames) { val foundry = getFoundryFromZipFileName(zipFileName) if (foundry != "base") { return foundry } } return "base" } private fun processZipFile(zipFilePath: String, foundry: String = "base") { val ord = zipOrdinals[zipFilePath] ?: 0 val size = zipSizes[zipFilePath] ?: 0L LOGGER.info("Processing zip ${if (ord>0) ord else "?"}/$totalZips: ${zipFilePath} (${humanBytes(size)}) in thread ${Thread.currentThread().threadId()}") LOGGER.info("Foundry: $foundry $dbFactory") if (outputFormat == OutputFormat.KORAPXML && dbFactory == null) { var targetFoundry = "base" if (taggerName != null) { val tagger = AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER) as TaggerToolBridge? if (tagger != null) { targetFoundry = tagger.foundry } } else if (parserName != null) { targetFoundry = parserName!! } dbFactory = DocumentBuilderFactory.newInstance() dBuilder = dbFactory!!.newDocumentBuilder() val outputMorphoZipFileName = if (parserName != null) zipFilePath.replace(Regex("(\\.(opennlp|marmot|tree_tagger|corenlp|spacy))?\\.zip$"), ".".plus(parserName).plus(".zip")) else zipFilePath.replace(Regex("\\.zip$"), ".".plus(targetFoundry).plus(".zip")) if (File(outputMorphoZipFileName).exists() && !overwrite) { LOGGER.severe("Output file $outputMorphoZipFileName already exists. Use --overwrite to overwrite.") exitProcess(1) } val fileOutputStream = FileOutputStream(outputMorphoZipFileName) morphoZipOutputStream = ZipArchiveOutputStream(fileOutputStream).apply { setUseZip64(Zip64Mode.Always) } } if (zipFilePath.hasCorrespondingBaseZip()) { val relatedZips = arrayOf(zipFilePath, zipFilePath.correspondingBaseZip()!!) // Process related zips one after another to keep the ZipFile lifetime strictly bounded relatedZips.forEach { zip -> ZipFile(zip).use { zipFile -> processZipEntriesWithPool(zipFile, foundry, true) } } } else { ZipFile(zipFilePath).use { zipFile -> processZipEntriesWithPool(zipFile, foundry, false) } } if (outputFormat == OutputFormat.KORAPXML) { morphoZipOutputStream!!.close() } logZipProgress(zipFilePath) } private fun processZipFileSequentially(zipFilePath: String, foundry: String = "base") { val ord = zipOrdinals[zipFilePath] ?: 0 val size = zipSizes[zipFilePath] ?: 0L LOGGER.info("Processing zip ${if (ord>0) ord else "?"}/$totalZips: ${zipFilePath} (${humanBytes(size)}) in thread ${Thread.currentThread().threadId()}") if (zipFilePath.hasCorrespondingBaseZip()) { // Process the two related zips strictly sequentially to limit memory growth val zips = arrayOf(zipFilePath, zipFilePath.correspondingBaseZip()!!) 
zips.forEach { zip -> ZipFile(zip).use { zipFile -> // Iterate entries in a deterministic order to keep related files close together zipFile.stream() .filter { extractMetadataRegex.isNotEmpty() || !it.name.contains("header.xml") } .sorted(Comparator.comparing<ZipEntry, String> { it.name }) .forEachOrdered { zipEntry -> processZipEntry(zipFile, foundry, zipEntry, true) } } } } else { ZipFile(zipFilePath).use { zipFile -> zipFile.stream() .filter { extractMetadataRegex.isNotEmpty() || !it.name.contains("header.xml") } .sorted(Comparator.comparing<ZipEntry, String> { it.name }) .forEachOrdered { zipEntry -> processZipEntry(zipFile, foundry, zipEntry, false) } } } logZipProgress(zipFilePath) } private fun logZipProgress(zipFilePath: String) { try { val size = zipSizes[zipFilePath] ?: 0L val done = processedZipBytes.addAndGet(size) val total = if (totalZipBytes > 0) totalZipBytes else 1L val elapsedMs = (System.currentTimeMillis() - startTimeMillis).coerceAtLeast(1) val speedBytesPerSec = (done * 1000.0) / elapsedMs val remaining = (total - done).coerceAtLeast(0) val etaSeconds = if (speedBytesPerSec > 0.0) (remaining / speedBytesPerSec).toLong() else -1L val ord = zipOrdinals[zipFilePath] ?: 0 val pct = (done * 100.0 / total).coerceIn(0.0, 100.0) val humanSpeed = String.format(Locale.ROOT, "%.2f MB/s", speedBytesPerSec / (1024.0 * 1024.0)) val etaStr = if (etaSeconds >= 0) formatDuration(etaSeconds) else "unknown" LOGGER.info( "Finished zip ${if (ord>0) ord else "?"}/$totalZips: ${zipFilePath} " + "(${humanBytes(size)}). Progress: ${String.format(Locale.ROOT, "%.1f", pct)}%%, " + "ETA ${etaStr} at ${humanSpeed}" ) } catch (e: Exception) { LOGGER.fine("Failed to log zip progress for $zipFilePath: ${e.message}") } } private fun humanBytes(bytes: Long): String { if (bytes < 1024) return "$bytes B" val kb = bytes / 1024.0 if (kb < 1024) return String.format(Locale.ROOT, "%.1f KB", kb) val mb = kb / 1024.0 if (mb < 1024) return String.format(Locale.ROOT, "%.1f MB", mb) val gb = mb / 1024.0 return String.format(Locale.ROOT, "%.1f GB", gb) } private fun formatDuration(seconds: Long): String { var s = seconds val h = s / 3600; s %= 3600 val m = s / 60; val sec = s % 60 return String.format(Locale.ROOT, "%02d:%02d:%02d", h, m, sec) } private fun processZipEntriesWithPool(zipFile: ZipFile, foundry: String, waitForMorpho: Boolean) { // Collect entries first to avoid lazy evaluation surprises, filter header.xml unless metadata extraction is requested val entries: MutableList<ZipEntry> = ArrayList() val enumEntries = zipFile.entries() while (enumEntries.hasMoreElements()) { val e = enumEntries.nextElement() if (extractMetadataRegex.isEmpty() && e.name.contains("header.xml")) continue entries.add(e) } if (entries.isEmpty()) return // If only one thread requested, do sequential to avoid pool overhead if (maxThreads <= 1) { entries.forEach { entry -> processZipEntry(zipFile, foundry, entry, waitForMorpho) } return } // Submit all entry tasks to the shared executor and await completion before closing the zip val latch = java.util.concurrent.CountDownLatch(entries.size) entries.forEach { entry -> entryExecutor?.execute { try { processZipEntry(zipFile, foundry, entry, waitForMorpho) } catch (t: Throwable) { LOGGER.warning("Failed to process entry ${entry.name}: ${t.message}") } finally { latch.countDown() } } } try { latch.await() } catch (ie: InterruptedException) { Thread.currentThread().interrupt() } } fun processZipEntry(zipFile: ZipFile, _foundry: String, zipEntry: ZipEntry, passedWaitForMorpho: Boolean) { 
var foundry = _foundry var waitForMorpho = passedWaitForMorpho LOGGER.finer("Processing ${zipEntry.name} in thread ${Thread.currentThread().threadId()}") if (taggerName != null && !taggerToolBridges.containsKey(Thread.currentThread().threadId())) { val tagger = AnnotationToolBridgeFactory.getAnnotationToolBridge(taggerName!!, taggerModel!!, LOGGER) as TaggerToolBridge? taggerToolBridges[Thread.currentThread().threadId()] = tagger if (tagger != null) { foundry = tagger.foundry } } if (parserName != null && !parserToolBridges.containsKey(Thread.currentThread().threadId())) { val parser = AnnotationToolBridgeFactory.getAnnotationToolBridge(parserName!!, parserModel!!, LOGGER) as ParserToolBridge? parserToolBridges[Thread.currentThread().threadId()] = parser if (parser != null) { foundry = "$foundry dependency:${parser.foundry}" LOGGER.fine("Initialized parser ${parserName} with foundry $foundry in thread ${Thread.currentThread().threadId()}") } } try { if (zipEntry.name.matches(Regex(".*(data|tokens|structure|morpho)\\.xml$"))) { // Ensure the entry stream and reader are closed to avoid native memory buildup val dbFactory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance() val dBuilder: DocumentBuilder = dbFactory.newDocumentBuilder() // In lemma-only mode, skip parsing data.xml entirely to reduce memory pressure if (lemmaOnly && zipEntry.name.endsWith("data.xml")) { return } val doc: Document = try { zipFile.getInputStream(zipEntry).use { inputStream -> XMLCommentFilterReader(inputStream, "UTF-8").use { reader -> dBuilder.parse(InputSource(reader)) } } } catch (e: SAXParseException) { LOGGER.warning("Error parsing file: " + zipEntry.name + " " + e.message) return } doc.documentElement.normalize() val docId: String = doc.documentElement.getAttribute("docid") if (siglePattern != null && !Regex(siglePattern!!).containsMatchIn(docId)) { return } // LOGGER.info("Processing file: " + zipEntry.getName()) val fileName = zipEntry.name.replace(Regex(".*?/([^/]+\\.xml)$"), "$1") when (fileName) { "data.xml" -> { if (!lemmaOnly) { val textsList: NodeList = doc.getElementsByTagName("text") if (textsList.length > 0) { texts[docId] = NonBmpString(textsList.item(0).textContent) } } } "structure.xml" -> { val spans: NodeList = doc.getElementsByTagName("span") if (extractAttributesRegex.isNotEmpty()) extraFeatures[docId] = extractMiscSpans(spans) sentences[docId] = extractSentenceSpans(spans) } "tokens.xml" -> { if (!fnames.contains(docId)) { fnames[docId] = zipEntry.name } val tokenSpans: NodeList = doc.getElementsByTagName("span") tokens[docId] = extractSpans(tokenSpans) } "morpho.xml" -> { waitForMorpho = true fnames[docId] = zipEntry.name val fsSpans: NodeList = doc.getElementsByTagName("span") morpho[docId] = extractMorphoSpans(fsSpans) tokens[docId] = extractSpans(fsSpans) } } val morphoRequired = waitForMorpho || useLemma || taggerName != null || parserName != null || outputFormat == OutputFormat.KORAPXML // For lemma-only/lemma-based word2vec/now, we can proceed without full text val textRequired = when (outputFormat) { OutputFormat.WORD2VEC, OutputFormat.NOW -> !(useLemma || lemmaOnly) else -> true } if ((texts[docId] != null || !textRequired) && sentences[docId] != null && tokens[docId] != null && (!morphoRequired || morpho[docId] != null) && (extractMetadataRegex.isEmpty() || metadata[docId] != null) ) { // Be quiet on INFO; per-text logs only on FINE and below LOGGER.fine("Processing text: $docId in thread ${Thread.currentThread().threadId()}") processText(docId, foundry) } } else if 
(extractMetadataRegex.isNotEmpty() && zipEntry.name.matches(Regex(".*/header\\.xml$"))) { //LOGGER.info("Processing header file: " + zipEntry.name) val text = zipFile.getInputStream(zipEntry).bufferedReader().use { it.readText() } val docId = Regex("<textSigle>([^<]+)</textSigle>").find(text)?.destructured?.component1() ?.replace(Regex("/"), "_") LOGGER.fine("Processing header file: " + zipEntry.name + " docId: " + docId) val meta = ArrayList<String>() extractMetadataRegex.forEach { regex -> val match = Regex(regex).find(text) if (match != null) { meta.add(match.destructured.component1()) } } if (meta.isNotEmpty() && docId != null) { metadata[docId] = meta.toTypedArray() val morphoRequired = waitForMorpho || useLemma || taggerName != null || parserName != null || outputFormat == OutputFormat.KORAPXML val textRequired = when (outputFormat) { OutputFormat.WORD2VEC, OutputFormat.NOW -> !(useLemma || lemmaOnly) else -> true } if ((texts[docId] != null || !textRequired) && sentences[docId] != null && tokens[docId] != null && (!morphoRequired || morpho[docId] != null) ) { // Be quiet on INFO; per-text logs only on FINE and below LOGGER.fine("Processing text (meta-ready): $docId in thread ${Thread.currentThread().threadId()}") processText(docId, foundry) } } } } catch (e: Exception) { e.printStackTrace() } } private fun processText( docId: String, foundry: String, ) { LOGGER.fine("Processing text: $docId in thread ${Thread.currentThread().threadId()}") var morphoFoundry = getMorphoFoundry() val output = if (outputFormat == OutputFormat.WORD2VEC) { lmTrainingOutput(docId) } else if (outputFormat == OutputFormat.NOW) { nowOutput(docId) } else { if (taggerToolBridges[Thread.currentThread().threadId()] != null) { morpho[docId] = taggerToolBridges[Thread.currentThread().threadId()]!!.tagText( tokens[docId]!!, sentences[docId], texts[docId]!! ) } if (parserToolBridges[Thread.currentThread().threadId()] != null) { if (morpho[docId] == null) { LOGGER.severe("No morpho data for $docId") //exitProcess(1) } LOGGER.finer("Parsing text: $docId in thread ${Thread.currentThread().threadId()}") morpho[docId] = parserToolBridges[Thread.currentThread().threadId()]!!.parseText( tokens[docId]!!, morpho[docId], sentences[docId], texts[docId]!! 
) LOGGER.finer("Parsed text: $docId in thread ${Thread.currentThread().threadId()}") } if (outputFormat == OutputFormat.KORAPXML && annotationWorkerPool == null) { korapXmlOutput(getMorphoFoundry(), docId) } else { conlluOutput(foundry, docId) } } if (annotationWorkerPool != null) { annotationWorkerPool?.pushToQueue(output.append("\n# eot\n").toString()) // Release internal char[] early output.setLength(0) } else if (outputFormat != OutputFormat.KORAPXML) { synchronized(System.out) { println(output.toString()) } // Release internal char[] early output.setLength(0) } else { korapXmlOutput(foundry, docId) } arrayOf(tokens, texts, sentences, morpho, fnames, metadata, extraFeatures).forEach { map -> if (map === morpho) { // Clear inner map to release references early morpho[docId]?.clear() } map.remove(docId) } // Periodic GC hint after processing many docs (lightweight safeguard) if ((processedDocs.incrementAndGet() % 2000) == 0) { LOGGER.fine("Processed ${processedDocs.get()} docs – requesting GC hint") System.gc() } // Memory / cache statistics logging if (memStatsInterval > 0) { val count = processedDocs.get() if (count % memStatsInterval == 0) { logMemoryStats(count) } } if (outputFormat == OutputFormat.KORAPXML) { val entryPath = if (parserName != null) docId.replace(Regex("[_.]"), "/").plus("/$parserName/").plus("dependency.xml") else docId.replace(Regex("[_.]"), "/").plus("/$morphoFoundry/").plus("morpho.xml") val zipEntry = ZipArchiveEntry(entryPath) zipEntry.unixMode = ZIP_ENTRY_UNIX_MODE synchronized(morphoZipOutputStream!!) { morphoZipOutputStream!!.putArchiveEntry(zipEntry) morphoZipOutputStream!!.write(output.toString().toByteArray()) morphoZipOutputStream!!.closeArchiveEntry() } output.clear() } } private fun getMorphoFoundry() = taggerToolBridges[Thread.currentThread().threadId()]?.foundry ?: "base" private fun logMemoryStats(count: Int) { try { val rt = Runtime.getRuntime() val used = (rt.totalMemory() - rt.freeMemory()) / (1024 * 1024) val total = rt.totalMemory() / (1024 * 1024) val max = rt.maxMemory() / (1024 * 1024) LOGGER.info( "MEM-STATS docs=${count} usedMB=${used} totalMB=${total} maxMB=${max} " + "maps{texts=${texts.size},tokens=${tokens.size},sentences=${sentences.size},morpho=${morpho.size}}" ) } catch (e: Exception) { LOGGER.warning("Failed to log memory stats: ${e.message}") } } private fun korapXmlDependencyOutput(foundry: String, docId: String): StringBuilder { val doc: Document = dBuilder!!.newDocument() // Root element val layer = doc.createElement("layer") layer.setAttribute("xmlns", "http://ids-mannheim.de/ns/KorAP") layer.setAttribute("version", "KorAP-0.4") layer.setAttribute("docid", docId) doc.appendChild(layer) val spanList = doc.createElement("spanList") layer.appendChild(spanList) var i = 0 var s = 0 var n = 0 val sortedKeys = morpho[docId]?.keys?.sortedBy { it.split("-")[0].toInt() } sortedKeys?.forEach { spanString -> val mfs = morpho[docId]?.get(spanString) val offsets = spanString.split("-") if(offsets.size != 2) { LOGGER.warning("Invalid span: $spanString in $docId") return@forEach } if (offsets[0].toInt() > sentences[docId]!!.elementAt(s).to) { s++ n = i } i++ if (mfs!!.deprel == "_") { return@forEach } val spanNode = doc.createElement("span") spanNode.setAttribute("id", "s${s + 1}_n${i - n}") spanNode.setAttribute("from", offsets[0]) spanNode.setAttribute("to", offsets[1]) // rel element val rel = doc.createElement("rel") rel.setAttribute("label", mfs.deprel) // inner span element val innerSpan = doc.createElement("span") val headInt = 
if(mfs.head == "_") 0 else parseInt(mfs.head) - 1 if (headInt < 0) { innerSpan.setAttribute("from", sentences[docId]!!.elementAt(s).from.toString()) innerSpan.setAttribute("to", sentences[docId]!!.elementAt(s).to.toString()) } else { if (headInt + n >= morpho[docId]!!.size) { LOGGER.warning("Head index out of bounds: ${headInt+n} >= ${morpho[docId]!!.size} in $docId") return@forEach } else { val destSpanString = sortedKeys.elementAt(headInt + n) val destOffsets = destSpanString.split("-") innerSpan.setAttribute("from", destOffsets[0]) innerSpan.setAttribute("to", destOffsets[1]) } } rel.appendChild(innerSpan) spanNode.appendChild(rel) spanList.appendChild(spanNode) } val transformerFactory = TransformerFactory.newInstance() val transformer = transformerFactory.newTransformer() transformer.setOutputProperty(OutputKeys.INDENT, "yes") transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no") transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "1") val domSource = DOMSource(doc) val streamResult = StreamResult(StringWriter()) transformer.transform(domSource, streamResult) return StringBuilder(streamResult.writer.toString()) } private fun korapXmlOutput(foundry: String, docId: String): StringBuilder { return if (parserName != null) { korapXmlDependencyOutput(foundry, docId) } else { korapXmlMorphoOutput(foundry, docId) } } private fun korapXmlMorphoOutput(foundry: String, docId: String): StringBuilder { val doc: Document = dBuilder!!.newDocument() // Root element val layer = doc.createElement("layer") layer.setAttribute("xmlns", "http://ids-mannheim.de/ns/KorAP") layer.setAttribute("version", "KorAP-0.4") layer.setAttribute("docid", docId) doc.appendChild(layer) val spanList = doc.createElement("spanList") layer.appendChild(spanList) var i = 0 morpho[docId]?.forEach { (spanString, mfs) -> i++ val offsets = spanString.split("-") val spanNode = doc.createElement("span") spanNode.setAttribute("id", "t_$i") spanNode.setAttribute("from", offsets[0]) spanNode.setAttribute("to", offsets[1]) // fs element val fs = doc.createElement("fs") fs.setAttribute("type", "lex") fs.setAttribute("xmlns", "http://www.tei-c.org/ns/1.0") spanNode.appendChild(fs) val f = doc.createElement("f") f.setAttribute("name", "lex") fs.appendChild(f) // Inner fs element val innerFs = doc.createElement("fs") f.appendChild(innerFs) if (mfs.lemma != "_") { val innerF = doc.createElement("f") innerF.setAttribute("name", "lemma") innerF.textContent = mfs.lemma innerFs.appendChild(innerF) } if (mfs.upos != "_") { val innerF = doc.createElement("f") innerF.setAttribute("name", "upos") innerF.textContent = mfs.upos innerFs.appendChild(innerF) } if (mfs.xpos != "_") { val innerF = doc.createElement("f") innerF.setAttribute("name", "pos") innerF.textContent = mfs.xpos innerFs.appendChild(innerF) } if (mfs.feats != "_") { val innerF = doc.createElement("f") innerF.setAttribute("name", "msd") innerF.textContent = mfs.feats innerFs.appendChild(innerF) } if (mfs.misc != "_" && mfs.misc!!.matches(Regex("^[0-9.]+$"))) { val innerF = doc.createElement("f") innerF.setAttribute("name", "certainty") innerF.textContent = mfs.misc innerFs.appendChild(innerF) } spanList.appendChild(spanNode) } val transformerFactory = TransformerFactory.newInstance() val transformer = transformerFactory.newTransformer() transformer.setOutputProperty(OutputKeys.INDENT, "yes") transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no") transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "1") val 
domSource = DOMSource(doc) val streamResult = StreamResult(StringWriter()) transformer.transform(domSource, streamResult) return StringBuilder(streamResult.writer.toString()) } private fun conlluOutput(foundry: String, docId: String): StringBuilder { var token_index = 0 var real_token_index = 0 var sentence_index = 0 val output: StringBuilder output = StringBuilder("# foundry = $foundry\n# filename = ${fnames[docId]}\n# text_id = $docId\n").append( tokenOffsetsInSentence( sentences, docId, sentence_index, real_token_index, tokens ) ) if (extractMetadataRegex.isNotEmpty()) { output.append(metadata[docId]?.joinToString("\t", prefix = "# metadata=", postfix = "\n") ?: "") } var previousSpanStart = 0 tokens[docId]?.forEach { span -> token_index++ if (sentence_index >= sentences[docId]!!.size || span.from >= sentences[docId]!![sentence_index].to) { output.append("\n") sentence_index++ token_index = 1 output.append( tokenOffsetsInSentence( sentences, docId, sentence_index, real_token_index, tokens ) ) } if (extractAttributesRegex.isNotEmpty() && extraFeatures[docId] != null) { for (i in previousSpanStart until span.from + 1) { if (extraFeatures[docId]?.containsKey("$i") == true) { output.append(extraFeatures[docId]!!["$i"]) extraFeatures[docId]!!.remove("$i") } } previousSpanStart = span.from + 1 } if (morpho[docId]?.containsKey("${span.from}-${span.to}") == true) { val mfs = morpho[docId]!!["${span.from}-${span.to}"] if (span.to > texts[docId]!!.length) { span.to = texts[docId]!!.length LOGGER.warning( "Offset error: could not retrieve token at ${span.from}-${span.to} – ending with: ${ texts[docId]!!.substring( span.from, span.to ) }" ) } output.append( printConlluToken( token_index, texts[docId]!!.substring(span.from, span.to), mfs!!.lemma!!, mfs.upos!!, mfs.xpos!!, mfs.feats!!, mfs.head!!, mfs.deprel!!, mfs.deps!!, mfs.misc!!, columns ) ) } else { output.append( printConlluToken( token_index, texts[docId]!!.substring(span.from, span.to), columns = columns ) ) } real_token_index++ } return output } private fun lmTrainingOutput(docId: String): StringBuilder { var token_index = 0 var real_token_index = 0 var sentence_index = 0 val output: StringBuilder output = StringBuilder() if (extractMetadataRegex.isNotEmpty()) { output.append(metadata[docId]?.joinToString("\t", postfix = "\t") ?: "") } // If no text is available (e.g., lemma-only mode), emit lemmas if (texts[docId] == null) { tokens[docId]?.forEach { span -> val key = "${span.from}-${span.to}" val lemmaVal = morpho[docId]?.get(key)?.lemma output.append((lemmaVal?.takeIf { it != "_" } ?: "_"), " ") } if (output.isNotEmpty()) output.deleteCharAt(output.length - 1) return output } tokens[docId]?.forEach { span -> token_index++ if (sentences[docId] != null && (sentence_index >= sentences[docId]!!.size || span.from >= sentences[docId]!![sentence_index].to)) { if (output.isNotEmpty()) { output.setCharAt(output.length - 1, '\n') } else { output.append("\n") } if (extractMetadataRegex.isNotEmpty() && real_token_index < tokens[docId]!!.size - 1) { output.append(metadata[docId]?.joinToString("\t", postfix = "\t") ?: "") } sentence_index++ } // Bounds safety val safeFrom = span.from.coerceIn(0, texts[docId]!!.length) val safeTo = span.to.coerceIn(safeFrom, texts[docId]!!.length) if (useLemma && morpho[docId] != null) { val key = "${span.from}-${span.to}" val lemmaVal = morpho[docId]!![key]?.lemma if (lemmaVal != null && lemmaVal != "_") { output.append(lemmaVal) output.append(' ') } else { texts[docId]!!.appendRangeTo(output, safeFrom, safeTo) 
output.append(' ') } } else { texts[docId]!!.appendRangeTo(output, safeFrom, safeTo) output.append(' ') } real_token_index++ } if (output.isNotEmpty()) { output.deleteCharAt(output.length - 1) } return output } private fun nowOutput(docId: String): StringBuilder { var token_index = 0 var real_token_index = 0 var sentence_index = 0 val output: StringBuilder = StringBuilder() // Add the text sigle prefix output.append("@@$docId ") if (texts[docId] == null) { // Lemma-only fallback when original text is not loaded tokens[docId]?.forEach { span -> if (sentences[docId] != null && (sentence_index >= sentences[docId]!!.size || span.from >= sentences[docId]!![sentence_index].to)) { if (output.isNotEmpty() && !output.endsWith("@@$docId ")) { output.append(" <p> ") } sentence_index++ } val key = "${span.from}-${span.to}" val lemmaVal = morpho[docId]?.get(key)?.lemma output.append((lemmaVal?.takeIf { it != "_" } ?: "_"), " ") } if (output.isNotEmpty() && output.endsWith(" ")) { output.deleteCharAt(output.length - 1) } return output } tokens[docId]?.forEach { span -> token_index++ if (sentences[docId] != null && (sentence_index >= sentences[docId]!!.size || span.from >= sentences[docId]!![sentence_index].to)) { // Replace sentence end with <p> tag instead of newline if (output.isNotEmpty() && !output.endsWith("@@$docId ")) { output.append(" <p> ") } sentence_index++ } // Bounds safety val safeFrom = span.from.coerceIn(0, texts[docId]!!.length) val safeTo = span.to.coerceIn(safeFrom, texts[docId]!!.length) if (useLemma && morpho[docId] != null) { val key = "${span.from}-${span.to}" val lemmaVal = morpho[docId]!![key]?.lemma if (lemmaVal != null && lemmaVal != "_") { output.append(lemmaVal) output.append(' ') } else { texts[docId]!!.appendRangeTo(output, safeFrom, safeTo) output.append(' ') } } else { texts[docId]!!.appendRangeTo(output, safeFrom, safeTo) output.append(' ') } real_token_index++ } // Remove trailing space and add final newline if (output.isNotEmpty() && output.endsWith(" ")) { output.deleteCharAt(output.length - 1) } return output } private fun printConlluToken( token_index: Int, token: String, lemma: String = "_", upos: String = "_", xpos: String = "_", feats: String = "_", head: String = "_", deprel: String = "_", deps: String = "_", misc: String = "_", columns: Int = 10 ): String { val myUpos = if (COMPATIBILITY_MODE && upos == "_") xpos else upos return when (columns) { 1 -> ("$token\n") 10 -> ("$token_index\t$token\t$lemma\t$myUpos\t$xpos\t$feats\t$head\t$deprel\t$deps\t$misc$tokenSeparator") else -> { val fields = listOf( token_index.toString(), token, lemma, myUpos, xpos, feats, head, deprel, deps, misc ) fields.subList(0, min(columns, 10)).joinToString("\t", postfix = tokenSeparator) } } } private fun tokenOffsetsInSentence( sentences: ConcurrentHashMap<String, Array<Span>>, docId: String, sentence_index: Int, token_index: Int, tokens: ConcurrentHashMap<String, Array<Span>> ): String { if (sentences[docId] == null || sentences[docId]!!.size <= sentence_index) { return "" } val sentenceEndOffset = sentences[docId]!![sentence_index].to var i = token_index val start_offsets_string = StringBuilder() val end_offsets_string = StringBuilder() while (tokens[docId] != null && i < tokens[docId]!!.size && tokens[docId]!![i].to <= sentenceEndOffset) { start_offsets_string.append(" ", tokens[docId]!![i].from) end_offsets_string.append(" ", tokens[docId]!![i].to) i++ } return ( StringBuilder() .append( "# start_offsets = ", tokens[docId]!![token_index].from, start_offsets_string, "\n", "# 
end_offsets = ", sentenceEndOffset, end_offsets_string, "\n" ).toString()) } private fun extractSpans(spans: NodeList): Array<Span> { val list = ArrayList<Span>() IntStream.range(0, spans.length).forEach { idx -> val node = spans.item(idx) if (node is Element) { val fromAttr = node.getAttribute("from") val toAttr = node.getAttribute("to") if (fromAttr.isNullOrEmpty() || toAttr.isNullOrEmpty()) { LOGGER.warning("Skipping span with empty from/to attribute: from='$fromAttr' to='$toAttr'") } else { try { val from = Integer.parseInt(fromAttr) val to = Integer.parseInt(toAttr) list.add(Span(from, to)) } catch (e: NumberFormatException) { LOGGER.warning("Skipping span with invalid numeric offsets: from='$fromAttr' to='$toAttr' : ${e.message}") } } } } return list.toTypedArray() } private fun extractMorphoSpans( fsSpans: NodeList ): MutableMap<String, MorphoSpan> { val UNKNOWN = Regex("(UNKNOWN|<unknown>)") val res: MutableMap<String, MorphoSpan> = HashMap() IntStream.range(0, fsSpans.length).mapToObj(fsSpans::item).filter { node -> node is Element && node.getAttribute("type") != "alt" }.forEach { node -> val features = (node as Element).getElementsByTagName("f") val fs = MorphoSpan() val fromTo = "${node.getAttribute("from")}-${node.getAttribute("to")}" IntStream.range(0, features.length).mapToObj(features::item).forEach { feature -> val attr = (feature as Element).getAttribute("name") val value = feature.textContent.trim() if (value.isEmpty()) return@forEach when (attr) { "lemma" -> if(fs.lemma == "_") fs.lemma = value.replace(UNKNOWN, "--") "upos" -> fs.upos = value "xpos", "ctag", "pos" -> if(fs.xpos == "_") fs.xpos = value.replace(UNKNOWN, "--") "feats", "msd" -> if(fs.feats == "_" ) fs.feats = value "type" -> if(fs.feats == "_") fs.feats = feature.getElementsByTagName("symbol").item(0).attributes.getNamedItem("value").textContent.trim() // "subtype" -> if(fs.feats == "_") fs.feats += ":" + feature.getElementsByTagName("symbol").item(0).attributes.getNamedItem("value").textContent "certainty" -> if(fs.misc == "_") fs.misc = value } } res[fromTo] = fs } return res } private fun extractSentenceSpans(spans: NodeList): Array<Span> { return IntStream.range(0, spans.length).mapToObj(spans::item) .filter { node -> node is Element && node.getElementsByTagName("f").item(0).textContent.equals("s") } .map { node -> Span( Integer.parseInt((node as Element).getAttribute("from")), Integer.parseInt(node.getAttribute("to")) ) }.toArray { size -> arrayOfNulls(size) } } /* <span id="s15" from="370" to="394" l="5"> <fs type="struct" xmlns="http://www.tei-c.org/ns/1.0"> <f name="name">posting</f> <f name="attr"> <fs type="attr"> <f name="id">i.10894_1_3</f> <f name="indentLevel">0</f> <f name="who">WU00000000</f> </fs> </f> </fs> </span> */ private fun extractMiscSpans(spans: NodeList): MutableMap<String, String> { val miscLocal: MutableMap<String, String> = HashMap() IntStream.range(0, spans.length).mapToObj(spans::item) .filter { node -> node is Element && node.getElementsByTagName("f").length > 1 && (node.getElementsByTagName("f").item(0) as Element).getAttribute("name").equals("name") && (node.getElementsByTagName("f").item(1) as Element).getAttribute("name").equals("attr") } .forEach { node -> if (node == null) return@forEach val elementName = (node as Element).getElementsByTagName("f").item(0).textContent.trim() val from = node.getAttribute("from") val attributes = (node.getElementsByTagName("f").item(1) as Element).getElementsByTagName("f") val res = StringBuilder() IntStream.range(0, 
attributes.length).mapToObj(attributes::item).forEach { attr -> val attrName = "$elementName/${(attr as Element).getAttribute("name")}" if (attrName.matches(Regex(extractAttributesRegex))) { res.append("# $attrName = ${attr.textContent}\n") //LOGGER.info("" + from + ": $attrName = " + attr.textContent) } } if (res.isNotEmpty()) { if (miscLocal.containsKey(from)) { // LOGGER.info("ADDING TO $from: ${miscLocal[from]}") miscLocal[from] += res.toString() } else { miscLocal[from] = res.toString() } } } return miscLocal } class Span(var from: Int, var to: Int) class MorphoSpan( var lemma: String? = "_", var upos: String? = "_", var xpos: String? = "_", var feats: String? = "_", var head: String? = "_", var deprel: String? = "_", var deps: String? = "_", var misc: String? = "_" ) } fun main(args: Array<String>): Unit = exitProcess(CommandLine(KorapXmlTool()).execute(*args)) fun debug(args: Array<String>): Int { return (CommandLine(KorapXmlTool()).execute(*args)) } enum class OutputFormat { CONLLU, WORD2VEC, KORAPXML, NOW } object ConlluOutputFormat { const val NAME = "conllu" } object Word2VecOutputFormat { const val NAME = "word2vec" } object KorapXmlOutputFormat { const val NAME = "korapxml" } object NowOutputFormat { const val NAME = "now" }" />
- </PendingDiffInfo>
- </value>
- </entry>
- <entry key="$PROJECT_DIR$/build.gradle">
- <value>
- <PendingDiffInfo>
- <option name="filePath" value="$PROJECT_DIR$/build.gradle" />
- <option name="originalContent" value="repositories { flatDir { dirs("libs") } } " />
- <option name="updatedContent" value="repositories { flatDir { dirs("libs") } } // Zentrale Projektversion für korapxmltool version = '2.0-beta-02'" />
- </PendingDiffInfo>
- </value>
- </entry>
- </map>
- </option>
- </component>
-</project>
\ No newline at end of file
diff --git a/app/build.gradle b/app/build.gradle
index ca078d2..701a18a 100644
--- a/app/build.gradle
+++ b/app/build.gradle
@@ -72,33 +72,51 @@
}
}
+// Explicitly inherit the subproject version from the root project
+version = rootProject.version
+
application {
// Define the main class for the application.
mainClass = 'de.ids_mannheim.korapxmltools.KorapXmlToolKt'
}
jar {
- // Will include every single one of your dependencies, project or not
- // def lowerCasedName = baseName.toLowerCase()
- // def normalizedName = lowerCasedName.substring(0,1).toUpperCase() + lowerCasedName.substring(1)
-
+ // Mark the standard JAR as "plain" to avoid conflicts with shadowJar
+ archiveClassifier.set('plain')
manifest.attributes(
'Class-Path': configurations.compileClasspath.collect { it.getName() }.join(' '),
'Main-Class': "de.ids_mannheim.korapxmltools.KorapXmlToolKt",
'Implementation-Title': rootProject.name,
'Implementation-Version': project.version
)
- shadowJar {
- archiveBaseName.set('korapxmltool')
- archiveClassifier.set('')
- // Include the version in the file name
- archiveVersion.set(project.version.toString())
- manifest.attributes(
- 'Main-Class': "de.ids_mannheim.korapxmltools.KorapXmlToolKt",
- 'Implementation-Title': rootProject.name,
- 'Implementation-Version': project.version
- )
- }
+}
+
+shadowJar {
+ archiveBaseName.set('korapxmltool')
+ archiveClassifier.set('')
+ archiveVersion.set(project.version.toString())
+ manifest.attributes(
+ 'Main-Class': "de.ids_mannheim.korapxmltools.KorapXmlToolKt",
+ 'Implementation-Title': rootProject.name,
+ 'Implementation-Version': project.version
+ )
+}
+
+// Make sure that assemble also produces the shadow JAR
+tasks.named('assemble') {
+ dependsOn tasks.named('shadowJar')
+}
+
+// Additionally create an unversioned copy korapxmltool.jar for stable script paths
+tasks.register('shadowJarLatest', Copy) {
+ dependsOn tasks.named('shadowJar')
+ from({ tasks.named('shadowJar').get().archiveFile.get().asFile })
+ into({ tasks.named('shadowJar').get().destinationDirectory.get().asFile })
+ rename { String _ -> 'korapxmltool.jar' }
+}
+
+tasks.named('build') {
+ dependsOn tasks.named('shadowJarLatest')
}
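
Note: with the plain classifier on jar and shadowJar hoisted to the top level, a build should now yield both the versioned fat jar (korapxmltool-2.01.jar) and, via shadowJarLatest, the unversioned copy korapxmltool.jar for stable script paths. A minimal Kotlin sketch to check the manifest of the stable-path jar; the path under app/build/libs is an assumption based on the default Gradle layout:

    import java.util.jar.JarFile

    fun main() {
        // Path assumed from the default Gradle layout and the shadowJarLatest copy task above
        val jar = JarFile("app/build/libs/korapxmltool.jar")
        val attrs = jar.manifest.mainAttributes
        // Both attributes are written by the manifest blocks introduced in this diff
        println(attrs.getValue("Implementation-Version")) // expected: 2.01
        println(attrs.getValue("Main-Class"))             // de.ids_mannheim.korapxmltools.KorapXmlToolKt
    }
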
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 67d4a95..a9b1333 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -46,7 +46,7 @@
@Command(
name = "KorapXmlTool",
mixinStandardHelpOptions = true,
- version = ["KorapXmlTool 2.0-beta-02"],
+ version = ["KorapXmlTool 2.01"],
description = ["Converts KorAP-XML <https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml> base or " +
"morpho zips to (annotated) CoNLL(-U) format with all information necessary for " +
"reconstruction in comment lines."]
diff --git a/build.gradle b/build.gradle
index 56cf4ab..ed0d174 100644
--- a/build.gradle
+++ b/build.gradle
@@ -5,4 +5,4 @@
}
// Central project version for korapxmltool
-version = '2.0-beta-02'
+version = '2.01'
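
With the root version set to 2.01 and inherited via version = rootProject.version, the CLI banner, both jar manifests, and the artifact file names should all agree. A small smoke-test sketch, assuming it is placed in the same package as the debug() helper defined in KorapXmlTool.kt:

    // mixinStandardHelpOptions = true lets picocli handle --version itself;
    // execute() returns 0 after printing "KorapXmlTool 2.01".
    fun versionSmokeTest() {
        val exitCode = debug(arrayOf("--version"))
        check(exitCode == 0) { "--version should exit cleanly" }
    }
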