Extract Word2Vec and NOW output formatters into separate classes
Change-Id: I56130674cc9c816057e80f4e27b91f598e21fe93
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index a9559ba..002c40c 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -1906,9 +1906,9 @@
val output =
if (outputFormat == OutputFormat.WORD2VEC) {
- lmTrainingOutput(docId)
+ formatWord2VecOutput(docId)
} else if (outputFormat == OutputFormat.NOW) {
- nowOutput(docId)
+ formatNowOutput(docId)
} else {
if (taggerToolBridges[Thread.currentThread().threadId()] != null) {
morpho[docId] = taggerToolBridges[Thread.currentThread().threadId()]!!.tagText(
@@ -2559,100 +2559,41 @@
return comments
}
- private fun lmTrainingOutput(docId: String): StringBuilder {
- var token_index = 0
- var real_token_index = 0
- var sentence_index = 0
- val output = StringBuilder()
- if (extractMetadataRegex.isNotEmpty()) {
- output.append(metadata[docId]?.joinToString("\t", postfix = "\t") ?: "")
- }
- if (texts[docId] == null) {
- tokens[docId]?.forEach { span ->
- val key = "${span.from}-${span.to}"
- val lemmaVal = morpho[docId]?.get(key)?.lemma
- output.append((lemmaVal?.takeIf { it != "_" } ?: "_"), " ")
- }
- if (output.isNotEmpty()) output.deleteCharAt(output.length - 1)
- return output
- }
- tokens[docId]?.forEach { span ->
- token_index++
- if (sentences[docId] != null && (sentence_index >= sentences[docId]!!.size || span.from >= sentences[docId]!![sentence_index].to)) {
- if (output.isNotEmpty()) output.setCharAt(output.length - 1, '\n') else output.append("\n")
- if (extractMetadataRegex.isNotEmpty() && real_token_index < tokens[docId]!!.size - 1) {
- output.append(metadata[docId]?.joinToString("\t", postfix = "\t") ?: "")
- }
- sentence_index++
- }
- val safeFrom = span.from.coerceIn(0, texts[docId]!!.length)
- val safeTo = span.to.coerceIn(safeFrom, texts[docId]!!.length)
- if (useLemma && morpho[docId] != null) {
- val key = "${span.from}-${span.to}"
- val lemmaVal = morpho[docId]!![key]?.lemma
- if (lemmaVal != null && lemmaVal != "_") {
- output.append(lemmaVal).append(' ')
- } else {
- texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)
- output.append(' ')
- }
- } else {
- texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)
- output.append(' ')
- }
- real_token_index++
- }
- if (output.isNotEmpty()) output.deleteCharAt(output.length - 1)
- return output
+    /**
+     * Renders document [docId] in word2vec format.
+     *
+     * Collects this document's entries from the tool's per-document collections together with the
+     * relevant output settings into an `OutputContext` and hands it to `Word2VecFormatter`.
+     */
+    private fun formatWord2VecOutput(docId: String): StringBuilder {
+        return de.ids_mannheim.korapxmltools.formatters.Word2VecFormatter.format(
+            de.ids_mannheim.korapxmltools.formatters.OutputContext(
+                docId = docId,
+                foundry = "base",
+                tokens = tokens[docId],
+                sentences = sentences[docId],
+                text = texts[docId],
+                morpho = morpho[docId],
+                metadata = metadata[docId],
+                extraFeatures = extraFeatures[docId],
+                fileName = fnames[docId],
+                useLemma = useLemma,
+                extractMetadataRegex = extractMetadataRegex,
+                columns = columns,
+            ),
+        )
     }
- private fun nowOutput(docId: String): StringBuilder {
- var token_index = 0
- var real_token_index = 0
- var sentence_index = 0
- val output = StringBuilder()
-
- output.append("@@$docId ")
-
- if (texts[docId] == null) {
- tokens[docId]?.forEach { span ->
- if (sentences[docId] != null && (sentence_index >= sentences[docId]!!.size || span.from >= sentences[docId]!![sentence_index].to)) {
- if (output.isNotEmpty() && !output.endsWith("@@$docId ")) output.append(" <p> ")
- sentence_index++
- }
- val key = "${span.from}-${span.to}"
- val lemmaVal = morpho[docId]?.get(key)?.lemma
- output.append((lemmaVal?.takeIf { it != "_" } ?: "_"), " ")
- }
- if (output.isNotEmpty() && output.endsWith(" ")) output.deleteCharAt(output.length - 1)
- return output
- }
-
- tokens[docId]?.forEach { span ->
- token_index++
- if (sentences[docId] != null && (sentence_index >= sentences[docId]!!.size || span.from >= sentences[docId]!![sentence_index].to)) {
- if (output.isNotEmpty() && !output.endsWith("@@$docId ")) output.append(" <p> ")
- sentence_index++
- }
- val safeFrom = span.from.coerceIn(0, texts[docId]!!.length)
- val safeTo = span.to.coerceIn(safeFrom, texts[docId]!!.length)
- if (useLemma && morpho[docId] != null) {
- val key = "${span.from}-${span.to}"
- val lemmaVal = morpho[docId]!![key]?.lemma
- if (lemmaVal != null && lemmaVal != "_") {
- output.append(lemmaVal).append(' ')
- } else {
- texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)
- output.append(' ')
- }
- } else {
- texts[docId]!!.appendRangeTo(output, safeFrom, safeTo)
- output.append(' ')
- }
- real_token_index++
- }
- if (output.isNotEmpty() && output.endsWith(" ")) output.deleteCharAt(output.length - 1)
- return output
+    /**
+     * Renders document [docId] in NOW format.
+     *
+     * Collects this document's entries from the tool's per-document collections together with the
+     * relevant output settings into an `OutputContext` and hands it to `NowFormatter`.
+     */
+    private fun formatNowOutput(docId: String): StringBuilder {
+        return de.ids_mannheim.korapxmltools.formatters.NowFormatter.format(
+            de.ids_mannheim.korapxmltools.formatters.OutputContext(
+                docId = docId,
+                foundry = "base",
+                tokens = tokens[docId],
+                sentences = sentences[docId],
+                text = texts[docId],
+                morpho = morpho[docId],
+                metadata = metadata[docId],
+                extraFeatures = extraFeatures[docId],
+                fileName = fnames[docId],
+                useLemma = useLemma,
+                extractMetadataRegex = extractMetadataRegex,
+                columns = columns,
+            ),
+        )
     }
private fun printConlluToken(
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/NowFormatter.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/NowFormatter.kt
new file mode 100644
index 0000000..5dcd8ac
--- /dev/null
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/NowFormatter.kt
@@ -0,0 +1,69 @@
+package de.ids_mannheim.korapxmltools.formatters
+
+/**
+ * Formatter for the NOW corpus export format.
+ *
+ * Writes the prefix `@@<docId> ` followed by the document's tokens separated by single spaces;
+ * ` <p> ` is appended whenever a sentence boundary is detected, except right after the prefix.
+ *
+ * Token text is chosen as follows:
+ * - no primary text in the context: the token's lemma, or `_` if it has none;
+ * - `useLemma` set and a lemma other than `_` exists: the lemma;
+ * - otherwise: the surface form cut from the text (offsets clamped to the text length).
+ */
+object NowFormatter : OutputFormatter {
+    override val formatName: String = "now"
+
+    /** Marker appended when a sentence boundary is detected. */
+    private const val SENTENCE_DELIMITER = " <p> "
+
+    /**
+     * Renders the document described by [context] in NOW format.
+     *
+     * @param context per-document data; `tokens`, `sentences`, `text` and `morpho` may be null
+     * @return a new [StringBuilder]; a single trailing space, if present, has been removed
+     */
+    override fun format(context: OutputContext): StringBuilder {
+        val docPrefix = "@@${context.docId} "
+        val output = StringBuilder(docPrefix)
+        val text = context.text
+        val morpho = context.morpho
+        val sentences = context.sentences
+        var sentenceIndex = 0
+
+        context.tokens?.forEach { span ->
+            // A token starting at or after the end of the current sentence opens the next one;
+            // once the sentence list is exhausted, every further token counts as a boundary.
+            if (sentences != null &&
+                (sentenceIndex >= sentences.size || span.from >= sentences[sentenceIndex].to)
+            ) {
+                // No delimiter while the output still ends with the document prefix.
+                if (!output.endsWith(docPrefix)) output.append(SENTENCE_DELIMITER)
+                sentenceIndex++
+            }
+
+            // Lemmas are only looked up when they can be used; "_" marks a missing lemma.
+            val lemma = if (text == null || context.useLemma) {
+                morpho?.get("${span.from}-${span.to}")?.lemma?.takeIf { it != "_" }
+            } else {
+                null
+            }
+
+            when {
+                text == null -> output.append(lemma ?: "_")
+                lemma != null -> output.append(lemma)
+                else -> {
+                    // Clamp both offsets into the text, keeping from <= to.
+                    val safeFrom = span.from.coerceIn(0, text.length)
+                    val safeTo = span.to.coerceIn(safeFrom, text.length)
+                    text.appendRangeTo(output, safeFrom, safeTo)
+                }
+            }
+            output.append(' ')
+        }
+
+        // Drop the space left after the last token (or after the prefix if there were no tokens).
+        if (output.endsWith(" ")) output.deleteCharAt(output.length - 1)
+        return output
+    }
+}
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/OutputFormatter.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/OutputFormatter.kt
new file mode 100644
index 0000000..584bf5c
--- /dev/null
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/OutputFormatter.kt
@@ -0,0 +1,82 @@
+package de.ids_mannheim.korapxmltools.formatters
+
+import de.ids_mannheim.korapxmltools.KorapXmlTool
+import de.ids_mannheim.korapxmltools.NonBmpString
+
+/**
+ * Common data structure passed to all output formatters.
+ * Contains all the document data that might be needed by any formatter.
+ *
+ * [equals] and [hashCode] are overridden because the generated data-class versions would compare
+ * the array properties by reference; here arrays are compared element by element.
+ *
+ * @property docId identifier of the document being formatted
+ * @property tokens token spans of the document, or null if none are available
+ * @property sentences sentence spans of the document, or null if none are available
+ * @property text primary text of the document, or null if it is not available
+ * @property morpho morphological annotations, keyed by `"<from>-<to>"` of the annotated span
+ * @property metadata metadata values of the document, or null if none are available
+ * @property useLemma whether formatters should prefer lemmas over surface forms
+ * @property extractMetadataRegex when non-empty, Word2VecFormatter writes [metadata] to its output
+ */
+data class OutputContext(
+    val docId: String,
+    val foundry: String,
+    val tokens: Array<KorapXmlTool.Span>?,
+    val sentences: Array<KorapXmlTool.Span>?,
+    val text: NonBmpString?,
+    val morpho: MutableMap<String, KorapXmlTool.MorphoSpan>?,
+    val metadata: Array<String>?,
+    val extraFeatures: MutableMap<String, String>?,
+    val fileName: String?,
+    val useLemma: Boolean,
+    val extractMetadataRegex: List<String>,
+    val columns: Int = 10,
+) {
+    override fun equals(other: Any?): Boolean {
+        if (this === other) return true
+        if (other !is OutputContext) return false
+        // Arrays need contentEquals; == would only compare their references.
+        return docId == other.docId &&
+            foundry == other.foundry &&
+            tokens.contentEquals(other.tokens) &&
+            sentences.contentEquals(other.sentences) &&
+            text == other.text &&
+            morpho == other.morpho &&
+            metadata.contentEquals(other.metadata) &&
+            extraFeatures == other.extraFeatures &&
+            fileName == other.fileName &&
+            useLemma == other.useLemma &&
+            extractMetadataRegex == other.extractMetadataRegex &&
+            columns == other.columns
+    }
+
+    override fun hashCode(): Int {
+        var result = docId.hashCode()
+        result = 31 * result + foundry.hashCode()
+        result = 31 * result + tokens.contentHashCode()
+        result = 31 * result + sentences.contentHashCode()
+        result = 31 * result + text.hashCode()
+        result = 31 * result + morpho.hashCode()
+        result = 31 * result + metadata.contentHashCode()
+        result = 31 * result + extraFeatures.hashCode()
+        result = 31 * result + fileName.hashCode()
+        result = 31 * result + useLemma.hashCode()
+        result = 31 * result + extractMetadataRegex.hashCode()
+        result = 31 * result + columns
+        return result
+    }
+}
+
+/**
+ * Base interface for all output formatters.
+ */
+interface OutputFormatter {
+    /** Name of this output format (e.g., "word2vec", "conllu"). */
+    val formatName: String
+
+    /**
+     * Format the given document data and return the output as a StringBuilder.
+     */
+    fun format(context: OutputContext): StringBuilder
+}
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/Word2VecFormatter.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/Word2VecFormatter.kt
new file mode 100644
index 0000000..c45883e
--- /dev/null
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/Word2VecFormatter.kt
@@ -0,0 +1,87 @@
+package de.ids_mannheim.korapxmltools.formatters
+
+/**
+ * Formatter for Word2Vec / language model training output.
+ *
+ * Tokens are separated by single spaces and a newline is written wherever a sentence boundary is
+ * detected. When `extractMetadataRegex` is non-empty, the document's metadata values are written
+ * tab-separated with a trailing tab at the start of the output and after each such newline,
+ * except when the boundary is detected at the last token.
+ *
+ * Token text is chosen as follows:
+ * - no primary text in the context: the token's lemma, or `_` if it has none (sentence
+ *   boundaries are not evaluated in this mode);
+ * - `useLemma` set and a lemma other than `_` exists: the lemma;
+ * - otherwise: the surface form cut from the text (offsets clamped to the text length).
+ */
+object Word2VecFormatter : OutputFormatter {
+    override val formatName: String = "word2vec"
+
+    /**
+     * Renders the document described by [context] in word2vec format.
+     *
+     * @param context per-document data; `tokens`, `sentences`, `text`, `morpho` and `metadata`
+     *   may be null
+     * @return a new [StringBuilder]; its last character (normally the separator after the final
+     *   token) has already been removed
+     */
+    override fun format(context: OutputContext): StringBuilder {
+        val text = context.text
+        val morpho = context.morpho
+        val sentences = context.sentences
+        val tokens = context.tokens
+        val lastTokenIndex = (tokens?.size ?: 0) - 1
+
+        // The metadata prefix is the same for every sentence, so build it once.
+        val metadataPrefix = if (context.extractMetadataRegex.isNotEmpty()) {
+            context.metadata?.joinToString("\t", postfix = "\t") ?: ""
+        } else {
+            ""
+        }
+
+        val output = StringBuilder(metadataPrefix)
+        var sentenceIndex = 0
+
+        tokens?.forEachIndexed { tokenIndex, span ->
+            // Sentence boundaries are only evaluated when the primary text is available. A token
+            // starting at or after the end of the current sentence opens the next one; once the
+            // sentence list is exhausted, every further token counts as a boundary.
+            if (text != null && sentences != null &&
+                (sentenceIndex >= sentences.size || span.from >= sentences[sentenceIndex].to)
+            ) {
+                // Overwrite the last character (normally the space after the previous token).
+                if (output.isNotEmpty()) {
+                    output.setCharAt(output.length - 1, '\n')
+                } else {
+                    output.append('\n')
+                }
+                // No metadata prefix when the boundary is hit at the last token.
+                if (tokenIndex < lastTokenIndex) output.append(metadataPrefix)
+                sentenceIndex++
+            }
+
+            // Lemmas are only looked up when they can be used; "_" marks a missing lemma.
+            val lemma = if (text == null || context.useLemma) {
+                morpho?.get("${span.from}-${span.to}")?.lemma?.takeIf { it != "_" }
+            } else {
+                null
+            }
+
+            when {
+                text == null -> output.append(lemma ?: "_")
+                lemma != null -> output.append(lemma)
+                else -> {
+                    // Clamp both offsets into the text, keeping from <= to.
+                    val safeFrom = span.from.coerceIn(0, text.length)
+                    val safeTo = span.to.coerceIn(safeFrom, text.length)
+                    text.appendRangeTo(output, safeFrom, safeTo)
+                }
+            }
+            output.append(' ')
+        }
+
+        // Remove the last character, normally the space after the final token.
+        if (output.isNotEmpty()) output.deleteCharAt(output.length - 1)
+        return output
+    }
+}