Add NOW corpus export format
Each text is written on a single line, prefixed with @@<text-sigle>,
with <p> tags marking sentence ends.
Change-Id: I559dd27f55bb90a10f86e10b1d9c4d74536148b6
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 9932cc0..100d78d 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -61,10 +61,11 @@
@Option(
names = ["-f", "--output-format"],
- description = ["Output format: ${ConlluOutputFormat.NAME}, ${Word2VecOutputFormat.NAME}, ${KorapXmlOutputFormat.NAME}",
+ description = ["Output format: ${ConlluOutputFormat.NAME}, ${Word2VecOutputFormat.NAME}, ${KorapXmlOutputFormat.NAME}, ${NowOutputFormat.NAME}",
"conllu: CoNLL-U format",
"korapxml, xml, zip: KorAP-XML format zip",
"word2vec, w2v: Print text in LM training format: tokens separated by space, sentences separated by newlines",
+ "now, NOW: NOW corpus export format: w2v-like format with <p> tags for sentence ends and @@<text-sigle> prefix",
],
converter = [OutputFormatConverter::class]
)
@@ -75,6 +76,7 @@
"conllu", "conll" -> OutputFormat.CONLLU
"word2vec", "w2v" -> OutputFormat.WORD2VEC
"korapxml", "korap", "xml", "zip" -> OutputFormat.KORAPXML
+ "now", "NOW" -> OutputFormat.NOW
else -> throw IllegalArgumentException("Unknown output format: `$value'. Use one of: ${OutputFormat.entries.joinToString(", ") { it.name }}")
}
}
@@ -131,7 +133,7 @@
defaultValue = "\n",
description = ["Token separator. Default: new-line for CoNLL-U, space for word2vec format."]
)
- var tokenSeparator: String = if (outputFormat == OutputFormat.WORD2VEC) " " else "\n"
+ var tokenSeparator: String = if (outputFormat == OutputFormat.WORD2VEC || outputFormat == OutputFormat.NOW) " " else "\n"
@Option(names = ["--offsets"], description = ["Not yet implemented: offsets"])
var offsets: Boolean = false
@@ -527,6 +529,8 @@
val output =
if (outputFormat == OutputFormat.WORD2VEC) {
lmTrainingOutput(docId)
+ } else if (outputFormat == OutputFormat.NOW) {
+ nowOutput(docId)
} else {
if (taggerToolBridges[Thread.currentThread().id] != null) {
morpho[docId] = taggerToolBridges[Thread.currentThread().id]!!.tagText(
@@ -864,6 +868,40 @@
return output
}
+ private fun nowOutput(docId: String): StringBuilder {
+ var token_index = 0
+ var real_token_index = 0
+ var sentence_index = 0
+ val output: StringBuilder = StringBuilder()
+
+ // Add the text sigle prefix
+ output.append("@@$docId ")
+
+ if (texts[docId] == null) {
+ return output
+ }
+
+ tokens[docId]?.forEach { span ->
+ token_index++
+ if (sentences[docId] != null && (sentence_index >= sentences[docId]!!.size || span.from >= sentences[docId]!![sentence_index].to)) {
+ // Replace sentence end with <p> tag instead of newline
+ if (output.isNotEmpty() && !output.endsWith("@@$docId ")) {
+ output.append(" <p> ")
+ }
+ sentence_index++
+ }
+ output.append(texts[docId]!!.substring(span.from, span.to), " ")
+ real_token_index++
+ }
+
+ // Remove the trailing space; no newline is appended in this function
+ if (output.isNotEmpty() && output.endsWith(" ")) {
+ output.deleteCharAt(output.length - 1)
+ }
+
+ return output
+ }
+
private fun printConlluToken(
token_index: Int,
@@ -1033,7 +1071,7 @@
}
enum class OutputFormat {
- CONLLU, WORD2VEC, KORAPXML
+ CONLLU, WORD2VEC, KORAPXML, NOW
}
object ConlluOutputFormat {
@@ -1048,4 +1086,8 @@
const val NAME = "korapxml"
}
+object NowOutputFormat {
+ const val NAME = "now"
+}
+
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
index 288f4e5..c9d41d6 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
@@ -159,6 +159,24 @@
}
@Test
+ fun nowOptionWorks() {
+ val args = arrayOf("-f", "now", loadResource("wdf19.zip").path)
+ debug(args)
+ val output = outContent.toString()
+ // Check that output starts with @@<text-sigle>
+ assertContains(output, "@@WDF19_A0000.")
+ // Check that sentence boundaries are replaced with <p> tags
+ assertContains(output, " <p> ")
+ // Check that it contains the expected text content
+ assertContains(output, "Arts visuels Pourquoi toujours vouloir")
+ // Check that it doesn't contain CoNLL-U format markers
+ assertFalse(output.contains("# foundry"))
+ // Check that each text is on one line (no newlines within text except at end)
+ val lines = output.trim().split('\n')
+ assertTrue(lines.all { it.startsWith("@@") })
+ }
+
+ @Test
fun canConvertXMLwithInvalidComments() {
val args = arrayOf("-w", zca20scrambled)
debug(args)