Fix surplus lines in tagger/parser CoNLL-U
Change-Id: I63775dc794a83675dd47ec01e3be66539a21d0a1
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationWorkerPool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationWorkerPool.kt
index 85e7ef3..8736ba7 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationWorkerPool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/AnnotationWorkerPool.kt
@@ -102,7 +102,7 @@
}
if (task.text == "#eof") {
try {
- outputStreamWriter.write("\n# eof\n") // Send EOF to process
+ outputStreamWriter.write("# eof\n") // Send EOF to process
outputStreamWriter.flush()
} catch (e: IOException) {
// Log error, but proceed to close
@@ -115,7 +115,12 @@
}
pendingTasks.put(task)
try {
- val dataToSend = task.text + "\n# eot\n\n"
+ val trimmed = task.text.trimEnd()
+ val dataToSend = if (trimmed.isEmpty()) {
+ "# eot\n"
+ } else {
+ trimmed + "\n\n# eot\n"
+ }
LOGGER.fine("Worker $workerIndex: Sending ${dataToSend.length} chars to external process")
LOGGER.finer("Worker $workerIndex: First 500 chars of data to send:\n${dataToSend.take(500)}")
outputStreamWriter.write(dataToSend)
@@ -228,10 +233,10 @@
lastLineWasEmpty = true
}
}
- else -> {
- output.append(line).append('\n')
- lastLineWasEmpty = false
- }
+ else -> {
+ output.append(line).append('\n')
+ lastLineWasEmpty = false
+ }
}
}
}
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/ConlluFormatter.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/ConlluFormatter.kt
index d406aba..d9a60fd 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/ConlluFormatter.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/formatters/ConlluFormatter.kt
@@ -99,7 +99,7 @@
var tokenText: String = if (textVal != null) {
val safeFrom = span.from.coerceIn(0, textVal.length)
val safeTo = span.to.coerceIn(safeFrom, textVal.length)
- textVal.substring(safeFrom, safeTo)
+ textVal.substring(safeFrom, safeTo).replace(Regex("[\\t\\n\\r]"), " ")
} else "_"
if (tokenText.isBlank()) {