Complete CoNLL-U output
Change-Id: I5bb862db6f76330e58a31c54f44c36c18b9ea1d8
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxml2conllu/App.kt b/app/src/main/kotlin/de/ids_mannheim/korapxml2conllu/App.kt
index e49b01a..0f905d3 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxml2conllu/App.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxml2conllu/App.kt
@@ -97,6 +97,7 @@
synchronized(System.out) {
println("# foundry = base")
println("# filename = $tokens_fname")
+ println("# text_id = $docId")
printTokenOffsetsInSentence(sentences, docId, sentence_index, real_token_index, tokens)
tokens[docId]?.forEach { span ->
token_index++
@@ -106,10 +107,13 @@
token_index = 1
printTokenOffsetsInSentence(sentences, docId, sentence_index, real_token_index, tokens)
}
- println("$token_index\t${span.from}\t${span.to}\t${sentences[docId]!![sentence_index].to}\t" + (texts[docId]?.substring(span.from, span.to) ?: ""))
+ printConlluToken(token_index, texts[docId]!!.substring(span.from, span.to) )
real_token_index++
}
+ arrayOf(tokens, texts, sentences).forEach { map ->
+ map.remove(docId)
+ }
}
}
@@ -124,6 +128,30 @@
}
}
+ /**
+ * Prints one token as a single tab-separated CoNLL-U line on standard output.
+ *
+ * CoNLL-U token lines consist of ten fields (ID, FORM, LEMMA, UPOS, XPOS, FEATS,
+ * HEAD, DEPREL, DEPS, MISC); all ten are written, and every annotation field
+ * defaults to "_", the CoNLL-U placeholder for an unspecified value.
+ *
+ * @param token_index value for the ID column
+ * @param token value for the FORM column
+ */
+ private fun printConlluToken(
+ token_index: Int,
+ token: String,
+ lemma: String = "_",
+ upos: String = "_",
+ xpos: String = "_",
+ feats: String = "_",
+ head: String = "_",
+ deprel: String = "_",
+ deps: String = "_",
+ misc: String = "_"
+ ) {
+ println("$token_index\t$token\t$lemma\t$upos\t$xpos\t$feats\t$head\t$deprel\t$deps\t$misc")
+ }
private fun printTokenOffsetsInSentence(
sentences: ConcurrentHashMap<String, Array<Span>>,
docId: String,