Implement options -w, -p, -l
Change-Id: I8504e709f719bc31765c1710bb17caf830b1e426
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
index 32bc9c0..ae99794 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXml2Conllu.kt
@@ -3,7 +3,6 @@
import javax.xml.parsers.DocumentBuilder
import javax.xml.parsers.DocumentBuilderFactory
import java.io.InputStream
-import java.util.Arrays
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.ExecutorService
import java.util.concurrent.Executors
@@ -19,8 +18,9 @@
import picocli.CommandLine.Command
import java.io.File
import java.io.InputStreamReader
-import java.util.HashMap
+import java.util.*
import java.util.concurrent.Callable
+import java.util.logging.Level
import java.util.logging.Logger
import kotlin.system.exitProcess
@@ -37,8 +37,8 @@
var zipFileNames: Array<String>? = null
@Option(names = ["--sigle-pattern", "-p"], paramLabel = "PATTERN",
- description = ["Not yet implemented: sigle pattern"])
- var siglePattern: String = ""
+ description = ["Extract only documents with sigle matching the pattern (regex)"])
+ var siglePattern: String? = null
@Option(names = ["--extract-attributes-regex", "-e"], paramLabel = "REGEX",
description = ["Not yet implemented: extract attributes regex"])
@@ -49,14 +49,15 @@
var sBoundsFromMorpho: Boolean = false
@Option(names = ["--log", "-l"], paramLabel = "LEVEL",
- description = ["Not yet implemented: log level"])
- var logLevel: String = "warn"
+ description = ["Log level: one of SEVERE, WARNING, INFO, FINE, FINER, FINEST. Default: ${"$"}{DEFAULT-VALUE}"])
+ var logLevel: String = "WARNING"
@Option(names = ["--columns", "-c"], paramLabel = "NUMBER",
description = ["Not yet implemented: columns"])
var columns: Int = 10
- @Option(names = ["--word2vec", "-w"], description = ["Not yet implemented: word2vec"])
+ @Option(names = ["--word2vec", "-w"], description = ["Print text in LM training format: tokens " +
+ "separated by space, sentences separated by newline"])
var lmTrainingData: Boolean = false
@Option(names = ["--token-separator", "-s"], paramLabel = "SEPARATOR",
@@ -74,8 +75,16 @@
var extractMetadataRegex: MutableList<String> = mutableListOf()
override fun call(): Int {
+ LOGGER.level = try { // apply the --log option; Locale.ROOT keeps upper-casing locale-independent ("info" would become "İNFO" under a Turkish default locale)
+ Level.parse(logLevel.uppercase(Locale.ROOT))
+ } catch (e: IllegalArgumentException) { // thrown by Level.parse for an unknown level name
+ LOGGER.warning("Invalid log level: $logLevel. Defaulting to WARNING.")
+ Level.WARNING
+ }
+
LOGGER.info("Processing zip files: " + zipFileNames!!.joinToString(", "))
- korapxml2conllu(zipFileNames!!)// Your application logic here
+
+ korapxml2conllu(zipFileNames!!)
return 0
}
private val LOGGER: Logger = Logger.getLogger(KorapXml2Conllu::class.java.name)
@@ -88,7 +97,7 @@
val morpho: ConcurrentHashMap<String, MutableMap<String, MorphoSpan>> = ConcurrentHashMap()
val fnames: ConcurrentHashMap<String, String> = ConcurrentHashMap()
- if (args == null || args.isEmpty() || args[0] == null) {
+ if (args.isEmpty()) {
LOGGER.severe("Usage: KorapXml2Conllu <zipfile1> [<zipfile2> ...]")
return
}
@@ -162,7 +171,9 @@
doc.documentElement.normalize()
val docId: String = doc.documentElement.getAttribute("docid")
-
+ if (siglePattern != null && !docId.matches(Regex(siglePattern!!))) {
+ return@forEach
+ }
// LOGGER.info("Processing file: " + zipEntry.getName())
val fileName =
zipEntry.name.replace(Regex(".*?/([^/]+\\.xml)$"), "$1")
@@ -201,52 +212,64 @@
&& (!waitForMorpho || morpho[docId] != null)
) {
synchronized(System.out) {
- println("# foundry = $foundry")
- println("# filename = ${fname[docId]}")
- println("# text_id = $docId")
- printTokenOffsetsInSentence(
- sentences,
- docId,
- sentence_index,
- real_token_index,
- tokens
- )
- tokens[docId]?.forEach { span ->
- token_index++
- if (span.from >= sentences[docId]!![sentence_index].to) {
- println()
- sentence_index++
- token_index = 1
- printTokenOffsetsInSentence(
- sentences,
- docId,
- sentence_index,
- real_token_index,
- tokens
- )
+ if (lmTrainingData) {
+ tokens[docId]?.forEach { span ->
+ token_index++
+ if (span.from >= sentences[docId]!![sentence_index].to) {
+ println()
+ sentence_index++
+ }
+ print(texts[docId]!!.substring(span.from, span.to)+ " ")
+ real_token_index++
}
- if (waitForMorpho && morpho[docId]?.containsKey("${span.from}-${span.to}") == true) {
- val mfs = morpho[docId]!!["${span.from}-${span.to}"]
- printConlluToken(
- token_index,
- texts[docId]!!.substring(span.from, span.to),
- mfs!!.lemma!!,
- mfs.upos!!,
- mfs.xpos!!,
- mfs.feats!!,
- mfs.head!!,
- mfs.deprel!!,
- mfs.deps!!,
- mfs.misc!!
- )
- } else {
- printConlluToken(
- token_index, texts[docId]!!.substring(span.from, span.to)
- )
- }
- real_token_index++
+ } else {
+ println("# foundry = $foundry")
+ println("# filename = ${fname[docId]}")
+ println("# text_id = $docId")
+ printTokenOffsetsInSentence(
+ sentences,
+ docId,
+ sentence_index,
+ real_token_index,
+ tokens
+ )
+ tokens[docId]?.forEach { span ->
+ token_index++
+ if (span.from >= sentences[docId]!![sentence_index].to) {
+ println()
+ sentence_index++
+ token_index = 1
+ printTokenOffsetsInSentence(
+ sentences,
+ docId,
+ sentence_index,
+ real_token_index,
+ tokens
+ )
+ }
+ if (waitForMorpho && morpho[docId]?.containsKey("${span.from}-${span.to}") == true) {
+ val mfs = morpho[docId]!!["${span.from}-${span.to}"]
+ printConlluToken(
+ token_index,
+ texts[docId]!!.substring(span.from, span.to),
+ mfs!!.lemma!!,
+ mfs.upos!!,
+ mfs.xpos!!,
+ mfs.feats!!,
+ mfs.head!!,
+ mfs.deprel!!,
+ mfs.deps!!,
+ mfs.misc!!
+ )
+ } else {
+ printConlluToken(
+ token_index, texts[docId]!!.substring(span.from, span.to)
+ )
+ }
+ real_token_index++
- }
+ }
+ }
arrayOf(tokens, texts, sentences, morpho).forEach { map ->
map.remove(docId)
}
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
index 9ec3d5b..3efbb6a 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXml2ConlluTest.kt
@@ -8,6 +8,7 @@
import kotlin.test.Test
import kotlin.test.assertContains
import org.junit.Ignore
+import kotlin.test.assertFalse
class KorapXml2ConlluTest {
private val outContent = ByteArrayOutputStream(10000000)
@@ -100,6 +101,38 @@
)
}
+ @Test
+ fun respectsSiglePattern() {
+ // Run with -p ".*7": only documents whose docid matches the regex should be printed.
+ debug(arrayOf("-p", ".*7", loadResource("wdf19.zip").path))
+ val printed = outContent.toString()
+ // A sigle ending in 7 matches the pattern and must be present ...
+ assertContains(printed, "# text_id = WDF19_A0000.14247")
+ // ... while a sigle that does not match must not occur anywhere in the output.
+ assertFalse { printed.contains("WDF19_A0000.13865") }
+ }
+
+ @Test
+ fun w2vOptionWorks() {
+ // Run with -w: tokens separated by a space, sentences separated by a newline.
+ debug(arrayOf("-w", loadResource("wdf19.zip").path))
+ val printed = outContent.toString()
+ // One complete sentence line, including the trailing space written after its last token.
+ assertContains(printed, "\nje ne suis pas du tout d'accord ! \n")
+ // The "# text_id" comment header is not written in this mode, so the sigle must be absent.
+ assertFalse { printed.contains("WDF19_A0000.13865") }
+ }
+
+ @Test
+ fun canSetLogLevel() {
+ // Run with -l info; the level name is passed in lower case and is
+ // upper-cased by call() before it is handed to Level.parse.
+ debug(arrayOf("-l", "info", loadResource("wdf19.zip").path))
+ val logged = errContent.toString()
+ // The INFO-level start-up message must show up in the captured error stream.
+ assertContains(logged, "Processing zip file")
+ }
+
@Ignore("for some reason not working")
fun canConvertMorphoFeatureAnnotations() {
val args = arrayOf(goe, goeMarmot)