Add lemma option to print lemmas in w2v/now exports
Change-Id: If6f99edb6840088b12507010534a75be59d24a20
diff --git a/Readme.md b/Readme.md
index 2dfcda4..2a3d6d4 100644
--- a/Readme.md
+++ b/Readme.md
@@ -68,26 +68,40 @@
java -jar korapxmltool.jar -f now /vol/corpora/DeReKo/current/KorAP/zip/*24.zip | pv > dach24.txt
```
+### Using lemmas instead of surface forms in word2vec / NOW output
+
+If lemma annotations (morpho layer) are present alongside the base tokens, you can output lemmas instead of surface tokens with `--lemma`.
+
+```shell script
+# Word2Vec style output with lemmas where available
+java -jar ./app/build/libs/korapxmltool.jar --lemma -f w2v app/src/test/resources/goe.tree_tagger.zip | head -3
+
+# NOW corpus style output with lemmas
+java -jar ./app/build/libs/korapxmltool.jar --lemma -f now app/src/test/resources/goe.tree_tagger.zip | head -1
+```
+
+If a token has no lemma annotation, or its lemma is `_`, the surface form is used as a fallback.
+
## Annotation
### Tagging with integrated MarMoT POS tagger directly to a new KorAP-XML ZIP file
-You need to download the pre-trained MarMoT models from the [here](http://cistern.cis.lmu.de/marmot/models/CURRENT/).
+You need to download the pre-trained MarMoT models from the [MarMoT models repository](http://cistern.cis.lmu.de/marmot/models/CURRENT/).
```shell script
-$ java -jar ./app/build/libs/korapxmltool.jar -f zip -t marmot:models/de.marmot app/src/test/resources/goe.zip
+java -jar ./app/build/libs/korapxmltool.jar -f zip -t marmot:models/de.marmot app/src/test/resources/goe.zip
```
### Tagging with integrated OpenNLP POS tagger directly to a new KorAP-XML ZIP file
-You need to download the pre-trained OpenNLP models from [here](https://opennlp.apache.org/models.html#part_of_speech_tagging) or older models from [here](http://opennlp.sourceforge.net/models-1.5/).
+You need to download the pre-trained OpenNLP models from the [OpenNLP model download page](https://opennlp.apache.org/models.html#part_of_speech_tagging) or older models from the [legacy OpenNLP models archive](http://opennlp.sourceforge.net/models-1.5/).
```shell script
java -jar ./app/build/libs/korapxmltool.jar -f zip -t opennlp:/usr/local/kl/korap/Ingestion/lib/models/opennlp/de-pos-maxent.bin /tmp/zca24.zip
```
### Tag and lemmatize with TreeTagger
-This requires the [TreeTagger Docker Image with CoNLL-U Support](https://gitlab.ids-mannheim.de/KorAP/CoNLL-U-Treetagger).
+This requires the [TreeTagger Docker Image with CoNLL-U Support](https://gitlab.ids-mannheim.de/KorAP/CoNLL-U-Treetagger).
Language models are downloaded automatically.
```shell script
@@ -106,7 +120,7 @@
### Using the integrated Maltparser
-You need to download the pre-trained MaltParser models from the [here](http://www.maltparser.org/mco/mco.html).
+You need to download the pre-trained MaltParser models from the [MaltParser model repository](http://www.maltparser.org/mco/mco.html).
Note that parsers take POS tagged input.
```shell script
diff --git a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
index 100d78d..4e592cc 100644
--- a/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
+++ b/app/src/main/kotlin/de/ids_mannheim/korapxmltools/KorapXmlTool.kt
@@ -175,6 +175,12 @@
)
var overwrite: Boolean = false
+ @Option(
+ names = ["--lemma"],
+ description = ["In word2vec/now output modes, output lemmas instead of surface tokens when lemma annotations are available (requires corresponding morpho annotation XML)"]
+ )
+ var useLemma: Boolean = false
+
private var taggerName: String? = null
private var taggerModel: String? = null
@Option(
@@ -859,7 +865,17 @@
}
sentence_index++
}
- output.append(texts[docId]!!.substring(span.from, span.to), " ")
+ if (useLemma && morpho[docId] != null) {
+ val key = "${span.from}-${span.to}"
+ val lemmaVal = morpho[docId]!![key]?.lemma
+ if (lemmaVal != null && lemmaVal != "_") {
+ output.append(lemmaVal, " ")
+ } else {
+ output.append(texts[docId]!!.substring(span.from, span.to), " ")
+ }
+ } else {
+ output.append(texts[docId]!!.substring(span.from, span.to), " ")
+ }
real_token_index++
}
if (output.isNotEmpty()) {
@@ -890,7 +906,17 @@
}
sentence_index++
}
- output.append(texts[docId]!!.substring(span.from, span.to), " ")
+ if (useLemma && morpho[docId] != null) {
+ val key = "${span.from}-${span.to}"
+ val lemmaVal = morpho[docId]!![key]?.lemma
+ if (lemmaVal != null && lemmaVal != "_") {
+ output.append(lemmaVal, " ")
+ } else {
+ output.append(texts[docId]!!.substring(span.from, span.to), " ")
+ }
+ } else {
+ output.append(texts[docId]!!.substring(span.from, span.to), " ")
+ }
real_token_index++
}
diff --git a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
index c9d41d6..55eaf25 100644
--- a/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
+++ b/app/src/test/kotlin/de/ids_mannheim/korapxmltools/KorapXmlToolTest.kt
@@ -295,4 +295,23 @@
assert(File(resultFile).exists())
assert(File(resultFile).length() > 0)
}
+
+ @Test
+ fun canWord2VecLemma() {
+ val args = arrayOf("--lemma", "-f", "w2v", loadResource("goe.tree_tagger.zip").path)
+ debug(args)
+ val out = outContent.toString()
+ // With --lemma, the w2v output must contain the lemma sequence "mein Ankunft"; the surface text presumably has an inflected form at this position.
+ assertContains(out, " mein Ankunft ")
+ }
+
+ @Test
+ fun canNowLemma() {
+ val args = arrayOf("--lemma", "-f", "now", loadResource("goe.tree_tagger.zip").path)
+ debug(args)
+ val out = outContent.toString()
+ assertContains(out, "@@") // NOTE(review): looks like the NOW-format text marker — confirm
+ assertContains(out, " <p> ") // NOTE(review): looks like the NOW-format paragraph separator — confirm
+ assertContains(out, " mein Ankunft ") // with --lemma, lemmas replace surface forms, as in the w2v lemma test
+ }
}