Make Groovy scripts available as executables
Change-Id: I2d3316769de8453e34656c32d66574c3f6085804
diff --git a/Readme.md b/Readme.md
index 10e45c5..dc4d6e3 100644
--- a/Readme.md
+++ b/Readme.md
@@ -1,6 +1,6 @@
# totalngrams
-Package for effectively processing frequency lists from very large corpora in tab separated value format,
+Package for effectively processing token lists from very large corpora in tab separated value format,
by making full use of multicore-processors.
An older version of `totalngrams` was used for Koplenig et al. (2022).
@@ -42,7 +42,26 @@
-V, --version Print version information and exit.
```
+#### Example usage
+```bash
+FOLDS=16
+BASE="."
+for l in "-l"; do # "-l"
+ for n in $(seq 1 2 3); do
+ for f in $(seq 1 $FOLDS); do
+ totalngrams \
+ --pad \
+ -P 79 \
+ -n $n \
+ -f $f \
+ -F $FOLDS \
+ $l -o "$BASE/paddedlemmaposfreq/$n-gram-token$l-freqs.$f.tsv.xz" $BASE/conllu/*.conllu.gz
+ done
+ done
+done
+
+```
## Scripts
The package also contains some groovy scripts for handling *pseudonymization* tasks, i.e. replacing
@@ -58,12 +77,10 @@
./src/main/groovy/org/ids_mannheim/GeneratePseudonymKey.groovy -h
```
-```bash
-java -Dgroovy.grape.enable=false -cp target/totalngrams-2.1.0.jar\
- org.ids_mannheim.GeneratePseudonymKey -c 0 1-gram-token-l-freqs.*.tsv.xz | xz -T0 > token_key.tsv.xz
+or:
-java -Dgroovy.grape.enable=false -cp target/totalngrams-2.1.0.jar\
- org.ids_mannheim.GeneratePseudonymKey -c 1 1-gram-token-l-freqs.*.tsv.xz
+```bash
+generate_pseudonym_key -c 0 1-gram-token-l-freqs.*.tsv.xz | xz -T0 > token_key.tsv.xz
```
### Pseudonymize
@@ -71,7 +88,7 @@
#### Example usage
```
-java -Dgroovy.grape.enable=false -cp totalngrams-2.1.0.jar org.ids_mannheim.Pseudonymize
+pseudonymize -d /tmp -k token_key.tsv.xz -k lemma_key.tsv.xz *-gram-token-l-freqs.*.tsv.xz
```
### FilterKeys
@@ -79,21 +96,22 @@
#### Example usage
```
-java -Xmx160000m -Dgroovy.grape.enable=false -cp totalngrams-2.1.0.jar org.ids_mannheim.FilterKeys\
- -k token_keys.tsv.xz -k lemma_keys.tsv.xz 1-gram-token-l-freqs.*.tsv.xz
+filter_keys -k token_key.tsv.xz -k lemma_key.tsv.xz 1-gram-token-l-freqs.*.tsv.xz
```
# Installation
### Prerequisites
-* Java Development Kit (JDK) >= 18
+* [Java Development Kit (JDK) >= 18](https://www.oracle.com/de/java/technologies/downloads/)
* [Apache Maven](https://maven.apache.org/)
+### Install
```bash
git clone "https://korap.ids-mannheim.de/gerrit/IDS-Mannheim/totalngrams"
cd totalngrams
mvn install
+export PATH=`pwd`/appassembler/bin:$PATH
```
# References
diff --git a/pom.xml b/pom.xml
index 5f0531a..6a16bc1 100644
--- a/pom.xml
+++ b/pom.xml
@@ -159,7 +159,7 @@
</executions>
<configuration>
<binFolder>bin</binFolder>
- <extraJvmArguments>-Xmx512000m</extraJvmArguments>
+ <extraJvmArguments>-XX:MaxRAMPercentage=85 -Dgroovy.grape.enable=false</extraJvmArguments>
<binFileExtensions>
<unix/>
</binFileExtensions>
@@ -168,6 +168,18 @@
<mainClass>org.ids_mannheim.TotalNGrams</mainClass>
<id>totalngrams</id>
</program>
+ <program>
+ <mainClass>org.ids_mannheim.GeneratePseudonymKey</mainClass>
+ <id>generate_pseudonym_key</id>
+ </program>
+ <program>
+ <mainClass>org.ids_mannheim.Pseudonymize</mainClass>
+ <id>pseudonymize</id>
+ </program>
+ <program>
+ <mainClass>org.ids_mannheim.FilterKeys</mainClass>
+ <id>filter_keys</id>
+ </program>
</programs>
</configuration>
</plugin>
diff --git a/src/main/groovy/org/ids_mannheim/FilterKeys.groovy b/src/main/groovy/org/ids_mannheim/FilterKeys.groovy
index d561b02..1852f2d 100755
--- a/src/main/groovy/org/ids_mannheim/FilterKeys.groovy
+++ b/src/main/groovy/org/ids_mannheim/FilterKeys.groovy
@@ -22,11 +22,11 @@
class FilterKeys {
- static tag = "FilterKeys"
+ static tag = (System.getProperty("app.name") ? System.getProperty("app.name") : new File(this.getProtectionDomain().getCodeSource().getLocation().toURI().getPath()).getName())
static final int maxSpecialKeys = 1000
static interface pseudonymizeArgs {
- @Option(shortName = 'k', description = "pseudonymization key, use multiple times for multiple keys")
+ @Option(shortName = 'k', description = "pseudonymization key(s), use multiple times for multiple keys\n(e.g. `-k tokenkey.tsv -k lemmakey.tsv`)")
String[] keys()
@Option(shortName = 'h', description = "print this help message")
diff --git a/src/main/groovy/org/ids_mannheim/GeneratePseudonymKey.groovy b/src/main/groovy/org/ids_mannheim/GeneratePseudonymKey.groovy
index 3a0d415..f691511 100755
--- a/src/main/groovy/org/ids_mannheim/GeneratePseudonymKey.groovy
+++ b/src/main/groovy/org/ids_mannheim/GeneratePseudonymKey.groovy
@@ -20,7 +20,7 @@
import java.util.logging.Logger
class GeneratePseudonymKey {
- static tag = "GeneratePseudonymKey"
+ static tag = (System.getProperty("app.name") ? System.getProperty("app.name") : new File(this.getProtectionDomain().getCodeSource().getLocation().toURI().getPath()).getName())
static interface GeneratePseudonymKeyArgs {
@Option(shortName = 'c', defaultValue = "0", description = 'generate pseudonyms for column n')
@@ -32,7 +32,7 @@
@Option(shortName = 'h')
boolean help()
- @Unparsed(description = "tsv formatted frequency lists to be pseudonymized")
+ @Unparsed(description = "tsv-formatted frequency lists, on the basis of which pseudonym keys are calculated")
String[] files()
}
diff --git a/src/main/groovy/org/ids_mannheim/Pseudonymize.groovy b/src/main/groovy/org/ids_mannheim/Pseudonymize.groovy
index f1c1196..b7aa909 100755
--- a/src/main/groovy/org/ids_mannheim/Pseudonymize.groovy
+++ b/src/main/groovy/org/ids_mannheim/Pseudonymize.groovy
@@ -21,20 +21,19 @@
import java.util.logging.Logger
class Pseudonymize {
-
- static tag = "Pseudonymize"
+ static tag = (System.getProperty("app.name") ? System.getProperty("app.name") : new File(this.getProtectionDomain().getCodeSource().getLocation().toURI().getPath()).getName())
static interface pseudonymizeArgs {
- @Option(shortName = 'k')
+ @Option(shortName = 'k', description = "pseudonymization key(s), use multiple times for multiple keys\n(e.g. `-k tokenkey.tsv -k lemmakey.tsv`)")
String[] keys()
- @Option(shortName = 'd', defaultValue = "./")
+ @Option(shortName = 'd', defaultValue = ".", description = "where to write the results")
String destPath()
@Option(shortName = 'h')
boolean help()
- @Unparsed()
+ @Unparsed(description = "tsv-formatted frequency lists to be pseudonymized")
List files()
}