Add FilterKeys script and Readme

Change-Id: I69dbee2fbf7cce2afbcbc4deced635e62886a897
diff --git a/Readme.md b/Readme.md
new file mode 100644
index 0000000..10e45c5
--- /dev/null
+++ b/Readme.md
@@ -0,0 +1,101 @@
+# totalngrams
+
+Package for effectively processing frequency lists from very large corpora in tab-separated value format,
+by making full use of multicore processors.
+
+An older version of `totalngrams` was used for Koplenig et al. (2022).
+
+## Synopsis
+
+```plain
+totalngrams [-dhlNSV] [--force] [--pad] [-f=<fold>] [-F=<FOLDS>]
+                   [-L=<logFileName>] [-n=<ngram_size>] [-o=<output_fillename>]
+                   [-p=<worker_pool_specification>] [-P=<max_threads>]
+                   <inputFiles>...
+sum ngram counts from KorAP-XML, CoNLL-U files and frequency lists
+      <inputFiles>...    input files
+  -d, --downcase         Convert all token characters into lower case (default:
+                           false)
+  -f, --fold=<fold>      current fold (default: 1)
+  -F, --folds=<FOLDS>    number of random folds (default: 1)
+      --force            Force overwrite (default: false)
+  -h, --help             Show this help message and exit.
+  -l, --with-lemma-pos   Use also lemma and part-of-speech annotations
+                           (default: false
+  -L, --log-file=<logFileName>
+                         log file name (default: totalngrams.log)
+  -n, --ngram-size=<ngram_size>
+                         n-gram size (default: 1)
+  -N, --numeric-secondary-sort
+                         Sort entries with same frequency numerically
+                           (default: false)
+  -o, --output-file=<output_fillename>
+                         Output file (default: -)
+  -p, --worker-pool=<worker_pool_specification>
+                         Run preprocessing on extern hosts, e.g. '10*local,
+                           5*host1,3*smith@host2' (default: )
+  -P, --max-procs=<max_threads>
+                         Run up to max-procs processes at a time (default: 6)
+      --pad              Add padding «START» and «END» symbols at text edges
+                           (default: false)
+  -S, --sort             Toggle output sorting (default: true)
+  -V, --version          Print version information and exit.
+
+```
+
+## Scripts
+
+The package also contains some groovy scripts for handling *pseudonymization* tasks, i.e. replacing
+each token or lemma with a corresponding number according to separate key files.
+
+You can run the Groovy scripts directly if you have Groovy installed, or otherwise from the totalngrams jar.
+
+### GeneratePseudonymKey
+
+#### Example usage
+
+```bash
+./src/main/groovy/org/ids_mannheim/GeneratePseudonymKey.groovy -h
+```
+
+```bash
+java -Dgroovy.grape.enable=false -cp target/totalngrams-2.1.0.jar\
+ org.ids_mannheim.GeneratePseudonymKey -c 0 1-gram-token-l-freqs.*.tsv.xz | xz -T0 > token_key.tsv.xz
+
+java -Dgroovy.grape.enable=false -cp target/totalngrams-2.1.0.jar\
+ org.ids_mannheim.GeneratePseudonymKey -c 1 1-gram-token-l-freqs.*.tsv.xz | xz -T0 > lemma_key.tsv.xz
+```
+
+### Pseudonymize
+
+#### Example usage
+
+```
+java -Dgroovy.grape.enable=false -cp totalngrams-2.1.0.jar org.ids_mannheim.Pseudonymize
+```
+
+### FilterKeys
+
+#### Example usage
+
+```
+java -Xmx160000m -Dgroovy.grape.enable=false -cp totalngrams-2.1.0.jar org.ids_mannheim.FilterKeys\
+ -k token_key.tsv.xz -k lemma_key.tsv.xz 1-gram-token-l-freqs.*.tsv.xz
+```
+
+# Installation
+
+### Prerequisites
+
+* Java Development Kit (JDK) >= 18
+* [Apache Maven](https://maven.apache.org/)
+
+```bash
+git clone "https://korap.ids-mannheim.de/gerrit/IDS-Mannheim/totalngrams"
+cd totalngrams
+mvn install
+```
+
+# References
+
+* Koplenig, Alexander/Kupietz, Marc/Wolfer, Sascha (2022): [Testing the relationship between word length, frequency, and predictability based on the German Reference Corpus](http://dx.doi.org/10.1111/cogs.13090). Cognitive Science 46(6)
diff --git a/src/main/groovy/org/ids_mannheim/FilterKeys.groovy b/src/main/groovy/org/ids_mannheim/FilterKeys.groovy
new file mode 100755
index 0000000..d561b02
--- /dev/null
+++ b/src/main/groovy/org/ids_mannheim/FilterKeys.groovy
@@ -0,0 +1,194 @@
+#!/bin/env groovy
+
+package org.ids_mannheim
+
+import groovy.cli.Option
+import groovy.cli.Unparsed
+import groovy.cli.picocli.CliBuilder
+@GrabConfig(systemClassLoader = true)
+@Grab('org.tukaani:xz:1.9')
+// should be imported even if not used directly
+@Grab('org.codehaus.gpars:gpars:1.2.1')
+@Grab('org.apache.commons:commons-compress:1.22')
+@Grab('info.picocli:picocli:4.6.3')
+
+import groovyx.gpars.GParsPool
+
+// @GrabConfig(systemProperties = "groovy.grape.enable=false")
+
+import org.apache.commons.compress.compressors.CompressorStreamFactory
+
+import java.util.logging.Logger
+
+/**
+ * Command line tool that reduces pseudonymization keys to the entries that are actually used:
+ * for every pseudonymized frequency list it writes a filtered copy of each key, containing only
+ * the pseudonyms that occur in that list.
+ */
+class FilterKeys {
+
+    static tag = "FilterKeys"
+
+    // Offset added to a pseudonym to obtain its list index, so that the negative pseudonyms
+    // of special keys still map to non-negative indexes.
+    static final int maxSpecialKeys = 1000
+
+    /** Command line options, evaluated via CliBuilder.parseFromSpec in main. */
+    static interface pseudonymizeArgs {
+        @Option(shortName = 'k', description = "pseudonymization key, use multiple times for multiple keys")
+        String[] keys()
+
+        @Option(shortName = 'h', description = "print this help message")
+        boolean help()
+
+        @Unparsed(description = "pseudonymized frequency lists to compute essential keys for")
+        List files()
+    }
+
+    /**
+     * Opens a file for writing, compressing the output according to the file name extension
+     * (gz, xz, bzip or bz2). Any other extension yields an uncompressed stream.
+     *
+     * @param fname name of the file to create
+     * @return a buffered PrintStream writing to fname
+     */
+    static def compressorOutputStream(fname) {
+        def compresserTypes = ["gz": CompressorStreamFactory.GZIP, "xz": CompressorStreamFactory.XZ, "bzip": CompressorStreamFactory.BZIP2, "bz2": CompressorStreamFactory.BZIP2]
+        String extension = fname.substring(fname.lastIndexOf(".") + 1)
+        def type = compresserTypes[extension]
+        if (type) return new PrintStream(new BufferedOutputStream(new CompressorStreamFactory().createCompressorOutputStream(type, new BufferedOutputStream(new FileOutputStream(fname))))) else return new PrintStream(new BufferedOutputStream(new FileOutputStream(fname)))
+    }
+
+    /**
+     * Reads a compressed, tab separated key file into a list in which the first field of each
+     * line is stored at index (pseudonym + maxSpecialKeys); the pseudonym is the line's last field.
+     *
+     * @param fname name of the key file
+     * @param log logger for progress messages
+     * @return the list of keys, indexed by shifted pseudonym
+     */
+    static def readKey(fname, log) {
+        def myArray = new ArrayList<String>()
+        log.info("Reading key ${fname} ...")
+        def input = new CompressorStreamFactory().createCompressorInputStream(new BufferedInputStream(new FileInputStream(fname)))
+        input.splitEachLine("\t") { fields -> myArray[Integer.valueOf(fields[fields.size() - 1]) + maxSpecialKeys] = fields[0] }
+        log.info("Done reading key ${fname}.")
+        return myArray
+    }
+
+    /**
+     * Parses the command line and runs the filter; prints the usage message and exits with
+     * status -1 if help was requested or if no key or no input file was given.
+     */
+    static void main(String[] args) {
+        System.setProperty("java.util.logging.SimpleFormatter.format", '[%1$tF %1$tT]:%4$s: %5$s%n')
+        Logger log = Logger.getLogger("")
+
+        CliBuilder cli = new CliBuilder(name: tag, width: 100)
+
+        def options = cli.parseFromSpec(pseudonymizeArgs, args)
+
+        // Filtering without any key is pointless, so a missing -k is treated as a usage error.
+        if (options.help() || !options.keys() || !options.files() || options.files()[0].startsWith('-')) {
+            cli.usage()
+            System.exit(-1)
+        }
+
+        filterKeys(options.files(), options.keys(), log)
+    }
+
+    /**
+     * Replaces the last comma in a with replaceWith.
+     * NOTE(review): despite the method name this looks for ',' rather than '.' - confirm intent.
+     *
+     * @param a string containing at least one comma
+     * @param replaceWith replacement for the last comma
+     * @return a with its last comma replaced
+     */
+    static String replaceDot(a, replaceWith) {
+        return a.substring(0, a.lastIndexOf(",")) + replaceWith + a.substring(a.lastIndexOf(",") + 1)
+    }
+
+    /**
+     * Derives the fold specific output name for a key file by inserting the fold number of
+     * fname after the first dot of keyname, e.g. ("freqs.7.tsv.xz", "token_key.tsv.xz") yields
+     * "token_key.7.tsv.xz". Logs an error and exits with status -1 if fname has no fold number.
+     *
+     * @param fname frequency list file name containing a fold number (digits followed by a dot)
+     * @param keyname name of the key file
+     * @param log logger for the error message
+     * @return keyname with the fold number inserted after its first dot
+     */
+    static String foldKeyName(fname, keyname, Logger log) {
+        // replaceAll returns its input unchanged when the pattern does not match, so the match
+        // itself has to be tested in order to detect a missing fold number.
+        def foldMatcher = (fname =~ /.*[^0-9]([0-9]+)\..*$/)
+        if (!foldMatcher.matches()) {
+            log.severe("File ${fname} contains no fold number")
+            System.exit(-1)
+        }
+        return (keyname.replaceFirst(/\./, "." + foldMatcher.group(1) + '.'))
+    }
+
+    /**
+     * Writes, for each frequency list, one filtered file per key that contains only the key
+     * entries whose pseudonyms occur in that list. The lists are processed in parallel; a list
+     * is skipped if any of its output files already exists.
+     *
+     * @param files names of the compressed, tab separated pseudonymized frequency lists
+     * @param keys names of the key files; column i of a list is looked up in key (i % 3)
+     * @param log logger for progress messages and warnings
+     */
+    static void filterKeys(List<String> files, keys, Logger log) {
+        log.info("Filtering ${keys.size()} keys")
+        def keyArrays = []
+
+        keys.each { fname -> keyArrays << readKey(fname, log) }
+
+        GParsPool.withPool {
+            files.eachParallel(fname -> {
+                // One flag list per key; index (pseudonym + maxSpecialKeys) is set once seen.
+                def myKeys = []
+                keys.each { myKeys << new ArrayList<Boolean>() }
+
+                def outNames = keys.collect { foldKeyName(fname, it, log) }
+                if (outNames.any { new File(it).exists() }) {
+                    log.warning("One of ${outNames} already exists - skipping fold")
+                } else {
+                    log.info("Filtering ${fname} to ${outNames}")
+                    def output_streams = outNames.collect { compressorOutputStream(it) }
+                    def input = new CompressorStreamFactory().createCompressorInputStream(new BufferedInputStream(new FileInputStream(fname)))
+
+                    input.splitEachLine("\t") { fields ->
+                        if (fields.size() >= keyArrays.size()) {
+                            // The last field is skipped (presumably the frequency). A counting loop is
+                            // used because the range 0..-1 of a single-field line would run backwards.
+                            for (int i = 0; i < fields.size() - 1; i++) {
+                                if (i % 3 < keyArrays.size()) {
+                                    myKeys[i % 3][Integer.valueOf(fields[i]) + maxSpecialKeys] = true
+                                }
+                            }
+                        }
+                    }
+                    log.info("Done filtering ${fname}")
+                    GParsPool.withPool {
+                        output_streams.eachWithIndexParallel { PrintStream outStream, int i ->
+                            log.info("Printing ${outNames[i]}")
+                            try {
+                                for (int k = 0; k < myKeys[i].size(); k++) {
+                                    // The key text is printed as is; only the index is shifted back.
+                                    if (myKeys[i][k]) outStream.println("${keyArrays[i][k]}\t${k - maxSpecialKeys}")
+                                }
+                            } finally {
+                                // Close even if printing fails so that the file handle is released.
+                                outStream.close()
+                            }
+                            log.info("Done printing ${outNames[i]}")
+                        }
+                    }
+                }
+            })
+        }
+    }
+
+}
diff --git a/src/main/groovy/org/ids_mannheim/GeneratePseudonymKey.groovy b/src/main/groovy/org/ids_mannheim/GeneratePseudonymKey.groovy
index 4c5397c..3a0d415 100755
--- a/src/main/groovy/org/ids_mannheim/GeneratePseudonymKey.groovy
+++ b/src/main/groovy/org/ids_mannheim/GeneratePseudonymKey.groovy
@@ -26,20 +26,21 @@
         @Option(shortName = 'c', defaultValue = "0", description = 'generate pseudonyms for column n')
         int column()
 
-        @Option(shortName = 's', defaultValue = "«END»,«START»", description = "comma separated special keys (will get pseudonyms -n..-1)")
+        @Option(shortName = 's', defaultValue = "«END»,«START»", description = "comma separated special keys (default: \"«END»,«START»\"), which will get pseudonyms -n..-1")
         String specialKeys()
 
         @Option(shortName = 'h')
         boolean help()
 
-        @Unparsed
+        @Unparsed(description = "tsv formatted frequency lists to be pseudonymized")
         String[] files()
     }
 
     static void main(String[] args) {
-        CliBuilder cli = new CliBuilder(name: "${tag}", width: 120, footer: "Examples:\n" +
-                "generate_pseudonym_key.groovy -c 0 1-gram-token-l-freqs.*.tsv.xz | xz -T0 > token_key.tsv.xz\n" +
-                "generate_pseudonym_key.groovy -c 1 1-gram-token-l-freqs.*.tsv.xz | xz -T0 > lemma_key.tsv.xz\n")
+        CliBuilder cli = new CliBuilder(name: "${tag}", width: 120, footer: "\nOutput is written to stdout.\n\n" +
+                "Examples:\n" +
+                "${tag} -c 0 1-gram-token-l-freqs.*.tsv.xz | xz -T0 > token_key.tsv.xz\n" +
+                "${tag} -c 1 1-gram-token-l-freqs.*.tsv.xz | xz -T0 > lemma_key.tsv.xz\n")
 
         def options = cli.parseFromSpec(GeneratePseudonymKeyArgs, args)
 
diff --git a/src/test/resources/pseudo_token_lemma_pos.1.tsv.xz b/src/test/resources/pseudo_token_lemma_pos.1.tsv.xz
new file mode 100644
index 0000000..adc8bec
--- /dev/null
+++ b/src/test/resources/pseudo_token_lemma_pos.1.tsv.xz
Binary files differ