Add groovy script for pseudonymizing ngram lists

Change-Id: Ic141f32d683d8672136d0612e9fd639069e40413
diff --git a/scripts/pseudonymize.groovy b/scripts/pseudonymize.groovy
new file mode 100755
index 0000000..70447fd
--- /dev/null
+++ b/scripts/pseudonymize.groovy
@@ -0,0 +1,95 @@
+#!/bin/env groovy
+@Grab('info.picocli:picocli-groovy:4.6.3')
+@Grab('org.tukaani:xz:1.9') // should be imported even if not used directly
+@Grab('org.codehaus.gpars:gpars:1.2.1')
+@Grab('org.apache.commons:commons-compress:1.22')
+import groovy.cli.Option
+import groovy.cli.Unparsed
+import groovy.cli.commons.CliBuilder
+import groovyx.gpars.GParsPool
+import org.apache.commons.compress.compressors.CompressorStreamFactory
+
+
+import java.util.concurrent.ConcurrentHashMap
+import java.util.logging.Logger
+
+def tag = "pseudonymize" // program name; interpolated into the CliBuilder usage line further down
+
+interface pseudonymizeArgs { // Command-line option spec; passed to CliBuilder.parseFromSpec by the main script.
+    @Option(shortName = 'k')
+    String[] keys() // key files; each one is loaded into its own lookup map via readKey
+    // Destination directory: every result is written to destPath + "/" + <input file name>.
+    @Option(shortName = 'd')
+    String destPath()
+    // When true, the script prints its usage text and exits.
+    @Option(shortName = 'h', defaultValue = '0')
+    boolean help()
+    // Remaining non-option arguments: the input files to pseudonymize.
+    @Unparsed
+    List files()
+}
+
+def compressorOutputStream(fname) { // Opens fname for writing and returns a PrintStream, compressed according to the file extension.
+    // Extension -> commons-compress type. "bz2" is the usual bzip2 suffix and was missing; "bzip" is kept for compatibility.
+    def compressorTypes = ["gz": CompressorStreamFactory.GZIP, "xz": CompressorStreamFactory.XZ, "bzip": CompressorStreamFactory.BZIP2, "bz2": CompressorStreamFactory.BZIP2]
+    def type = compressorTypes[fname.substring(fname.lastIndexOf(".") + 1)]
+    OutputStream sink = new BufferedOutputStream(new FileOutputStream(fname))
+    if (type) // known extension: add the compressor layer; any other name is written as plain text
+        sink = new BufferedOutputStream(new CompressorStreamFactory().createCompressorOutputStream(type, sink))
+    return new PrintStream(sink)
+}
+
+static def readKey(fname, log) { // Loads a compressed tab-separated key file into a map: first column -> Integer parsed from the last column.
+    log.info("Reading key ${fname} ...")
+    def keyStream = new CompressorStreamFactory().createCompressorInputStream(new BufferedInputStream(new FileInputStream(fname))) // type auto-detected
+    def pseudonyms = new ConcurrentHashMap<String, Integer>(100000000, 0.75, 1) // initial capacity, load factor, concurrency level
+    keyStream.splitEachLine("\t") { cols -> pseudonyms.put(cols[0], Integer.valueOf(cols[-1])) }
+    log.info("Done reading key ${fname}.")
+    return pseudonyms
+}
+
+CliBuilder cli = new CliBuilder(usage: "${tag} [options] file [files]")
+def options = cli.parseFromSpec(pseudonymizeArgs, args)
+
+// Without -d the output path would become "null/<file>", so a missing destination is a usage error as well.
+if (options.help() || !options.files() || options.files()[0].startsWith('-') || !options.destPath()) {
+    cli.usage()
+    System.exit(-1)
+}
+
+System.setProperty("java.util.logging.SimpleFormatter.format", '[%1$tF %1$tT]:%4$s: %5$s%n')
+Logger log = Logger.getLogger("")
+def keyFiles = options.keys() ?: [] // tolerate a run without any -k option instead of failing on null
+log.info("Pseudonymizing ${keyFiles.size()} columns")
+def keyMaps = keyFiles.collect { fname -> readKey(fname, log) } // loaded one after another; only read from here on
+
+GParsPool.withPool {
+    options.files().eachParallel(fname -> { // input files are distributed over the GPars pool
+        def outName = options.destPath() + "/" + new File(fname).getName()
+        if (new File(outName).exists()) {
+            log.warning("${outName} already exists - skipping")
+        } else {
+            log.info("Pseudonymizing ${fname} to ${outName}")
+            // Open the input first so that an unreadable input does not leave an empty output file behind.
+            def input = new CompressorStreamFactory().createCompressorInputStream(new BufferedInputStream(new FileInputStream(fname)))
+            def output_stream = compressorOutputStream(outName)
+            try {
+                input.splitEachLine("\t") { fields ->
+                    // Counted loop on purpose: the Groovy range 0..(size - 2) runs backwards (0, -1) on a
+                    // single-column line and used to emit bogus columns. The last column is never translated.
+                    for (int i = 0; i < fields.size() - 1; i++) {
+                        // Column i goes through key map i % 3 if that many maps were loaded; otherwise it is copied.
+                        def val = (i % 3 < keyMaps.size()) ? keyMaps[i % 3][fields[i]] : fields[i]
+                        if (val == null) // failed dictionary lookup; "null" is still written, as before
+                            log.severe("`${fields[i]}' not found in dictionary")
+                        output_stream.print("${val}\t")
+                    }
+                    output_stream.println(fields[fields.size() - 1])
+                }
+            } finally {
+                output_stream.close() // flush and release the file even if a line fails to process
+            }
+            log.info("Done pseudonymizing ${fname} to ${outName}")
+        }
+    })
+}
\ No newline at end of file