Add Groovy script for pseudonymizing n-gram lists
Change-Id: Ic141f32d683d8672136d0612e9fd639069e40413
diff --git a/scripts/pseudonymize.groovy b/scripts/pseudonymize.groovy
new file mode 100755
index 0000000..70447fd
--- /dev/null
+++ b/scripts/pseudonymize.groovy
@@ -0,0 +1,95 @@
+#!/usr/bin/env groovy
+@Grab('info.picocli:picocli-groovy:4.6.3')
+@Grab('org.tukaani:xz:1.9') // should be imported even if not used directly
+@Grab('org.codehaus.gpars:gpars:1.2.1')
+@Grab('org.apache.commons:commons-compress:1.22')
+import groovy.cli.Option
+import groovy.cli.Unparsed
+import groovy.cli.commons.CliBuilder
+import groovyx.gpars.GParsPool
+import org.apache.commons.compress.compressors.CompressorStreamFactory
+
+
+import java.util.concurrent.ConcurrentHashMap
+import java.util.logging.Logger
+
+String tag = 'pseudonymize' // program name shown at the start of the usage line
+
+interface pseudonymizeArgs { // command-line specification passed to CliBuilder.parseFromSpec
+ @Option(shortName = 'k') // -k: key files; each one is loaded by readKey into its own lookup map
+ String[] keys()
+ // -d: destination path; output files are created as <destPath>/<input base name>
+ @Option(shortName = 'd')
+ String destPath()
+ // -h: when set, the script prints its usage text and exits
+ @Option(shortName = 'h', defaultValue = '0')
+ boolean help()
+ // all remaining non-option arguments: the input files to pseudonymize
+ @Unparsed
+ List files()
+}
+
+// Opens fname for writing and returns a PrintStream; the extension (gz, xz, bz2/bzip2/bzip) selects the compressor, anything else is written uncompressed.
+def compressorOutputStream(fname) {
+    def compressorTypes = ["gz": CompressorStreamFactory.GZIP, "xz": CompressorStreamFactory.XZ, "bz2": CompressorStreamFactory.BZIP2, "bzip2": CompressorStreamFactory.BZIP2, "bzip": CompressorStreamFactory.BZIP2]
+    // A name without '.' yields the whole name here, which matches no key and therefore falls through to plain output.
+    def type = compressorTypes[fname.substring(fname.lastIndexOf(".") + 1)]
+    OutputStream sink = new BufferedOutputStream(new FileOutputStream(fname))
+    if (type) sink = new BufferedOutputStream(new CompressorStreamFactory().createCompressorOutputStream(type, sink))
+    return new PrintStream(sink)
+}
+
+// Loads a tab-separated key file (compression auto-detected) into a map from each line's first field to its last field parsed as an Integer.
+static def readKey(fname, log) {
+    def dictionary = new ConcurrentHashMap<String, Integer>(100000000, 0.75, 1)
+    log.info("Reading key ${fname} ...")
+    // splitEachLine consumes the whole stream line by line; every line contributes one dictionary entry.
+    new CompressorStreamFactory().createCompressorInputStream(new BufferedInputStream(new FileInputStream(fname))).splitEachLine("\t") { columns -> dictionary[columns[0]] = Integer.valueOf(columns[columns.size() - 1]) }
+    log.info("Done reading key ${fname}.")
+    return dictionary
+}
+
+CliBuilder cli = new CliBuilder(usage: "${tag} [options] file [files]")
+def options = cli.parseFromSpec(pseudonymizeArgs, args)
+// Print usage and abort on -h, a missing -d (output paths would become "null/<name>"), no input files, or an option-like first file.
+if (options.help() || !options.destPath() || !options.files() || options.files()[0].startsWith('-')) {
+    cli.usage()
+    System.exit(-1)
+}
+
+// Log record layout for java.util.logging's SimpleFormatter: [date time]:level: message
+System.setProperty("java.util.logging.SimpleFormatter.format", '[%1$tF %1$tT]:%4$s: %5$s%n')
+Logger log = Logger.getLogger("")
+log.info("Pseudonymizing ${options.keys().size()} columns")
+// One lookup map per -k key file, kept in command-line order.
+def keyMaps = options.keys().collect { keyFile -> readKey(keyFile, log) }
+
+// Pseudonymize the input files in parallel; each result is written to destPath under the input's base name.
+GParsPool.withPool {
+    options.files().eachParallel(fname -> {
+        def outName = options.destPath() + "/" + new File(fname).getName()
+        if (new File(outName).exists()) {
+            log.warning("${outName} already exists - skipping")
+        } else {
+            log.info("Pseudonymizing ${fname} to ${outName}")
+            def output_stream = compressorOutputStream(outName)
+            try { // the finally below closes the output even when reading or a lookup throws
+                def input = new CompressorStreamFactory().createCompressorInputStream(new BufferedInputStream(new FileInputStream(fname)))
+                input.splitEachLine("\t") { fields ->
+                    // Every column except the last is a replacement candidate. A counting loop is used because the
+                    // inclusive range 0..(size - 2) runs backwards (0, -1) when a line has only one field.
+                    for (int i = 0; i < fields.size() - 1; i++) {
+                        if (i % 3 < keyMaps.size()) { // column i is looked up in key map number i % 3
+                            def val = keyMaps[i % 3][fields[i]]
+                            if (val == null) log.severe("`${fields[i]}' not found in dictionary") // "null" is still written
+                            output_stream.print("${val}\t")
+                        } else { // no key map for this column: copy it unchanged
+                            output_stream.print("${fields[i]}\t")
+                        }
+                    }
+                    output_stream.println(fields[fields.size() - 1])
+                }
+                log.info("Done pseudonymizing ${fname} to ${outName}")
+            } finally { output_stream.close() }
+        }
+    })
+}
\ No newline at end of file