Add groovy script for generating pseudonym keys
Change-Id: I8e8e14ca6551f82b73a097ed544b3977b11f573a
diff --git a/scripts/generate_pseudonym_key.groovy b/scripts/generate_pseudonym_key.groovy
new file mode 100755
index 0000000..afc2435
--- /dev/null
+++ b/scripts/generate_pseudonym_key.groovy
@@ -0,0 +1,70 @@
+#!/usr/bin/env groovy
+@Grab('info.picocli:picocli-groovy:4.6.3')
+@Grab('org.tukaani:xz:1.9')
+@Grab('org.codehaus.gpars:gpars:1.2.1')
+@Grab('org.apache.commons:commons-compress:1.22')
+
+import groovy.cli.Option
+import groovy.cli.Unparsed
+import groovy.cli.commons.CliBuilder
+import groovyx.gpars.GParsPool
+import org.apache.commons.compress.compressors.CompressorStreamFactory
+
+
+import java.util.concurrent.ConcurrentHashMap
+import java.util.concurrent.atomic.LongAdder
+import java.util.logging.Logger
+
+def tag = "generate_pseudonym_key" // base name reused in the usage line and in the logger name
+
+interface GeneratePseudonymKeyArgs { // option spec handed to cli.parseFromSpec
+    @Option(shortName = 'c', defaultValue = '0', description = 'generate pseudonyms for column n')
+    int column()
+    // The converter splits the comma separated value into the individual special keys.
+    @Option(shortName = 's', defaultValue = "«END»,«START»", convert = { it.split(",")}, description = "comma separated special keys (will get pseudonyms -n..-1)")
+    String[] specialKeys()
+    // Described so that -h gets a meaningful entry in the generated usage text.
+    @Option(shortName = 'h', description = 'print this usage information and exit')
+    boolean help()
+    // Everything that is not an option: the input files.
+    @Unparsed
+    List files()
+}
+
+CliBuilder cli = new CliBuilder(usage: "${tag} [options] file [files]")
+def options = cli.parseFromSpec(GeneratePseudonymKeyArgs, args) // binds args to the annotated spec
+// Show usage on -h, on a missing input file, or when the first positional still looks like an option.
+if (options.help() || !options.files() || options.files()[0].startsWith('-')) {
+    cli.usage()
+    System.exit(options.help() ? 0 : -1) // an explicit -h is a success; misuse still fails
+}
+
+System.setProperty("java.util.logging.SimpleFormatter.format", '[%1$tF %1$tT]:%4$s: %5$s%n') // [date time]:LEVEL: message
+Logger log = Logger.getLogger("org.ids_mannheim.${tag}")
+def freqList = new ConcurrentHashMap<String, LongAdder>(10_000_000, 0.75f, options.files().size()) // key -> summed count, filled by the reader tasks
+log.info("Generating pseudonym key for column ${options.column()} in ${options.files()}")
+
+GParsPool.withPool { // every input file is processed as its own parallel task
+    options.files().eachParallel { path ->
+        log.info("Reading ${path}")
+        def raw = new BufferedInputStream(new FileInputStream(path)) // the factory picks the decompressor from this stream
+        new CompressorStreamFactory().createCompressorInputStream(raw).splitEachLine("\t") { cols ->
+            freqList.computeIfAbsent(cols[options.column()], key -> new LongAdder()).add(Long.parseLong(cols[-1])) // last field is the count
+        }
+        log.info("Done reading ${path}")
+    }
+}
+
+log.info("Sorting and writing...")
+// Special keys take the negative pseudonyms -n..-1 in the order given and are dropped from the counts.
+def j = -options.specialKeys().size()
+options.specialKeys().each {
+    freqList.remove(it)
+    println("${it}\t${j++}")
+}
+// Remaining keys are numbered from 0 by descending total count; ties are broken by ascending key.
+def i = 0
+freqList.entrySet()
+    .parallelStream()
+    .sorted({ a, b -> b.value.sum() <=> a.value.sum() ?: a.key <=> b.key }) // compare primitive sums, not LongAdder objects
+    .forEachOrdered({ e -> println "${e.key}\t${i++}" })