| #!/bin/env groovy |
| @Grab('info.picocli:picocli-groovy:4.6.3') |
| @Grab('org.tukaani:xz:1.9') |
| @Grab('org.codehaus.gpars:gpars:1.2.1') |
| @Grab('org.apache.commons:commons-compress:1.22') |
| |
| import groovy.cli.Option |
| import groovy.cli.Unparsed |
| import groovy.cli.commons.CliBuilder |
| import groovyx.gpars.GParsPool |
| import org.apache.commons.compress.compressors.CompressorStreamFactory |
| |
| |
| import java.util.concurrent.ConcurrentHashMap |
| import java.util.concurrent.atomic.LongAdder |
| import java.util.logging.Logger |
| |
| // Script name: interpolated into the CLI usage line and into the logger name. |
| String tag = 'generate_pseudonym_key' |
| |
| /** |
|  * Command line specification consumed by CliBuilder.parseFromSpec: |
|  * every annotated method describes one option, the un-annotated remainder |
|  * of the command line is exposed through files(). |
|  */ |
| interface GeneratePseudonymKeyArgs { |
|     /** Index of the tab-separated column whose values are used as keys (defaults to 0). */ |
|     @Option( |
|         shortName = 'c', |
|         description = 'generate pseudonyms for column n', |
|         defaultValue = '0' |
|     ) |
|     int column() |
| |
|     /** Keys that are numbered separately with negative pseudonyms; given as one comma separated string. */ |
|     @Option( |
|         shortName = 's', |
|         description = "comma separated special keys (will get pseudonyms -n..-1)", |
|         defaultValue = "«END»,«START»", |
|         convert = { keys -> keys.split(",") } |
|     ) |
|     String[] specialKeys() |
| |
|     /** True when -h was given; the script then prints the usage text and exits. */ |
|     @Option(shortName = 'h') |
|     boolean help() |
| |
|     /** Remaining positional arguments, i.e. the input files to read. */ |
|     @Unparsed |
|     List files() |
| } |
| |
| // Build the parser from the annotated spec interface and parse the script arguments. |
| CliBuilder cli = new CliBuilder(usage: "${tag} [options] file [files]") |
| def options = cli.parseFromSpec(GeneratePseudonymKeyArgs, args) |
| |
| // The run is only usable when help was not requested, at least one file was given, |
| // and the first positional argument does not look like a stray option. |
| def positional = options.files() |
| boolean usable = !options.help() && positional && !positional[0].startsWith('-') |
| if (!usable) { |
|     cli.usage() |
|     System.exit(-1) |
| } |
| |
| // Log line layout for java.util.logging's SimpleFormatter: "[date time]:LEVEL: message". |
| // Set before the logger is obtained. |
| System.setProperty("java.util.logging.SimpleFormatter.format", '[%1$tF %1$tT]:%4$s: %5$s%n') |
| Logger log = Logger.getLogger("org.ids_mannheim.${tag}") |
| |
| // Shared key -> summed count map filled by the parallel readers. |
| // Constructor arguments: initial capacity, load factor, concurrency level (one per input file). |
| def freqList = new ConcurrentHashMap<String, LongAdder>( |
|     10_000_000, |
|     0.75f, |
|     options.files().size()) |
| |
| log.info("Generating pseudonym key for column ${options.column()} in ${options.files()}") |
| |
| // Read all input files in parallel. For every line, the value of the last tab-separated |
| // field is parsed as a long and added to the counter stored under the key found in the |
| // selected column. |
| GParsPool.withPool { |
|     // The key column is loop-invariant: look it up once instead of once per input line. |
|     final int keyColumn = options.column() |
|     options.files().eachParallel(fname -> { |
|         log.info("Reading ${fname}") |
|         def rawStream = new BufferedInputStream(new FileInputStream(fname)) |
|         try { |
|             // The factory picks the decompressor by inspecting the stream; it throws if none matches. |
|             def input = new CompressorStreamFactory().createCompressorInputStream(rawStream) |
|             input.splitEachLine("\t") { fields -> |
|                 // A short line would yield a null key, which ConcurrentHashMap rejects with a |
|                 // context-free NullPointerException; fail with the file and line instead. |
|                 if (fields.size() <= keyColumn) { |
|                     throw new IllegalArgumentException( |
|                         "${fname}: line has no column ${keyColumn}: ${fields}".toString()) |
|                 } |
|                 long count = Long.parseLong(fields[fields.size() - 1]) |
|                 freqList.computeIfAbsent(fields[keyColumn], k -> new LongAdder()).add(count) |
|             } |
|         } finally { |
|             // Releases the file handle even when format detection fails before any |
|             // line is read; closing an already closed stream is harmless. |
|             rawStream.close() |
|         } |
|         log.info("Done reading ${fname}") |
|     }) |
| } |
| |
| log.info("Sorting and writing...") |
| |
| // Special keys are printed first with the negative pseudonyms -n..-1, in the order given, |
| // and removed from the frequency map so they are not numbered a second time below. |
| def j = -options.specialKeys().size() |
| options.specialKeys().each { |
|     freqList.remove(it) |
|     println("${it}\t${j++}") |
| } |
| |
| // All remaining keys are numbered from 0 upwards by descending total count; |
| // equal counts are ordered by ascending key so the output is deterministic. |
| def i = 0 |
| freqList.entrySet() |
|     .parallelStream() |
|     // LongAdder is not Comparable, so compare the summed counts as plain longs rather |
|     // than relying on Groovy's generic Number comparison of the adder objects. |
|     .sorted({ a, b -> (b.value.sum() <=> a.value.sum()) ?: (a.key <=> b.key) }) |
|     // forEachOrdered keeps the sorted order, so the running index matches the rank. |
|     .forEachOrdered({ e -> println "${e.key}\t${i++}" }) |