| Marc Kupietz | 42e37e1 | 2022-11-04 08:52:27 +0100 | [diff] [blame] | 1 | #!/bin/env groovy | 
|  | 2 | @Grab('info.picocli:picocli-groovy:4.6.3') | 
|  | 3 | @Grab('org.tukaani:xz:1.9') | 
|  | 4 | @Grab('org.codehaus.gpars:gpars:1.2.1') | 
|  | 5 | @Grab('org.apache.commons:commons-compress:1.22') | 
|  | 6 |  | 
|  | 7 | import groovy.cli.Option | 
|  | 8 | import groovy.cli.Unparsed | 
|  | 9 | import groovy.cli.commons.CliBuilder | 
|  | 10 | import groovyx.gpars.GParsPool | 
|  | 11 | import org.apache.commons.compress.compressors.CompressorStreamFactory | 
|  | 12 |  | 
|  | 13 |  | 
|  | 14 | import java.util.concurrent.ConcurrentHashMap | 
|  | 15 | import java.util.concurrent.atomic.LongAdder | 
|  | 16 | import java.util.logging.Logger | 
|  | 17 |  | 
// Program name; interpolated into the usage line and the logger name below.
String tag = 'generate_pseudonym_key'
|  | 19 |  | 
/**
 * Command line specification consumed by {@code CliBuilder.parseFromSpec}.
 * Each annotated method describes one option; the unparsed remainder is the
 * list of input files.
 */
interface GeneratePseudonymKeyArgs {

    /** Index of the tab-separated column whose values receive pseudonyms. */
    @Option(shortName = 'c', defaultValue = '0', description = 'generate pseudonyms for column n')
    int column()

    /** Keys given as one comma separated argument; split into an array by the converter. */
    @Option(shortName = 's', defaultValue = "«END»,«START»", convert = { it.split(',') }, description = "comma separated special keys (will get pseudonyms -n..-1)")
    String[] specialKeys()

    /** Whether the usage text was requested. */
    @Option(shortName = 'h')
    boolean help()

    /** Remaining positional arguments: the input files. */
    @Unparsed
    List files()
}
|  | 33 |  | 
// Parse the command line against the annotated option interface.
def cli = new CliBuilder(usage: "${tag} [options] file [files]")
def options = cli.parseFromSpec(GeneratePseudonymKeyArgs, args)

// Print the usage text and exit with status -1 when help was requested, no
// input file was given, or the first positional argument looks like an option.
boolean helpRequested = options.help()
def requestedFiles = options.files()
boolean firstLooksLikeOption = requestedFiles && requestedFiles[0].startsWith('-')
if (helpRequested || !requestedFiles || firstLooksLikeOption) {
    cli.usage()
    System.exit(-1)
}
|  | 41 |  | 
// The log line format must be configured before the logger emits anything.
System.setProperty("java.util.logging.SimpleFormatter.format", '[%1$tF %1$tT]:%4$s: %5$s%n')
Logger log = Logger.getLogger("org.ids_mannheim.${tag}")

// Frequency table shared by all reader threads: key -> accumulated count.
// The concurrency level passed to the map is the number of input files.
int initialCapacity = 10000000
float loadFactor = 0.75f
int readerCount = options.files().size()
def freqList = new ConcurrentHashMap<String, LongAdder>(initialCapacity, loadFactor, readerCount)

log.info("Generating pseudonym key for column ${options.column()} in ${options.files()}")
|  | 46 |  | 
// Read all input files in parallel and accumulate frequencies in freqList.
// Every line is split on tabs; the field at the configured column is the key
// and the last field of the line is parsed as its frequency.
//
// Blank lines are skipped. Any other line that lacks the key column or whose
// last field is not an integer aborts the run with an IllegalArgumentException
// that names the file and the line number.
GParsPool.withPool {
    // Loop-invariant: resolve the column index once instead of once per line.
    int keyColumn = options.column()
    options.files().eachParallel(fname -> {
        log.info("Reading ${fname}")
        def input = new CompressorStreamFactory().createCompressorInputStream(new BufferedInputStream(new FileInputStream(fname)))
        long lineNo = 0
        input.splitEachLine("\t") { fields ->
            lineNo++
            // A blank line (e.g. a trailing newline) splits into no fields or
            // only empty ones; it carries no data, so it is ignored.
            if (fields.every { it.isEmpty() }) {
                return
            }
            // Indexing past the end of a Groovy list yields null, which the
            // concurrent map would reject with an uninformative NPE.
            def key = fields[keyColumn]
            if (key == null) {
                throw new IllegalArgumentException("${fname}:${lineNo}: no column ${keyColumn} in line with ${fields.size()} field(s)")
            }
            long count
            try {
                count = Long.parseLong(fields.last())
            } catch (NumberFormatException e) {
                throw new IllegalArgumentException("${fname}:${lineNo}: frequency '${fields.last()}' is not an integer", e)
            }
            freqList.computeIfAbsent(key, k -> new LongAdder()).add(count)
        }
        log.info("Done reading ${fname}")
    })
}
|  | 57 |  | 
log.info("Sorting and writing...")

// Special keys are taken out of the frequency table and printed first with
// the negative pseudonyms -n..-1, in the order they were given.
def specials = options.specialKeys()
int firstPseudonym = -specials.size()
specials.eachWithIndex { key, position ->
    freqList.remove(key)
    println("${key}\t${firstPseudonym + position}")
}
|  | 65 |  | 
// Print every remaining key with a pseudonym counting up from 0, ordered by
// descending frequency; equal frequencies are ordered by ascending key.
// LongAdder does not implement Comparable, so the primitive sums are compared
// directly instead of applying the <=> operator to the adders themselves.
// forEachOrdered keeps the sorted order although the stream is parallel.
def i = 0
freqList.entrySet()
        .parallelStream()
        .sorted({ a, b -> Long.compare(b.value.sum(), a.value.sum()) ?: (a.key <=> b.key) })
        .forEachOrdered({ e -> println "${e.key}\t${i++}" })