blob: afc24355f3d6146df6d714cd087519d2c38fe750 [file] [log] [blame]
Marc Kupietz42e37e12022-11-04 08:52:27 +01001#!/bin/env groovy
2@Grab('info.picocli:picocli-groovy:4.6.3')
3@Grab('org.tukaani:xz:1.9')
4@Grab('org.codehaus.gpars:gpars:1.2.1')
5@Grab('org.apache.commons:commons-compress:1.22')
6
7import groovy.cli.Option
8import groovy.cli.Unparsed
9import groovy.cli.commons.CliBuilder
10import groovyx.gpars.GParsPool
11import org.apache.commons.compress.compressors.CompressorStreamFactory
12
13
14import java.util.concurrent.ConcurrentHashMap
15import java.util.concurrent.atomic.LongAdder
16import java.util.logging.Logger
17
// Short tool name; reused for the CLI usage banner and the logger name.
String tag = 'generate_pseudonym_key'
19
/**
 * Command line contract of the script. Each annotated method describes one
 * option; the interface is handed to {@code CliBuilder.parseFromSpec}.
 */
interface GeneratePseudonymKeyArgs {
    /** Index of the input column whose values receive pseudonyms ('0' when not given). */
    @Option(shortName = 'c', description = 'generate pseudonyms for column n', defaultValue = '0')
    int column()

    /**
     * Keys that are assigned the negative pseudonyms -n..-1. Given on the
     * command line as a single comma separated string, split by the converter.
     */
    @Option(
            shortName = 's',
            description = "comma separated special keys (will get pseudonyms -n..-1)",
            defaultValue = "«END»,«START»",
            convert = { value -> value.split(",") }
    )
    String[] specialKeys()

    /** Flag asking for the usage text. */
    @Option(shortName = 'h')
    boolean help()

    /** Remaining positional arguments; the script uses them as input file names. */
    @Unparsed
    List files()
}
33
// Build the parser from the annotated option interface and parse the script arguments.
CliBuilder cli = new CliBuilder(usage: "${tag} [options] file [files]")
def options = cli.parseFromSpec(GeneratePseudonymKeyArgs, args)

// Print the usage text and stop when help was requested, no file was named,
// or the first positional argument still looks like an option.
boolean helpRequested = options.help()
def requestedFiles = options.files()
boolean firstLooksLikeOption = requestedFiles && requestedFiles[0].startsWith('-')
if (helpRequested || !requestedFiles || firstLooksLikeOption) {
    cli.usage()
    System.exit(-1)
}
41
// Shared key -> frequency table filled by the parallel readers. It is sized
// generously up front and uses the number of input files as concurrency level.
int expectedKeys = 10000000
float loadFactor = 0.75f
int fileCount = options.files().size()
def freqList = new ConcurrentHashMap<String, LongAdder>(expectedKeys, loadFactor, fileCount)

// Log line layout: "[date time]:LEVEL: message". Set before the first record is logged.
System.setProperty("java.util.logging.SimpleFormatter.format", '[%1$tF %1$tT]:%4$s: %5$s%n')
Logger log = Logger.getLogger("org.ids_mannheim.${tag}")
log.info("Generating pseudonym key for column ${options.column()} in ${options.files()}")
46
// The key column is the same for every line, so it is looked up once.
int keyColumn = options.column()

// Read all input files in parallel. Each file is opened through the compressor
// stream factory, split into tab separated fields line by line, and the last
// field of every line is added to the counter of that line's key.
GParsPool.withPool {
    options.files().eachParallel { fname ->
        log.info("Reading ${fname}")
        def rawStream = new BufferedInputStream(new FileInputStream(fname))
        def input = new CompressorStreamFactory().createCompressorInputStream(rawStream)
        input.splitEachLine("\t") { fields ->
            // Look up (or create) the counter first, then parse the count,
            // so malformed lines fail in the same place as before.
            LongAdder counter = freqList.computeIfAbsent(fields[keyColumn], { key -> new LongAdder() })
            counter.add(Long.parseLong(fields[-1]))
        }
        log.info("Done reading ${fname}")
    }
}
57
log.info("Sorting and writing...")

// Special keys are taken out of the frequency table and printed first, with
// the negative pseudonyms -n..-1 in the order in which they were given.
def reservedKeys = options.specialKeys()
def j = -reservedKeys.size()
for (reservedKey in reservedKeys) {
    freqList.remove(reservedKey)
    println("${reservedKey}\t${j++}")
}
65
// Every remaining key gets a pseudonym 0, 1, 2, ... in order of descending
// frequency; keys with equal frequency are ordered by ascending key.
def i = 0
freqList.entrySet()
        .parallelStream()
        .sorted({ a, b ->
            // Compare the primitive sums directly: LongAdder is not Comparable,
            // so `<=>` on the adders would go through Groovy's generic Number
            // comparison on every call of the sort.
            int byCount = Long.compare(b.value.sum(), a.value.sum())
            byCount != 0 ? byCount : (a.key <=> b.key)
        })
        // forEachOrdered handles one element at a time in sorted order, so the
        // counter can be advanced from inside the closure.
        .forEachOrdered({ e -> println "${e.key}\t${i++}" })