#!/bin/env groovy
2@Grab('info.picocli:picocli-groovy:4.6.3')
3@Grab('org.tukaani:xz:1.9') // should be imported even if not used directly
4@Grab('org.codehaus.gpars:gpars:1.2.1')
5@Grab('org.apache.commons:commons-compress:1.22')
6import groovy.cli.Option
7import groovy.cli.Unparsed
8import groovy.cli.commons.CliBuilder
9import groovyx.gpars.GParsPool
10import org.apache.commons.compress.compressors.CompressorStreamFactory
11
12
13import java.util.concurrent.ConcurrentHashMap
14import java.util.logging.Logger
15
// Tool name, used in the usage line of the CLI help output
def tag = "pseudonymize"
17
// Command-line interface spec consumed by CliBuilder.parseFromSpec below.
// Each method becomes an option; the method name is the long option name.
interface pseudonymizeArgs {
    // -k / --keys: one key dictionary file per column group to pseudonymize
    @Option(shortName = 'k')
    String[] keys()

    // -d / --destPath: output directory for the pseudonymized files
    @Option(shortName = 'd')
    String destPath()

    // -h / --help: print usage and exit
    @Option(shortName = 'h', defaultValue = '0')
    boolean help()

    // Remaining non-option arguments: the input files to process
    @Unparsed
    List files()
}
31
// Returns a PrintStream for fname that transparently compresses its output
// based on the file-name extension (gz, xz, bzip/bz2); any other extension
// gets a plain buffered file stream. Caller is responsible for closing it.
def compressorOutputStream(fname) {
    // Recognized extensions -> commons-compress compressor ids.
    // "bz2" is the conventional bzip2 extension and was missing from the
    // original map; "bzip" is kept for backward compatibility.
    def compressorTypes = [
            "gz"  : CompressorStreamFactory.GZIP,
            "xz"  : CompressorStreamFactory.XZ,
            "bzip": CompressorStreamFactory.BZIP2,
            "bz2" : CompressorStreamFactory.BZIP2
    ]
    String extension = fname.substring(fname.lastIndexOf(".") + 1)
    def type = compressorTypes[extension]
    if (type)
        return new PrintStream(new BufferedOutputStream(
                new CompressorStreamFactory().createCompressorOutputStream(
                        type, new BufferedOutputStream(new FileOutputStream(fname)))))
    else
        return new PrintStream(new BufferedOutputStream(new FileOutputStream(fname)))
}
41
// Reads a compressed pseudonymization key table (tab-separated; first column
// is the token, last column its integer id) into a token -> id map.
// The last occurrence of a token wins.
static def readKey(fname, log) {
    // NOTE(review): the 100M initial capacity is kept from the original —
    // presumably sized to avoid rehashing on very large key files; confirm
    // the memory budget allows pre-allocating a table this large.
    def myMap = new ConcurrentHashMap<String, Integer>(100000000, 0.75, 1)
    log.info("Reading key ${fname} ...")
    // withCloseable ensures the decompressor (and the underlying file handle)
    // is closed even on a malformed line — the original leaked this stream.
    new CompressorStreamFactory().createCompressorInputStream(
            new BufferedInputStream(new FileInputStream(fname))).withCloseable { input ->
        input.splitEachLine("\t") { fields ->
            myMap.put(fields[0], Integer.valueOf(fields[fields.size() - 1]))
        }
    }
    log.info("Done reading key ${fname}.")
    return myMap
}
50
// Parse the command line against the pseudonymizeArgs spec above.
CliBuilder cli = new CliBuilder(usage: "${tag} [options] file [files]")
def options = cli.parseFromSpec(pseudonymizeArgs, args)

// Bail out with usage if help was requested, no input files were given,
// or the first positional argument looks like a stray option flag.
if (options.help() || !options.files() || options.files()[0].startsWith('-')) {
    cli.usage()
    System.exit(-1)
}

// Terse single-line log format: [date time]:LEVEL: message
System.setProperty("java.util.logging.SimpleFormatter.format", '[%1$tF %1$tT]:%4$s: %5$s%n')
Logger log = Logger.getLogger("")
log.info("Pseudonymizing ${options.keys().size()} columns")
// One token -> id map per -k key file, in command-line order.
def keyMaps = []

options.keys().each { fname -> keyMaps << readKey(fname, log) }
65
// Process all input files in parallel: for each file, replace the configured
// columns with their dictionary ids and write the result (same base name,
// same compression as implied by the extension) into destPath.
GParsPool.withPool {
    options.files().eachParallel(fname -> {
        def outName = options.destPath() + "/" + new File(fname).getName()
        def outFile = new File(outName)
        if (outFile.exists()) {
            log.warning("${outName} already exists - skipping")
        } else {
            log.info("Pseudonymizing ${fname} to ${outName}")
            def output_stream = compressorOutputStream(outName)
            def input = new CompressorStreamFactory().createCompressorInputStream(new BufferedInputStream(new FileInputStream(fname)))
            try {
                input.splitEachLine("\t") { fields ->
                    // Classic for loop instead of the original 0..(size - 2)
                    // range: a Groovy range 0..-1 is a *decreasing* range, so
                    // single-column lines made the original iterate i = 0 and
                    // i = -1, emitting the field twice.
                    for (int i = 0; i < fields.size() - 1; i++) {
                        // NOTE(review): i % 3 assumes columns come in groups
                        // of three with one key map per group — confirm this
                        // matches the corpus file format.
                        if (i % 3 < keyMaps.size()) {
                            def val = keyMaps[i % 3][fields[i]]
                            if (val == null)
                                log.severe("`${fields[i]}' not found in dictionary")
                            output_stream.print("${val}\t")
                        } else {
                            output_stream.print("${fields[i]}\t")
                        }
                    }
                    // Last column is copied through unchanged. (The original
                    // prepended a dead, always-empty output_string here.)
                    output_stream.println(fields[fields.size() - 1])
                }
                log.info("Done pseudonymizing ${fname} to ${outName}")
            } finally {
                // Close both streams even on error: the original never closed
                // input and left output unflushed when an exception was thrown.
                input.close()
                output_stream.close()
            }
        }
    })
}