Marc Kupietz | e582d9c | 2022-11-04 15:16:05 +0100 | [diff] [blame^] | 1 | #!/bin/env groovy |
| 2 | @Grab('info.picocli:picocli-groovy:4.6.3') |
| 3 | @Grab('org.tukaani:xz:1.9') // should be imported even if not used directly |
| 4 | @Grab('org.codehaus.gpars:gpars:1.2.1') |
| 5 | @Grab('org.apache.commons:commons-compress:1.22') |
| 6 | import groovy.cli.Option |
| 7 | import groovy.cli.Unparsed |
| 8 | import groovy.cli.commons.CliBuilder |
| 9 | import groovyx.gpars.GParsPool |
| 10 | import org.apache.commons.compress.compressors.CompressorStreamFactory |
| 11 | |
| 12 | |
| 13 | import java.util.concurrent.ConcurrentHashMap |
| 14 | import java.util.logging.Logger |
| 15 | |
// Program name interpolated into the CLI usage line built further down.
def tag = 'pseudonymize'
| 17 | |
/**
 * Command line specification handed to {@code CliBuilder.parseFromSpec}.
 * Every annotated method describes one option; the un-annotated remainder of
 * the command line is exposed through {@link #files()}.
 */
interface pseudonymizeArgs {
    /** {@code -k}: key files; each one is loaded with {@code readKey}. */
    @Option(shortName = 'k') String[] keys()

    /** {@code -d}: directory prefix under which the output files are created. */
    @Option(shortName = 'd') String destPath()

    /** {@code -h}: when set, the script prints its usage and exits. */
    @Option(shortName = 'h', defaultValue = '0') boolean help()

    /** Remaining arguments: the input files to pseudonymize. */
    @Unparsed List files()
}
| 31 | |
/**
 * Opens {@code fname} for writing and wraps it in a {@link PrintStream},
 * compressing the data when the file extension names a supported format.
 *
 * Recognised extensions are {@code gz} (gzip), {@code xz}, and {@code bz2} /
 * {@code bzip} (bzip2). Any other extension, or none at all, yields a plain
 * uncompressed stream.
 *
 * @param fname path of the file to create
 * @return a buffered PrintStream writing to {@code fname}
 */
def compressorOutputStream(fname) {
    def compressorTypes = [
            "gz"  : CompressorStreamFactory.GZIP,
            "xz"  : CompressorStreamFactory.XZ,
            // "bz2" is the customary bzip2 suffix; without it such files were written uncompressed.
            "bz2" : CompressorStreamFactory.BZIP2,
            // Kept so that callers relying on the old "bzip" suffix still get compression.
            "bzip": CompressorStreamFactory.BZIP2
    ]
    // With no "." in the name lastIndexOf yields -1, so the whole name is looked up (and not found).
    String extension = fname.substring(fname.lastIndexOf(".") + 1)
    def type = compressorTypes[extension]
    def fileStream = new BufferedOutputStream(new FileOutputStream(fname))
    if (!type)
        return new PrintStream(fileStream)
    try {
        return new PrintStream(new BufferedOutputStream(new CompressorStreamFactory().createCompressorOutputStream(type, fileStream)))
    } catch (Exception e) {
        // Do not leak the already opened file handle when the compressor cannot be created.
        fileStream.close()
        throw e
    }
}
| 41 | |
/**
 * Loads a key file into a map.
 *
 * The file is opened through {@code CompressorStreamFactory} and read line by
 * line, splitting on tabs. For every line the first field becomes the map key
 * and the last field, parsed as an integer, becomes its value.
 *
 * @param fname path of the key file
 * @param log   logger used for the start and end messages
 * @return the populated map from first field to integer value
 */
static def readKey(fname, log) {
    // Large initial capacity and concurrency level 1, exactly as configured originally.
    def dictionary = new ConcurrentHashMap<String, Integer>(100000000, 0.75, 1)
    log.info("Reading key ${fname} ...")
    def rawStream = new BufferedInputStream(new FileInputStream(fname))
    def keyStream = new CompressorStreamFactory().createCompressorInputStream(rawStream)
    keyStream.splitEachLine("\t") { columns ->
        def word = columns[0]
        def id = Integer.valueOf(columns[-1])
        dictionary.put(word, id)
    }
    log.info("Done reading key ${fname}.")
    return dictionary
}
| 50 | |
// ---------------------------------------------------------------------------
// Script entry point: parse the command line, load the key maps, then rewrite
// every input file in parallel into the destination directory.
// ---------------------------------------------------------------------------
CliBuilder cli = new CliBuilder(usage: "${tag} [options] file [files]")
def options = cli.parseFromSpec(pseudonymizeArgs, args)

// Besides -h and missing input files, a missing -d is a usage error too:
// without it every output path would start with the literal text "null/".
if (options.help() || !options.files() || options.files()[0].startsWith('-') || !options.destPath()) {
    cli.usage()
    System.exit(-1)
}

System.setProperty("java.util.logging.SimpleFormatter.format", '[%1$tF %1$tT]:%4$s: %5$s%n')
Logger log = Logger.getLogger("")

// keys() may be null when no -k option was given; treat that as "no key files".
def keyFiles = (options.keys() ?: []) as List
log.info("Pseudonymizing ${keyFiles.size()} columns")
def keyMaps = []

keyFiles.each { fname -> keyMaps << readKey(fname, log) }

GParsPool.withPool {
    options.files().eachParallel { fname ->
        def outName = options.destPath() + "/" + new File(fname).getName()
        if (new File(outName).exists()) {
            log.warning("${outName} already exists - skipping")
            return
        }
        log.info("Pseudonymizing ${fname} to ${outName}")

        // Open the input first: if it cannot be read, no empty output file is left
        // behind that a later run would skip as "already exists".
        def input = new CompressorStreamFactory().createCompressorInputStream(new BufferedInputStream(new FileInputStream(fname)))
        def output_stream = null
        try {
            output_stream = compressorOutputStream(outName)
            input.splitEachLine("\t") { fields ->
                def lastIndex = fields.size() - 1
                // Plain index loop instead of the range 0..(size - 2): for a single-field
                // line that range becomes the reverse range 0..-1 and would visit the
                // indices 0 and -1, emitting the only field several times.
                for (int i = 0; i < lastIndex; i++) {
                    // The column index modulo 3 selects the key map -- presumably the input
                    // columns repeat in groups of three; TODO confirm against the data format.
                    if (i % 3 < keyMaps.size()) {
                        def val = keyMaps[i % 3][fields[i]]
                        if (val == null)
                            log.severe("`${fields[i]}' not found in dictionary")
                        // A missing key is still written (as "null") so the column layout is kept.
                        output_stream.print("${val}\t")
                    } else {
                        output_stream.print("${fields[i]}\t")
                    }
                }
                // The last field is always copied through unchanged.
                output_stream.println(fields[lastIndex])
            }
            log.info("Done pseudonymizing ${fname} to ${outName}")
        } finally {
            // Release both streams even when processing fails part-way.
            output_stream?.close()
            input.close()
        }
    }
}