| #!/bin/env groovy |
| @Grab('info.picocli:picocli-groovy:4.6.3') |
| @Grab('org.tukaani:xz:1.9') // should be imported even if not used directly |
| @Grab('org.codehaus.gpars:gpars:1.2.1') |
| @Grab('org.apache.commons:commons-compress:1.22') |
| import groovy.cli.Option |
| import groovy.cli.Unparsed |
| import groovy.cli.commons.CliBuilder |
| import groovyx.gpars.GParsPool |
| import org.apache.commons.compress.compressors.CompressorStreamFactory |
| |
| |
| import java.util.concurrent.ConcurrentHashMap |
| import java.util.logging.Logger |
| |
// Program name interpolated into the CLI usage banner.
String tag = "pseudonymize"
| |
/**
 * Command-line contract of the script; CliBuilder.parseFromSpec builds the option
 * set from these annotations and returns an instance backed by the parsed values.
 * Each option carries a description so that the usage message explains itself.
 */
interface pseudonymizeArgs {
    /** Key files; each is loaded into one lookup table by readKey. */
    @Option(shortName = 'k', description = 'key file: tab-separated lines, first field is the original value, last field its integer replacement')
    String[] keys()

    /** Directory that receives one output file per input file. */
    @Option(shortName = 'd', description = 'directory the pseudonymized files are written to')
    String destPath()

    /** When set, the script prints its usage message and exits. */
    @Option(shortName = 'h', defaultValue = '0', description = 'print this usage message and exit')
    boolean help()

    /** Remaining positional arguments: the input files to pseudonymize. */
    @Unparsed
    List files()
}
| |
/**
 * Opens {@code fname} for writing and wraps it in a PrintStream.
 *
 * The text after the last '.' in the name selects the compression format;
 * names with an unrecognised extension are written uncompressed.
 *
 * @param fname path of the file to create
 * @return a buffered PrintStream; closing it closes the underlying file
 */
def compressorOutputStream(fname) {
    // "bzip" is kept for backward compatibility; "bz2" and "bzip2" are added so that
    // files carrying those extensions are actually compressed instead of written as plain text.
    def compressorTypes = [
            "gz"   : CompressorStreamFactory.GZIP,
            "xz"   : CompressorStreamFactory.XZ,
            "bz2"  : CompressorStreamFactory.BZIP2,
            "bzip2": CompressorStreamFactory.BZIP2,
            "bzip" : CompressorStreamFactory.BZIP2
    ]
    String extension = fname.substring(fname.lastIndexOf(".") + 1)
    def type = compressorTypes[extension]

    def fileOut = new BufferedOutputStream(new FileOutputStream(fname))
    if (!type)
        return new PrintStream(fileOut)

    try {
        def compressed = new CompressorStreamFactory().createCompressorOutputStream(type, fileOut)
        return new PrintStream(new BufferedOutputStream(compressed))
    } catch (Exception e) {
        // Do not leave the freshly opened file handle dangling when the compressor cannot be created.
        fileOut.close()
        throw e
    }
}
| |
/**
 * Loads one key file into a lookup table.
 *
 * The file is opened through CompressorStreamFactory's input auto-detection and
 * split on tabs line by line; the first field of a line becomes the map key and
 * the last field, parsed as an Integer, becomes the value.
 *
 * @param fname path of the key file
 * @param log   logger used for the start/finish progress messages
 * @return the populated ConcurrentHashMap
 */
static def readKey(fname, log) {
    // Pre-sized table (initial capacity, load factor, concurrency level).
    def lookup = new ConcurrentHashMap<String, Integer>(100000000,0.75,1)
    log.info("Reading key ${fname} ...")

    def rawStream = new BufferedInputStream(new FileInputStream(fname))
    def decoded = new CompressorStreamFactory().createCompressorInputStream(rawStream)
    decoded.splitEachLine("\t") { columns ->
        // Index -1 addresses the last field of the line.
        lookup.put(columns[0], Integer.valueOf(columns[-1]))
    }

    log.info("Done reading key ${fname}.")
    return lookup
}
| |
// Parse the command line against the pseudonymizeArgs specification.
CliBuilder cli = new CliBuilder(usage: "${tag} [options] file [files]")
def options = cli.parseFromSpec(pseudonymizeArgs, args)

// Show the usage text and stop when help was requested, when no input file was
// given, or when the first positional argument looks like an option.
boolean helpRequested = options.help()
def positional = options.files()
boolean inputsUnusable = !positional || positional[0].startsWith('-')
if (helpRequested || inputsUnusable) {
    cli.usage()
    System.exit(-1)
}
| |
// Log line layout: [date time]:LEVEL: message
System.setProperty("java.util.logging.SimpleFormatter.format", '[%1$tF %1$tT]:%4$s: %5$s%n')
Logger log = Logger.getLogger("")

// keys() may be null or empty when no -k option is supplied; normalise it to a list
// so that the size report and the loading loop below do not fail in that case.
def keyFiles = (options.keys() ?: []) as List
log.info("Pseudonymizing ${keyFiles.size()} columns")

// One lookup table per key file, in command-line order.
def keyMaps = []

keyFiles.each { fname -> keyMaps << readKey(fname, log) }
| |
// Destination directory. Falls back to the current directory when -d is not given,
// rather than building output paths under a literal "null" directory.
def destDir = options.destPath() ?: "."

// Process every input file in parallel; each one is written to <destDir>/<input file name>.
// Existing output files are never overwritten.
GParsPool.withPool {
    options.files().eachParallel { fname ->
        def outName = destDir + "/" + new File(fname).getName()
        if (new File(outName).exists()) {
            log.warning("${outName} already exists - skipping")
            return
        }
        log.info("Pseudonymizing ${fname} to ${outName}")

        // Open the input before creating the output: an unreadable input must not leave
        // an empty output file behind, since that file would be skipped on the next run.
        def input = new CompressorStreamFactory().createCompressorInputStream(new BufferedInputStream(new FileInputStream(fname)))
        input.withCloseable { source ->
            def output_stream = compressorOutputStream(outName)
            try {
                source.splitEachLine("\t") { fields ->
                    int lastIndex = fields.size() - 1
                    // Every column except the last is a candidate for replacement. Column i is
                    // looked up in key i % 3 when that many keys were loaded; otherwise it is
                    // copied through. A plain index loop is used because a Groovy range such as
                    // 0..-1 counts downwards and would wrongly visit single-field lines.
                    for (int i = 0; i < lastIndex; i++) {
                        if (i % 3 < keyMaps.size()) {
                            def val = keyMaps[i % 3][fields[i]]
                            if (val == null)
                                log.severe("`${fields[i]}' not found in dictionary")
                            // A missing entry is still written (as "null") so the original value never leaks.
                            output_stream.print("${val}\t")
                        } else {
                            output_stream.print("${fields[i]}\t")
                        }
                    }
                    // The last column is always copied unchanged.
                    output_stream.println(fields[lastIndex])
                }
            } finally {
                output_stream.close()
            }
            // PrintStream swallows IOExceptions; surface them instead of reporting silent success.
            if (output_stream.checkError())
                log.severe("I/O error while writing ${outName}")
        }
        log.info("Done pseudonymizing ${fname} to ${outName}")
    }
}