#!/usr/bin/env groovy
@Grab('info.picocli:picocli-groovy:4.6.3')
@Grab('org.tukaani:xz:1.9') // not imported directly, but must be on the classpath for commons-compress XZ support
@Grab('org.codehaus.gpars:gpars:1.2.1')
@Grab('org.apache.commons:commons-compress:1.22')
import groovy.cli.Option
import groovy.cli.Unparsed
import groovy.cli.commons.CliBuilder
import groovyx.gpars.GParsPool
import org.apache.commons.compress.compressors.CompressorStreamFactory
import java.util.concurrent.ConcurrentHashMap
import java.util.logging.Logger
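// Pseudonymizes tab-separated files: keyed columns are replaced by the integer
// identifiers found in the supplied key dictionaries.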
def tag = "pseudonymize"
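// Command-line options: -k key file(s), -d destination directory, -h help;
// the remaining (unparsed) arguments are the files to pseudonymize.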
interface pseudonymizeArgs {
    @Option(shortName = 'k')
    String[] keys()
    @Option(shortName = 'd')
    String destPath()
    @Option(shortName = 'h', defaultValue = '0')
    boolean help()
    @Unparsed
    List files()
}
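// Open a PrintStream for fname, compressing according to its extension (gz, xz, bz2).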
def compressorOutputStream(fname) {
    // choose a compressor from the output file's extension; fall back to plain output
    def compressorTypes = ["gz": CompressorStreamFactory.GZIP, "xz": CompressorStreamFactory.XZ, "bz2": CompressorStreamFactory.BZIP2]
    String extension = fname.substring(fname.lastIndexOf(".") + 1)
    def type = compressorTypes[extension]
    if (type)
        return new PrintStream(new BufferedOutputStream(
            new CompressorStreamFactory().createCompressorOutputStream(type,
                new BufferedOutputStream(new FileOutputStream(fname)))))
    else
        return new PrintStream(new BufferedOutputStream(new FileOutputStream(fname)))
}
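// Read a (possibly compressed) tab-separated key file into a map from original
// value (first column) to integer pseudonym (last column).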
static def readKey(fname, log) {
    // large initial capacity, single concurrency segment: the map is filled once here and only read afterwards
    def myMap = new ConcurrentHashMap<String, Integer>(100000000, 0.75, 1)
    log.info("Reading key ${fname} ...")
    def input = new CompressorStreamFactory().createCompressorInputStream(new BufferedInputStream(new FileInputStream(fname)))
    input.splitEachLine("\t") { fields -> myMap.put(fields[0], Integer.valueOf(fields[fields.size() - 1])) }
    log.info("Done reading key ${fname}.")
    return myMap
}
CliBuilder cli = new CliBuilder(usage: "${tag} [options] file [files]")
def options = cli.parseFromSpec(pseudonymizeArgs, args)
// show usage when help is requested, no input files were given, or the first non-option argument still looks like a flag
if (options.help() || !options.files() || options.files()[0].startsWith('-')) {
    cli.usage()
    System.exit(-1)
}
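// Configure java.util.logging for compact single-line messages on the root logger.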
System.setProperty("java.util.logging.SimpleFormatter.format", '[%1$tF %1$tT]:%4$s: %5$s%n')
Logger log = Logger.getLogger("")
log.info("Pseudonymizing ${options.keys().size()} columns")
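// Load one key dictionary per -k argument, in command-line order.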
def keyMaps = []
options.keys().each { fname -> keyMaps << readKey(fname, log) }
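// Process the input files in parallel, one file per worker.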
GParsPool.withPool {
    options.files().eachParallel { fname ->
        def outName = options.destPath() + "/" + new File(fname).getName()
        def outFile = new File(outName)
        if (outFile.exists()) {
            log.warning("${outName} already exists - skipping")
        } else {
            log.info("Pseudonymizing ${fname} to ${outName}")
            def output_stream = compressorOutputStream(outName)
            def input = new CompressorStreamFactory().createCompressorInputStream(new BufferedInputStream(new FileInputStream(fname)))
            input.splitEachLine("\t") { fields ->
                for (int i in 0..(fields.size() - 2)) {
                    // Within each group of three columns, the first keyMaps.size() columns are
                    // replaced by their pseudonyms; the remaining columns pass through unchanged.
                    if (i % 3 < keyMaps.size()) {
                        def val = keyMaps[i % 3][fields[i]]
                        // a missing entry is logged and the literal "null" is written
                        if (val == null)
                            log.severe("`${fields[i]}' not found in dictionary")
                        output_stream.print("${val}\t")
                    } else {
                        output_stream.print("${fields[i]}\t")
                    }
                }
                // the last column is written unchanged and terminates the output line
                output_stream.println(fields[fields.size() - 1])
            }
            log.info("Done pseudonymizing ${fname} to ${outName}")
            output_stream.close()
        }
    }
}