Incorporate pseudonymization scripts into maven project
Also set the minimum Java version to 17.
Change-Id: Ieb8c9c0cd64214111cb7ef11775ea0272f46054c
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 97bde2c..4cc443d 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,7 +1,7 @@
include:
- template: Security/Dependency-Scanning.gitlab-ci.yml
-image: maven:3.6-jdk-11
+image: maven:3.8-eclipse-temurin-17 # pinned: the build needs Java 17 and must stay reproducible
build-and-test:
stage: build
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 04917f9..73282e3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,9 @@
# Changelog
+## [2.1.0]
+- added script `GeneratePseudonymKey.groovy` to compute pseudonyms
+- added script `Pseudonymize.groovy` to pseudonymize tokens (and lemmas)
+
## [2.0] - 2021-10-07
- for `.*\\.(freq|tsv)(\\.gz)?` input files automatically cumulate frequencies
- -N option added to sort keys with same frequency numerically
diff --git a/config.groovy b/config.groovy
new file mode 100644
index 0000000..1471ad0
--- /dev/null
+++ b/config.groovy
@@ -0,0 +1 @@
+configuration.pluginFactory = org.codehaus.groovy.control.ParserPluginFactory.antlr4() // select the ANTLR4 parser plugin (presumably required for the lambda syntax in the Groovy sources - TODO confirm)
diff --git a/pom.xml b/pom.xml
index 74ceb7b..4a20c45 100644
--- a/pom.xml
+++ b/pom.xml
@@ -10,8 +10,12 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
- <maven.compiler.release>11</maven.compiler.release>
- </properties>
+ <maven.compiler.source>17</maven.compiler.source>
+ <maven.compiler.target>17</maven.compiler.target>
+ <!-- source/target are set explicitly; on JDK 9+ the single release flag below could replace both -->
+ <!-- <maven.compiler.release>17</maven.compiler.release> -->
+ <!-- verbose is useful for debugging purposes -->
+ <maven.compiler.verbose>true</maven.compiler.verbose> </properties>
<build>
<resources>
@@ -28,14 +32,16 @@
</includes>
</resource>
</resources>
+
<plugins>
+
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-javadoc-plugin</artifactId>
<version>3.1.1</version>
<configuration>
<failOnError>false</failOnError>
- <source>${maven.compiler.release}</source>
+ <source>${maven.compiler.target}</source>
<javadocExecutable>${java.home}/bin/javadoc
</javadocExecutable>
</configuration>
@@ -70,12 +76,11 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
- <version>3.8.1</version>
+ <version>3.10.1</version>
<configuration>
<!-- or whatever version you use -->
- <release>${maven.compiler.release}</release>
- <source>${maven.compiler.release}</source>
- <target>${maven.compiler.release}</target>
+ <source>${maven.compiler.source}</source>
+ <target>${maven.compiler.target}</target>
<showDeprecation>true</showDeprecation>
<annotationProcessorPaths>
<path>
@@ -84,10 +89,6 @@
<version>4.2.0</version>
</path>
</annotationProcessorPaths>
- <compilerArgs>
- <arg>-Aproject=${project.groupId}/${project.artifactId}
- </arg>
- </compilerArgs>
</configuration>
<executions>
<!-- Replacing default-compile as it is treated specially by maven -->
@@ -117,8 +118,9 @@
</execution>
</executions>
</plugin>
- <plugin>
- <artifactId>maven-assembly-plugin</artifactId>
+
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
@@ -172,11 +174,85 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
- <!-- JUnit 5 requires Surefire version 2.22.0 or higher -->
<version>2.22.2</version>
+ <configuration>
+ <includes>
+ <include>**/*Test.java</include>
+ </includes>
+ </configuration>
</plugin>
</plugins>
</build>
+
+ <profiles>
+ <profile>
+ <id>activate-this-only-outside-intellij</id>
+ <activation>
+ <property>
+ <name>!idea.version</name>
+ </property>
+ </activation>
+
+ <build>
+ <plugins>
+ <plugin>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>3.10.1</version>
+ <configuration>
+ <compilerId>groovy-eclipse-compiler</compilerId>
+ <compilerArguments>
+ <configScript>${project.basedir}/config.groovy</configScript>
+ </compilerArguments>
+ </configuration>
+ <dependencies>
+ <dependency>
+ <groupId>org.codehaus.groovy</groupId>
+ <artifactId>groovy-eclipse-compiler</artifactId>
+ <version>3.8.0</version>
+ </dependency>
+ <dependency>
+ <groupId>org.codehaus.groovy</groupId>
+ <artifactId>groovy-eclipse-batch</artifactId>
+ <version>4.0.6-02</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.ivy</groupId>
+ <artifactId>ivy</artifactId>
+ <version>2.5.1</version>
+ </dependency>
+ </dependencies>
+ </plugin>
+ </plugins>
+ </build>
+ <dependencies>
+ <dependency>
+ <groupId>org.codehaus.groovy</groupId>
+ <artifactId>groovy-eclipse-compiler</artifactId>
+ <version>3.8.0</version>
+ </dependency>
+ <dependency>
+ <groupId>org.codehaus.groovy</groupId>
+ <artifactId>groovy-eclipse-batch</artifactId>
+ <version>4.0.6-02</version>
+ </dependency>
+ </dependencies>
+
+ <repositories>
+ <repository>
+ <id>groovy-libs-release-local</id>
+ <url>https://groovy.jfrog.io/artifactory/plugins-release-local</url>
+ </repository>
+ </repositories>
+
+ <pluginRepositories>
+ <pluginRepository>
+ <id>groovy-plugins-release-local</id>
+ <url>https://groovy.jfrog.io/artifactory/plugins-release-local</url>
+ </pluginRepository>
+ </pluginRepositories>
+ </profile>
+ </profiles>
+
<dependencies>
<dependency>
<groupId>info.picocli</groupId>
@@ -229,5 +305,43 @@
<artifactId>xz</artifactId>
<version>1.9</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.groovy</groupId>
+ <artifactId>groovy</artifactId>
+ <version>4.0.6</version>
+ </dependency>
+
+ <!-- https://mvnrepository.com/artifact/org.codehaus.groovy/groovy-eclipse-batch -->
+
+ <dependency>
+ <groupId>org.codehaus.gpars</groupId>
+ <artifactId>gpars</artifactId>
+ <version>1.2.1</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.ivy</groupId>
+ <artifactId>ivy</artifactId>
+ <version>2.5.1</version>
+ <scope>compile</scope>
+ <optional>true</optional>
+ </dependency>
+ <dependency>
+ <groupId>info.picocli</groupId>
+ <artifactId>picocli-groovy</artifactId>
+ <version>4.6.3</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.groovy</groupId>
+ <artifactId>groovy-cli-picocli</artifactId>
+ <version>4.0.6</version>
+ </dependency>
+
+ <dependency>
+ <groupId>commons-cli</groupId>
+ <artifactId>commons-cli</artifactId>
+ <version>1.2</version>
+ </dependency>
</dependencies>
+
+
</project>
\ No newline at end of file
diff --git a/scripts/generate_pseudonym_key.groovy b/scripts/generate_pseudonym_key.groovy
deleted file mode 100755
index afc2435..0000000
--- a/scripts/generate_pseudonym_key.groovy
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/bin/env groovy
-@Grab('info.picocli:picocli-groovy:4.6.3')
-@Grab('org.tukaani:xz:1.9')
-@Grab('org.codehaus.gpars:gpars:1.2.1')
-@Grab('org.apache.commons:commons-compress:1.22')
-
-import groovy.cli.Option
-import groovy.cli.Unparsed
-import groovy.cli.commons.CliBuilder
-import groovyx.gpars.GParsPool
-import org.apache.commons.compress.compressors.CompressorStreamFactory
-
-
-import java.util.concurrent.ConcurrentHashMap
-import java.util.concurrent.atomic.LongAdder
-import java.util.logging.Logger
-
-def tag = "generate_pseudonym_key"
-
-interface GeneratePseudonymKeyArgs {
- @Option(shortName = 'c', defaultValue = '0', description = 'generate pseudonyms for column n')
- int column()
-
- @Option(shortName = 's', defaultValue = "«END»,«START»", convert = { it.split(",")}, description = "comma separated special keys (will get pseudonyms -n..-1)")
- String[] specialKeys()
-
- @Option(shortName = 'h')
- boolean help()
-
- @Unparsed
- List files()
-}
-
-CliBuilder cli = new CliBuilder(usage: "${tag} [options] file [files]")
-def options = cli.parseFromSpec(GeneratePseudonymKeyArgs, args)
-
-if (options.help() || !options.files() || options.files()[0].startsWith('-')) {
- cli.usage()
- System.exit(-1)
-}
-
-def freqList = new ConcurrentHashMap<String, LongAdder>(10000000, 0.75, options.files().size())
-System.setProperty("java.util.logging.SimpleFormatter.format", '[%1$tF %1$tT]:%4$s: %5$s%n')
-Logger log = Logger.getLogger("org.ids_mannheim.${tag}")
-log.info("Generating pseudonym key for column ${options.column()} in ${options.files()}")
-
-GParsPool.withPool {
- options.files().eachParallel(fname -> {
- log.info("Reading ${fname}")
- def input = new CompressorStreamFactory().createCompressorInputStream(new BufferedInputStream(new FileInputStream(fname)))
- input.splitEachLine("\t") { fields ->
- freqList.computeIfAbsent(fields[options.column()], k -> new LongAdder()).add(Long.parseLong(fields[fields.size() - 1]))
- }
- log.info("Done reading ${fname}")
- })
-}
-
-log.info("Sorting and writing...")
-
-def j = -options.specialKeys().size()
-options.specialKeys().each {
- freqList.remove(it)
- println("${it}\t${j++}")
-}
-
-def i = 0
-freqList.entrySet()
- .parallelStream()
- .sorted({ a, b -> def ret = b.value <=> a.value; if (ret == 0) a.key <=> b.key else ret })
- .forEachOrdered({ e -> println "${e.key}\t${i++}" })
diff --git a/scripts/pseudonymize.groovy b/scripts/pseudonymize.groovy
deleted file mode 100755
index 70447fd..0000000
--- a/scripts/pseudonymize.groovy
+++ /dev/null
@@ -1,95 +0,0 @@
-#!/bin/env groovy
-@Grab('info.picocli:picocli-groovy:4.6.3')
-@Grab('org.tukaani:xz:1.9') // should be imported even if not used directly
-@Grab('org.codehaus.gpars:gpars:1.2.1')
-@Grab('org.apache.commons:commons-compress:1.22')
-import groovy.cli.Option
-import groovy.cli.Unparsed
-import groovy.cli.commons.CliBuilder
-import groovyx.gpars.GParsPool
-import org.apache.commons.compress.compressors.CompressorStreamFactory
-
-
-import java.util.concurrent.ConcurrentHashMap
-import java.util.logging.Logger
-
-def tag = "pseudonymize"
-
-interface pseudonymizeArgs {
- @Option(shortName = 'k')
- String[] keys()
-
- @Option(shortName = 'd')
- String destPath()
-
- @Option(shortName = 'h', defaultValue = '0')
- boolean help()
-
- @Unparsed
- List files()
-}
-
-def compressorOutputStream(fname) {
- def compresserTypes = ["gz" : CompressorStreamFactory.GZIP, "xz" : CompressorStreamFactory.XZ, "bzip": CompressorStreamFactory.BZIP2]
- String extension = fname.substring(fname.lastIndexOf(".") + 1)
- def type = compresserTypes[extension]
- if (type)
- return new PrintStream(new BufferedOutputStream(new CompressorStreamFactory().createCompressorOutputStream(type, new BufferedOutputStream(new FileOutputStream(fname)))))
- else
- return new PrintStream(new BufferedOutputStream(new FileOutputStream(fname)))
-}
-
-static def readKey(fname, log) {
- def myMap = new ConcurrentHashMap<String, Integer>(100000000,0.75,1)
- log.info("Reading key ${fname} ...")
- def input = new CompressorStreamFactory().createCompressorInputStream(new BufferedInputStream(new FileInputStream(fname)))
- input.splitEachLine("\t") { fields -> myMap.put(fields[0], Integer.valueOf(fields[fields.size() - 1])) }
- log.info("Done reading key ${fname}.")
- return myMap
-}
-
-CliBuilder cli = new CliBuilder(usage: "${tag} [options] file [files]")
-def options = cli.parseFromSpec(pseudonymizeArgs, args)
-
-if (options.help() || !options.files() || options.files()[0].startsWith('-')) {
- cli.usage()
- System.exit(-1)
-}
-
-System.setProperty("java.util.logging.SimpleFormatter.format", '[%1$tF %1$tT]:%4$s: %5$s%n')
-Logger log = Logger.getLogger("")
-log.info("Pseudonymizing ${options.keys().size()} columns")
-def keyMaps = []
-
-options.keys().each { fname -> keyMaps << readKey(fname, log) }
-
-GParsPool.withPool {
- options.files().eachParallel(fname -> {
- def outName = options.destPath() + "/" + new File(fname).getName()
- def outFile = new File(outName)
- if (outFile.exists()) {
- log.warning("${outName} already exists - skipping")
- } else {
- log.info("Pseudonymizing ${fname} to ${outName}")
- def output_stream = compressorOutputStream(outName)
- def input = new CompressorStreamFactory().createCompressorInputStream(new BufferedInputStream(new FileInputStream(fname)))
-
- input.splitEachLine("\t") { fields ->
- def output_string = ""
- for (int i in 0..(fields.size() - 2)) {
- if (i % 3 < keyMaps.size()) {
- def val = keyMaps[i % 3][fields[i]]
- if (val == null)
- log.severe("`${fields[i]}' not found in dictionary")
- output_stream.print("${val}\t")
- } else {
- output_stream.print("${fields[i]}\t")
- }
- }
- output_stream.println(output_string + fields[fields.size() - 1])
- }
- log.info("Done pseudonymizing ${fname} to ${outName}")
- output_stream.close()
- }
- })
-}
\ No newline at end of file
diff --git a/src/main/groovy/org/ids_mannheim/GeneratePseudonymKey.groovy b/src/main/groovy/org/ids_mannheim/GeneratePseudonymKey.groovy
new file mode 100755
index 0000000..4c5397c
--- /dev/null
+++ b/src/main/groovy/org/ids_mannheim/GeneratePseudonymKey.groovy
@@ -0,0 +1,101 @@
+#!/bin/env groovy
+package org.ids_mannheim
+
+import groovy.cli.Option
+import groovy.cli.Unparsed
+@GrabConfig(systemClassLoader = true)
+@Grab('info.picocli:picocli-groovy:4.6.3')
+@Grab('org.tukaani:xz:1.9')
+@Grab('org.codehaus.gpars:gpars:1.2.1')
+@Grab('org.apache.commons:commons-compress:1.22')
+
+
+
+import groovy.cli.picocli.CliBuilder
+import groovyx.gpars.GParsPool
+import org.apache.commons.compress.compressors.CompressorStreamFactory
+
+import java.util.concurrent.ConcurrentHashMap
+import java.util.concurrent.atomic.LongAdder
+import java.util.logging.Logger
+
+/**
+ * Command line tool that assigns a numeric pseudonym to every distinct value of one
+ * column of tab-separated frequency lists.
+ *
+ * The last field of each input line is parsed as a frequency and summed per key over all files.
+ * The special keys receive the pseudonyms -n..-1 in the order given; all other keys are
+ * numbered from 0 by descending total frequency, ties broken by ascending key.
+ * The resulting "key TAB pseudonym" lines are printed to standard output.
+ */
+class GeneratePseudonymKey {
+    static tag = "GeneratePseudonymKey"
+
+    /** Command line options, parsed by CliBuilder.parseFromSpec. */
+    static interface GeneratePseudonymKeyArgs {
+        @Option(shortName = 'c', defaultValue = "0", description = 'generate pseudonyms for column n')
+        int column()
+
+        @Option(shortName = 's', defaultValue = "«END»,«START»", description = "comma separated special keys (will get pseudonyms -n..-1)")
+        String specialKeys()
+
+        @Option(shortName = 'h')
+        boolean help()
+
+        @Unparsed
+        String[] files()
+    }
+
+    /**
+     * Reads the given files in parallel, accumulates the frequencies and prints the key.
+     *
+     * @param args command line arguments, see GeneratePseudonymKeyArgs
+     */
+    static void main(String[] args) {
+        CliBuilder cli = new CliBuilder(name: "${tag}", width: 120, footer: "Examples:\n" +
+                "generate_pseudonym_key.groovy -c 0 1-gram-token-l-freqs.*.tsv.xz | xz -T0 > token_key.tsv.xz\n" +
+                "generate_pseudonym_key.groovy -c 1 1-gram-token-l-freqs.*.tsv.xz | xz -T0 > lemma_key.tsv.xz\n")
+
+        def options = cli.parseFromSpec(GeneratePseudonymKeyArgs, args)
+
+        if (options.help() || !options.files() || options.files()[0].startsWith('-')) {
+            cli.usage()
+            System.exit(-1)
+        }
+        def specialKeys = options.specialKeys().split(",")
+        // Looked up once here instead of once per input line inside the parallel loop.
+        int column = options.column()
+        def freqList = new ConcurrentHashMap<String, LongAdder>(10000000, 0.75, options.files().size())
+        System.setProperty("java.util.logging.SimpleFormatter.format", '[%1$tF %1$tT]:%4$s: %5$s%n')
+        Logger log = Logger.getLogger("org.ids_mannheim.${tag}")
+        log.info("Generating pseudonym key for column ${options.column()} in ${options.files()}")
+
+        GParsPool.withPool {
+            options.files().eachParallel(fname -> {
+                log.info("Reading ${fname}")
+                def input = new CompressorStreamFactory().createCompressorInputStream(new BufferedInputStream(new FileInputStream(fname)))
+                // Decode explicitly as UTF-8: without a charset argument splitEachLine uses the
+                // platform default, which garbles non-ASCII keys on non-UTF-8 systems.
+                input.splitEachLine("\t", "UTF-8") { fields ->
+                    freqList.computeIfAbsent(fields[column], k -> new LongAdder()).add(Long.parseLong(fields[fields.size() - 1]))
+                }
+                log.info("Done reading ${fname}")
+            })
+        }
+
+        log.info("Sorting and writing...")
+
+        // Special keys are taken out of the frequency list and get the negative pseudonyms.
+        def j = -specialKeys.size()
+        specialKeys.each {
+            freqList.remove(it)
+            println("${it}\t${j++}")
+        }
+
+        def i = 0
+        freqList.entrySet()
+                .parallelStream()
+                .sorted({ a, b -> def ret = b.value <=> a.value; if (ret == 0) a.key <=> b.key else ret })
+                .forEachOrdered({ e -> println "${e.key}\t${i++}" })
+    }
+}
diff --git a/src/main/groovy/org/ids_mannheim/Pseudonymize.groovy b/src/main/groovy/org/ids_mannheim/Pseudonymize.groovy
new file mode 100755
index 0000000..f1c1196
--- /dev/null
+++ b/src/main/groovy/org/ids_mannheim/Pseudonymize.groovy
@@ -0,0 +1,157 @@
+#!/bin/env groovy
+
+package org.ids_mannheim
+
+// @GrabConfig(systemProperties = "groovy.grape.enable=false")
+@GrabConfig(systemClassLoader=true)
+@Grab('org.tukaani:xz:1.9') // should be imported even if not used directly
+@Grab('org.codehaus.gpars:gpars:1.2.1')
+@Grab('org.apache.commons:commons-compress:1.22')
+@Grab('info.picocli:picocli:4.6.3')
+
+import groovyx.gpars.GParsPool
+
+import groovy.cli.Option
+import groovy.cli.Unparsed
+import groovy.cli.picocli.CliBuilder
+
+import org.apache.commons.compress.compressors.CompressorStreamFactory
+
+import java.util.concurrent.ConcurrentHashMap
+import java.util.logging.Logger
+
+/**
+ * Command line tool that replaces the values of tab-separated frequency lists by the
+ * numeric pseudonyms found in key files (as read by readKey).
+ */
+class Pseudonymize {
+
+    static tag = "Pseudonymize"
+
+    /** Command line options, parsed by CliBuilder.parseFromSpec. */
+    static interface pseudonymizeArgs {
+        @Option(shortName = 'k')
+        String[] keys()
+
+        @Option(shortName = 'd', defaultValue = "./")
+        String destPath()
+
+        @Option(shortName = 'h')
+        boolean help()
+
+        @Unparsed()
+        List files()
+    }
+
+    /**
+     * Opens fname for writing, compressing according to its file name extension.
+     *
+     * @param fname path of the file to create
+     * @return a UTF-8 PrintStream; gz, xz, bz2 and bzip extensions are compressed,
+     *         any other extension is written uncompressed
+     */
+    static def compressorOutputStream(fname) {
+        // "bz2" is the usual bzip2 extension; "bzip" is kept for backward compatibility.
+        def compresserTypes = ["gz": CompressorStreamFactory.GZIP, "xz": CompressorStreamFactory.XZ,
+                               "bz2": CompressorStreamFactory.BZIP2, "bzip": CompressorStreamFactory.BZIP2]
+        String extension = fname.substring(fname.lastIndexOf(".") + 1)
+        def type = compresserTypes[extension]
+        OutputStream sink = new BufferedOutputStream(new FileOutputStream(fname))
+        if (type)
+            sink = new BufferedOutputStream(new CompressorStreamFactory().createCompressorOutputStream(type, sink))
+        // Fixed charset so that the output does not depend on the platform default encoding.
+        return new PrintStream(sink, false, "UTF-8")
+    }
+
+    /**
+     * Reads a tab-separated key file into a map from first column to last column.
+     *
+     * @param fname path of the key file, opened through CompressorStreamFactory
+     * @param log logger used for progress messages
+     * @return ConcurrentHashMap from the first field of each line to its last field as Integer
+     */
+    static def readKey(fname, log) {
+        def myMap = new ConcurrentHashMap<String, Integer>(100000000,0.75,1)
+        log.info("Reading key ${fname} ...")
+        def input = new CompressorStreamFactory().createCompressorInputStream(new BufferedInputStream(new FileInputStream(fname)))
+        // Keys may contain non-ASCII tokens, so decode as UTF-8 instead of the platform default.
+        input.splitEachLine("\t", "UTF-8") { fields -> myMap.put(fields[0], Integer.valueOf(fields[fields.size() - 1])) }
+        log.info("Done reading key ${fname}.")
+        return myMap
+    }
+
+    /**
+     * Parses the command line and pseudonymizes the given files.
+     *
+     * @param args command line arguments, see pseudonymizeArgs
+     */
+    static void main(String[] args) {
+        System.setProperty("java.util.logging.SimpleFormatter.format", '[%1$tF %1$tT]:%4$s: %5$s%n')
+        Logger log = Logger.getLogger("")
+
+        CliBuilder cli = new CliBuilder(name: tag, width: 100)
+
+        def options = cli.parseFromSpec(pseudonymizeArgs, args)
+
+        if (options.help() || !options.files() || options.files()[0].startsWith('-')) {
+            cli.usage()
+            System.exit(-1)
+        }
+
+        pseudonymizeFiles(options.files(), options.keys(), options.destPath(), log)
+    }
+
+    /**
+     * Pseudonymizes the given files in parallel and writes the results into destPath.
+     *
+     * In every line, column i is replaced by its entry in key map i % 3 if that key map
+     * exists; other columns and the last column are copied unchanged.
+     * Values missing from a key map are logged as severe and written as "null".
+     * Inputs whose output file already exists are skipped with a warning.
+     *
+     * @param files paths of the input files, opened through CompressorStreamFactory
+     * @param keys paths of the key files, in column order
+     * @param destPath directory receiving one output file per input, with the same name
+     * @param log logger for progress and error messages
+     */
+    static void pseudonymizeFiles(files, keys, destPath, Logger log) {
+        log.info("Pseudonymizing ${keys.size()} columns")
+        def keyMaps = []
+
+        keys.each { fname -> keyMaps << readKey(fname, log) }
+
+        GParsPool.withPool {
+            files.eachParallel(fname -> {
+                def outName = destPath + "/" + new File(fname).getName()
+                def outFile = new File(outName)
+                if (outFile.exists()) {
+                    log.warning("${outName} already exists - skipping")
+                } else {
+                    log.info("Pseudonymizing ${fname} to ${outName}")
+                    // Open the input first so that an unreadable input does not leave an empty
+                    // output file behind, which would be skipped as "already exists" next time.
+                    def input = new CompressorStreamFactory().createCompressorInputStream(new BufferedInputStream(new FileInputStream(fname)))
+                    // withCloseable closes the output even if a line cannot be processed.
+                    compressorOutputStream(outName).withCloseable { output_stream ->
+                        input.splitEachLine("\t", "UTF-8") { fields ->
+                            // Exclusive range: 0..(size - 2) would count downwards (0, -1) for
+                            // single-column lines and index the fields from the end.
+                            for (int i in 0..<(fields.size() - 1)) {
+                                if (i % 3 < keyMaps.size()) {
+                                    def val = keyMaps[i % 3][fields[i]]
+                                    if (val == null)
+                                        log.severe("`${fields[i]}' not found in dictionary")
+                                    output_stream.print("${val}\t")
+                                } else {
+                                    output_stream.print("${fields[i]}\t")
+                                }
+                            }
+                            output_stream.println(fields[fields.size() - 1])
+                        }
+                    }
+                    log.info("Done pseudonymizing ${fname} to ${outName}")
+                }
+            })
+        }
+    }
+}
diff --git a/src/test/java/org/ids_mannheim/PseudonymizeTest.java b/src/test/java/org/ids_mannheim/PseudonymizeTest.java
new file mode 100644
index 0000000..b7aaef1
--- /dev/null
+++ b/src/test/java/org/ids_mannheim/PseudonymizeTest.java
@@ -0,0 +1,86 @@
+package org.ids_mannheim;
+
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.PrintStream;
+import java.nio.file.Path;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.logging.Logger;
+
+/**
+ * Tests for the Groovy command line tools GeneratePseudonymKey and Pseudonymize.
+ * Standard output and standard error are captured for the whole class.
+ */
+public class PseudonymizeTest {
+    private static final String FREQ_LIST = "src/test/resources/token_lemma_pos_freqs.tsv.xz";
+    private static final String TOKEN_KEYS = "src/test/resources/token.keys.gz";
+    private static final String LEMMA_KEYS = "src/test/resources/lemma.keys.gz";
+
+    private static final ByteArrayOutputStream outContent = new ByteArrayOutputStream();
+    private static final ByteArrayOutputStream errContent = new ByteArrayOutputStream();
+    private static final PrintStream originalOut = System.out;
+    private static final PrintStream originalErr = System.err;
+
+    @TempDir
+    static Path sharedTempDir;
+
+    /** Redirects System.out and System.err into in-memory buffers. */
+    @BeforeAll
+    public static void setUpStreams() {
+        System.setOut(new PrintStream(outContent));
+        System.setErr(new PrintStream(errContent));
+    }
+
+    /** Puts the original System.out and System.err back in place. */
+    @AfterAll
+    public static void restoreStreams() {
+        System.setOut(originalOut);
+        System.setErr(originalErr);
+    }
+
+    /** Sets the system property groovy.grape.enable to false. */
+    @BeforeAll
+    public static void disableGrapeGrab() {
+        System.setProperty("groovy.grape.enable", "false");
+    }
+
+    /** The generated key must contain pseudonyms 934 and 936 for the expected tokens, but no 937. */
+    @Test
+    void generateKeyTest() {
+        GeneratePseudonymKey.main(new String[]{FREQ_LIST});
+        String printed = outContent.toString();
+        assertTrue(printed.contains("\ndurchaus\t934\n"));
+        assertTrue(printed.contains("\nlängst\t936\n"));
+        assertFalse(printed.contains("\t937\n"));
+    }
+
+    /** The token key file must map the token "längst" to 936. */
+    @Test
+    void readKeyTest() {
+        Logger rootLogger = Logger.getLogger("");
+        ConcurrentHashMap<?, ?> keyMap = (ConcurrentHashMap<?, ?>) Pseudonymize.readKey(TOKEN_KEYS, rootLogger);
+        assertEquals(936, keyMap.get("längst"));
+    }
+
+    /** The pseudonymized output, read back with readKey, must map "936" to 223997. */
+    @Test
+    void pseudonymizeFilesTest() {
+        Logger rootLogger = Logger.getLogger("");
+        String[] inputs = {FREQ_LIST};
+        String[] keyFiles = {TOKEN_KEYS, LEMMA_KEYS};
+        String target = sharedTempDir + File.separator + "token_lemma_pos_freqs.tsv.xz";
+        new File(target).delete();
+
+        Pseudonymize.pseudonymizeFiles(inputs, keyFiles, sharedTempDir.toAbsolutePath().toString(), rootLogger);
+
+        ConcurrentHashMap<?, ?> pseudonymized = (ConcurrentHashMap<?, ?>) Pseudonymize.readKey(target, rootLogger);
+        assertEquals(223997, pseudonymized.get("936"));
+    }
+}
\ No newline at end of file
diff --git a/src/test/resources/lemma.keys.gz b/src/test/resources/lemma.keys.gz
new file mode 100644
index 0000000..3b366b4
--- /dev/null
+++ b/src/test/resources/lemma.keys.gz
Binary files differ
diff --git a/src/test/resources/token.keys.gz b/src/test/resources/token.keys.gz
new file mode 100644
index 0000000..66c0ea5
--- /dev/null
+++ b/src/test/resources/token.keys.gz
Binary files differ
diff --git a/src/test/resources/token_lemma_pos_freqs.tsv.xz b/src/test/resources/token_lemma_pos_freqs.tsv.xz
new file mode 100644
index 0000000..52ec718
--- /dev/null
+++ b/src/test/resources/token_lemma_pos_freqs.tsv.xz
Binary files differ