Use cryptographic BLAKE2b hash as deterministic fold random source
The previous String.hashCode()-based fold function was, in effect, partially
dependent on the order of the texts.
diff --git a/pom.xml b/pom.xml
index 54b31bf..beda604 100644
--- a/pom.xml
+++ b/pom.xml
@@ -6,7 +6,7 @@
<groupId>groupId</groupId>
<artifactId>nGrammFoldCount</artifactId>
- <version>1.3</version>
+ <version>1.5</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
@@ -162,5 +162,10 @@
<version>RELEASE</version>
<scope>test</scope>
</dependency>
+ <dependency>
+ <groupId>org.bouncycastle</groupId>
+ <artifactId>bcprov-jdk15on</artifactId>
+ <version>1.66</version>
+ </dependency>
</dependencies>
</project>
\ No newline at end of file
diff --git a/src/main/java/org/ids_mannheim/DeterministicRandomProvider.java b/src/main/java/org/ids_mannheim/DeterministicRandomProvider.java
new file mode 100644
index 0000000..2f51f2c
--- /dev/null
+++ b/src/main/java/org/ids_mannheim/DeterministicRandomProvider.java
@@ -0,0 +1,51 @@
+package org.ids_mannheim;
+
+import java.nio.charset.StandardCharsets;
+
+import org.bouncycastle.crypto.digests.Blake2bDigest;
+
+/**
+ * Maps text IDs to fold numbers by hashing the ID with BLAKE2b.
+ *
+ * <p>The same ID always yields the same fold for a given {@code max_values}.
+ * Instances reuse one digest and one output buffer across calls and are
+ * therefore not safe for concurrent use; give each thread its own instance.
+ *
+ * <p>Only a single digest byte is produced, so at most 256 distinct folds can
+ * occur, and folds are only evenly likely when {@code max_values} divides 256.
+ */
+public class DeterministicRandomProvider {
+    // Unkeyed, unsalted BLAKE2b with a digest length of one byte.
+    private final Blake2bDigest b2bd = new Blake2bDigest(null, 1, null, null);
+    private final byte[] out_bytes = new byte[4];
+    private final int max_values;
+
+    /**
+     * @param max_values number of folds; results lie in {@code [0, max_values)}
+     * @throws IllegalArgumentException if {@code max_values} is not positive
+     */
+    public DeterministicRandomProvider(int max_values) {
+        if (max_values <= 0) {
+            throw new IllegalArgumentException("max_values must be positive: " + max_values);
+        }
+        this.max_values = max_values;
+    }
+
+    /**
+     * Returns the fold assigned to the given text ID.
+     *
+     * @param id text ID to hash
+     * @return zero-based fold in {@code [0, max_values)}
+     */
+    public int getFoldFromTextID(String id) {
+        // Encode explicitly as UTF-8 and pass the byte count, not id.length():
+        // the char count can differ from the byte count for non-ASCII IDs, and
+        // the platform default charset would make folds machine-dependent.
+        byte[] idBytes = id.getBytes(StandardCharsets.UTF_8);
+        b2bd.update(idBytes, 0, idBytes.length);
+        // doFinal writes the one-byte digest and resets b2bd for the next call.
+        b2bd.doFinal(out_bytes, 0);
+        return Byte.toUnsignedInt(out_bytes[0]) % max_values;
+    }
+}
+
diff --git a/src/main/java/org/ids_mannheim/Utils.java b/src/main/java/org/ids_mannheim/Utils.java
index 6af95f0..966236a 100644
--- a/src/main/java/org/ids_mannheim/Utils.java
+++ b/src/main/java/org/ids_mannheim/Utils.java
@@ -2,7 +2,6 @@
import java.io.File;
import java.io.IOException;
-import java.nio.channels.FileChannel;
import java.nio.file.AccessDeniedException;
import java.nio.file.FileAlreadyExistsException;
import java.nio.file.Files;
@@ -38,9 +37,5 @@
}
return f;
}
-
- public static int getFoldFromTextID(String id, int max_fold) {
- return Math.abs(id.hashCode() % max_fold);
- }
}
diff --git a/src/main/java/org/ids_mannheim/Worker.java b/src/main/java/org/ids_mannheim/Worker.java
index 238e004..18a14bd 100644
--- a/src/main/java/org/ids_mannheim/Worker.java
+++ b/src/main/java/org/ids_mannheim/Worker.java
@@ -20,13 +20,13 @@
private final ArrayList<String> fnames;
private final BlockingQueue<Integer> queue;
private final ConcurrentHashMap<String, AtomicInteger> map;
- private final int folds;
private final Progressbar etaPrinter;
private final int ngram_size;
private final int target_fold;
private final Logger logger;
private final WorkerNodePool pool;
private final boolean with_lemma_and_pos;
+ private final DeterministicRandomProvider deterministicRandomProvider;
public Worker(BlockingQueue<Integer> queue, ArrayList<String> fnames, int ngram_size, int target_fold, int folds,
ConcurrentHashMap<String, AtomicInteger> map,
@@ -36,12 +36,12 @@
this.fnames = fnames;
this.map = map;
this.ngram_size = ngram_size;
- this.folds = folds;
this.target_fold = target_fold;
this.with_lemma_and_pos = with_lemma_and_pos;
this.pool = pool;
this.etaPrinter = etaPrinter;
this.logger = logger;
+ this.deterministicRandomProvider = new DeterministicRandomProvider(folds);
}
@Override
@@ -81,9 +81,9 @@
if (line.startsWith("#")) {
Matcher matcher = new_text_pattern.matcher(line);
if (matcher.find()) {
- fold = Utils.getFoldFromTextID(matcher.group(1), folds + 1);
+ fold = deterministicRandomProvider.getFoldFromTextID(matcher.group(1)) + 1;
texts++;
- if(fold == target_fold) {
+ if (fold == target_fold) {
slidingWindowQueue.reset(fold);
}
}
diff --git a/src/test/java/org/ids_mannheim/UtilsTest.java b/src/test/java/org/ids_mannheim/UtilsTest.java
index 02b9c04..a93db3c 100644
--- a/src/test/java/org/ids_mannheim/UtilsTest.java
+++ b/src/test/java/org/ids_mannheim/UtilsTest.java
@@ -12,11 +12,12 @@
@Test
void randomFoldIsDeterministic() {
- assertEquals(404783, Utils.getFoldFromTextID("RPO05_JAN.00001", 1000000));
- assertEquals(404782, Utils.getFoldFromTextID("RPO05_JAN.00002", 1000000));
- assertEquals(404781, Utils.getFoldFromTextID("RPO05_JAN.00003", 1000000));
- assertEquals(404753, Utils.getFoldFromTextID("RPO05_JAN.00010", 1000000));
- assertEquals(936451, Utils.getFoldFromTextID("RPO05_JUN.00001", 1000000));
- assertEquals(936450, Utils.getFoldFromTextID("RPO05_JUN.00002", 1000000));
+ DeterministicRandomProvider drp = new DeterministicRandomProvider(100);
+ assertEquals(89, drp.getFoldFromTextID("RPO05_JAN.00001"));
+ assertEquals(47, drp.getFoldFromTextID("RPO05_JAN.00002"));
+ assertEquals(0, drp.getFoldFromTextID("RPO05_JAN.00003"));
+ assertEquals(91, drp.getFoldFromTextID("RPO05_JAN.00010"));
+ assertEquals(53, drp.getFoldFromTextID("RPO05_JUN.00001"));
+ assertEquals(94, drp.getFoldFromTextID("RPO05_JUN.00002"));
}
}
\ No newline at end of file