Use unsigned integers instead of longs for counts
For German this should be OK for corpora of up to 80 billion words, and it saves
memory.
diff --git a/src/main/java/org/ids_mannheim/FoldedEntry.java b/src/main/java/org/ids_mannheim/FoldedEntry.java
index ee105c6..028561a 100644
--- a/src/main/java/org/ids_mannheim/FoldedEntry.java
+++ b/src/main/java/org/ids_mannheim/FoldedEntry.java
@@ -1,19 +1,19 @@
package org.ids_mannheim;
import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.atomic.AtomicLongArray;
+import java.util.concurrent.atomic.AtomicIntegerArray;
import java.util.stream.IntStream;
public class FoldedEntry implements Comparable<FoldedEntry> {
static int FOLDS = 10;
- final AtomicLongArray count;
+ final AtomicIntegerArray count;
public static void setFolds(int folds) {
FOLDS = folds;
}
public FoldedEntry() {
- count = new AtomicLongArray(FOLDS + 1);
+ count = new AtomicIntegerArray(FOLDS + 1);
}
@Override
@@ -21,7 +21,7 @@
if (foldedEntry == null) {
return -1;
} else {
- return Long.compare(count.get(0), foldedEntry.count.get(0));
+ return Integer.compareUnsigned(count.get(0), foldedEntry.count.get(0));
}
}
@@ -39,7 +39,8 @@
@Override
public String toString() {
StringBuilder b = new StringBuilder();
- IntStream.rangeClosed(1, FOLDS).forEach(i -> b.append("\t").append(count.get(i)));
- b.append("\t").append(count.get(0));
+ // Counts are unsigned 32-bit values held in ints: format every column, slot 0 included, as unsigned so values above Integer.MAX_VALUE do not print as negatives.
+ IntStream.rangeClosed(1, FOLDS).forEach(i -> b.append("\t").append(Integer.toUnsignedString(count.get(i))));
+ b.append("\t").append(Integer.toUnsignedString(count.get(0)));
return b.toString();
}
diff --git a/src/main/java/org/ids_mannheim/TotalNGram.java b/src/main/java/org/ids_mannheim/TotalNGram.java
index 3d4149c..47c4ec1 100644
--- a/src/main/java/org/ids_mannheim/TotalNGram.java
+++ b/src/main/java/org/ids_mannheim/TotalNGram.java
@@ -138,9 +138,10 @@
logger.info("Calculating column sums.");
System.err.println("Calculating column sums.");
IntStream.rangeClosed(1, FOLDS)
- .forEach(i -> output_stream.print("\t" + map.values()
- .parallelStream().mapToLong(e -> e.count.get(i)).sum()));
- output_stream.println("\t" + map.values().parallelStream().mapToLong(e -> e.count.get(0)).sum());
+ .parallel()
+ .forEachOrdered(i -> output_stream.print("\t" + map.values()
+ .parallelStream().mapToLong(e -> Integer.toUnsignedLong(e.count.get(i))).sum()));
+ output_stream.println("\t" + map.values().parallelStream().mapToLong(e -> Integer.toUnsignedLong(e.count.get(0))).sum());
return null;
}
}