Add option -N to sort keys with the same frequency numerically
Change-Id: I70c4b8c6c5b7f9f6e705716a8d46fd3a23e5de6b
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ffa68d0..9f93351 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,5 @@
# Changelog
-## [1.9-SNAPSHOT] - 2020-11-25
+## [1.9-SNAPSHOT] - 2021-09-15
- for `.*\\.(freq|tsv)(\\.gz)?` input files automatically cumulate frequencies
+- `-N` option added to sort keys with the same frequency numerically
\ No newline at end of file
diff --git a/src/main/java/org/ids_mannheim/TotalNGrams.java b/src/main/java/org/ids_mannheim/TotalNGrams.java
index 952998a..86932d4 100644
--- a/src/main/java/org/ids_mannheim/TotalNGrams.java
+++ b/src/main/java/org/ids_mannheim/TotalNGrams.java
@@ -2,13 +2,13 @@
import org.anarres.parallelgzip.ParallelGZIPOutputStream;
import org.apache.commons.compress.compressors.xz.XZCompressorOutputStream;
-import org.tukaani.xz.XZOutputStream;
import picocli.CommandLine;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.PrintStream;
+import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Locale;
import java.util.concurrent.*;
@@ -87,6 +87,11 @@
"--sort"}, description = "Toggle output sorting (default: ${DEFAULT-VALUE})")
boolean sort = true;
+ @SuppressWarnings("CanBeFinal")
+ @CommandLine.Option(names = {"-N",
+ "--numeric-secondary-sort"}, description = "Sort entries with same frequency numerically (default: ${DEFAULT-VALUE})")
+ boolean numericSecondarySort = false;
+
private Progressbar etaPrinter;
public TotalNGrams() {
@@ -187,7 +192,10 @@
if (cmp1 != 0) {
return cmp1;
} else {
- return a.getKey().compareTo(b.getKey());
+ if (numericSecondarySort)
+ return new BigInteger(a.getKey()).compareTo(new BigInteger(b.getKey()));
+ else
+ return a.getKey().compareTo(b.getKey());
}
})
.forEachOrdered(entry -> output_stream.println(entry.getKey() + "\t" + Integer.toUnsignedString(entry.getValue().get())));