totalngrams: for .(freq|tsv)(.gz)? input files automatically cumulate frequencies
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..ffa68d0
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,4 @@
+# Changelog
+
+## [1.9-SNAPSHOT] - 2020-11-25
+ - for `.*\.(freq|tsv)(\.gz)?` input files automatically cumulate frequencies
diff --git a/pom.xml b/pom.xml
index 517804f..e33b4b2 100644
--- a/pom.xml
+++ b/pom.xml
@@ -6,12 +6,11 @@
<groupId>groupId</groupId>
<artifactId>nGrammFoldCount</artifactId>
- <version>1.8-SNAPSHOT</version>
+ <version>1.9-SNAPSHOT</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.release>11</maven.compiler.release>
- <jackson.version>[2.10.3,)</jackson.version>
</properties>
<build>
diff --git a/src/main/java/org/ids_mannheim/FoldedEntry.java b/src/main/java/org/ids_mannheim/FoldedEntry.java
index 5a43746..a29714b 100644
--- a/src/main/java/org/ids_mannheim/FoldedEntry.java
+++ b/src/main/java/org/ids_mannheim/FoldedEntry.java
@@ -36,6 +36,15 @@
});
}
+ // Adds `add` to the counter stored under `ngram` in `map`;
+ // a key that is not present yet gets a fresh zero-initialised counter first.
+ public static void add(ConcurrentHashMap<String, AtomicInteger> map, String ngram, int add) {
+ map.compute(ngram, (key, counter) -> {
+ AtomicInteger target = (counter != null) ? counter : new AtomicInteger();
+ target.addAndGet(add); // the update is performed inside compute()
+ return target;
+ });
+ }
public static void incr(ConcurrentHashMap<String, FoldedEntry> map, String ngram, int fold) {
map.compute(ngram, (key, value) -> {
if (value == null) {
diff --git a/src/main/java/org/ids_mannheim/Worker.java b/src/main/java/org/ids_mannheim/Worker.java
index 3e89faa..f342eef 100644
--- a/src/main/java/org/ids_mannheim/Worker.java
+++ b/src/main/java/org/ids_mannheim/Worker.java
@@ -49,6 +49,7 @@
try {
int index = queue.take();
int retries = MAX_RETRIES;
+ int texts = 0;
SlidingWindowQueue slidingWindowQueue = new SlidingWindowQueue(ngram_size, s -> FoldedEntry.incr(map, s));
while (index >= 0) {
String fname = fnames.get(index);
@@ -56,6 +57,7 @@
long file_size = current_file.length();
int poolIndex = 0;
BufferedReader in = null;
+ logger.info(String.format("Processing %d/%d %s %s", index, fnames.size(), pool.getHost(poolIndex), current_file.getName()));
if (fname.matches(".*\\.conllu\\.gz$")) {
in = new BufferedReader(new InputStreamReader(new ParallelGZIPInputStream(new FileInputStream(fname))));
} else if (fname.matches(".*\\.conllu?$")) {
@@ -69,14 +71,28 @@
};
Process p = Runtime.getRuntime().exec(cmd);
in = new BufferedReader(new InputStreamReader(p.getInputStream()));
+ } else if (fname.matches(".*\\.(freq|tsv)(\\.gz)?$")) { // frequency list: "<ngram>\t<count>" per line, optionally gzipped
+ in = new BufferedReader(new InputStreamReader(
+ (fname.endsWith(".gz") ? // String.matches() must match the whole name, so "\\.gz$" was never true
+ new ParallelGZIPInputStream(new FileInputStream(fname)) :
+ new FileInputStream(fname))));
+ String line;
+ while((line = in.readLine()) != null) {
+ int tabPos = line.lastIndexOf('\t'); // the count is the field after the last tab
+ if (tabPos >= 0) {
+ FoldedEntry.add(map, line.substring(0, tabPos), Integer.parseInt(line.substring(tabPos+1)));
+ } else {
+ logger.severe(new IllegalArgumentException("Cannot interpret tsv line: "+line).toString());
+ System.exit(-1);
+ }
+ }
+ texts++; // incremented once per frequency file
} else {
logger.severe(new IllegalArgumentException("Cannot guess file type for "+fname).toString());
System.exit(-1);
}
- logger.info(String.format("Processing %d/%d %s %s", index, fnames.size(), pool.getHost(poolIndex), current_file.getName()));
String line;
int fold = -1;
- int texts = 0;
while ((line = in.readLine()) != null) {
if (line.startsWith("#")) {
Matcher matcher = new_text_pattern.matcher(line);