Efficient folded ngram frequency adder
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d69402e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+.*
+!/.gitignore
+target
+tmp
+logs
+cache_store
+*.iml
+/bin/
+*.log
+i5validation.json
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..97ca101
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,154 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>groupId</groupId>
+    <artifactId>nGrammFoldCount</artifactId>
+    <version>1.0-SNAPSHOT</version>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <maven.compiler.release>11</maven.compiler.release>
+        <jackson.version>[2.10.3,)</jackson.version>
+    </properties>
+
+    <build>
+        <resources>
+            <resource>
+                <directory>src/main/xml</directory>
+                <includes>
+                    <include>**/*.xml</include>
+                </includes>
+            </resource>
+        </resources>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-javadoc-plugin</artifactId>
+                <version>3.1.1</version>
+                <configuration>
+                    <failOnError>false</failOnError>
+                    <source>${maven.compiler.release}</source>
+                    <javadocExecutable>${java.home}/bin/javadoc
+                    </javadocExecutable>
+                </configuration>
+                <executions>
+                    <execution>
+                        <id>attach-javadocs</id>
+                        <goals>
+                            <goal>javadoc-no-fork</goal>
+                            <goal>jar</goal>
+                        </goals>
+                        <configuration>
+                            <failOnError>false</failOnError>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-jar-plugin</artifactId>
+                <version>3.2.0</version>
+                <configuration>
+                    <archive>
+                        <index>true</index>
+                        <manifest>
+                            <addClasspath>true</addClasspath>
+                            <classpathPrefix>dependency/</classpathPrefix>
+                            <mainClass>org.ids_mannheim.TotalNGram</mainClass>
+                        </manifest>
+                    </archive>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>3.8.1</version>
+                <configuration>
+                    <!-- or whatever version you use -->
+                    <release>${maven.compiler.release}</release>
+                    <source>${maven.compiler.release}</source>
+                    <target>${maven.compiler.release}</target>
+                    <showDeprecation>true</showDeprecation>
+                    <annotationProcessorPaths>
+                        <path>
+                            <groupId>info.picocli</groupId>
+                            <artifactId>picocli-codegen</artifactId>
+                            <version>4.2.0</version>
+                        </path>
+                    </annotationProcessorPaths>
+                    <compilerArgs>
+                        <arg>-Aproject=${project.groupId}/${project.artifactId}
+                        </arg>
+                    </compilerArgs>
+                </configuration>
+                <executions>
+                    <!-- Replacing default-compile as it is treated specially by maven -->
+                    <execution>
+                        <id>default-compile</id>
+                        <phase>none</phase>
+                    </execution>
+                    <!-- Replacing default-testCompile as it is treated specially by
+                      maven -->
+                    <execution>
+                        <id>default-testCompile</id>
+                        <phase>none</phase>
+                    </execution>
+                    <execution>
+                        <id>java-compile</id>
+                        <phase>compile</phase>
+                        <goals>
+                            <goal>compile</goal>
+                        </goals>
+                    </execution>
+                    <execution>
+                        <id>java-test-compile</id>
+                        <phase>test-compile</phase>
+                        <goals>
+                            <goal>testCompile</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <artifactId>maven-assembly-plugin</artifactId>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>single</goal>
+                        </goals>
+                    </execution>
+                </executions>
+                <configuration>
+                    <archive>
+                        <index>true</index>
+                        <manifest>
+                            <addClasspath>true</addClasspath>
+                            <classpathPrefix>dependency/</classpathPrefix>
+                            <mainClass>org.ids_mannheim.TotalNGram</mainClass>
+                        </manifest>
+                    </archive>
+                    <descriptorRefs>
+                        <descriptorRef>jar-with-dependencies</descriptorRef>
+                    </descriptorRefs>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+    <dependencies>
+        <!-- https://mvnrepository.com/artifact/gnu.getopt/java-getopt -->
+        <dependency>
+            <groupId>info.picocli</groupId>
+            <artifactId>picocli</artifactId>
+            <version>4.2.0</version>
+        </dependency>
+        <dependency>
+            <groupId>com.vdurmont</groupId>
+            <artifactId>etaprinter</artifactId>
+            <version>2.0.0</version>
+        </dependency>
+    </dependencies>
+</project>
\ No newline at end of file
diff --git a/src/main/java/org/ids_mannheim/FoldedEntry.java b/src/main/java/org/ids_mannheim/FoldedEntry.java
new file mode 100644
index 0000000..ee105c6
--- /dev/null
+++ b/src/main/java/org/ids_mannheim/FoldedEntry.java
@@ -0,0 +1,46 @@
+package org.ids_mannheim;
+
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.atomic.AtomicLongArray;
+import java.util.stream.IntStream;
+
+public class FoldedEntry implements Comparable<FoldedEntry> {
+    static int FOLDS = 10;
+    final AtomicLongArray count;
+
+    public static void setFolds(int folds) {
+        FOLDS = folds;
+    }
+
+    public FoldedEntry() {
+        count = new AtomicLongArray(FOLDS + 1);
+    }
+
+    @Override
+    public int compareTo(FoldedEntry foldedEntry) {
+        if (foldedEntry == null) {
+            return -1;
+        } else {
+            return Long.compare(count.get(0), foldedEntry.count.get(0));
+        }
+    }
+
+    public static void incr(ConcurrentHashMap<String, FoldedEntry> map, String ngram, int fold) {
+        map.compute(ngram, (key, value) -> {
+            if (value == null) {
+                value = new FoldedEntry();
+            }
+            value.count.incrementAndGet(fold);
+            value.count.incrementAndGet(0);
+            return value;
+        });
+    }
+
+    @Override
+    public String toString() {
+        StringBuilder b = new StringBuilder();
+        IntStream.rangeClosed(1, FOLDS).forEach(i -> b.append("\t").append(count.get(i)));
+        b.append("\t").append(count.get(0));
+        return b.toString();
+    }
+}
diff --git a/src/main/java/org/ids_mannheim/TotalNGram.java b/src/main/java/org/ids_mannheim/TotalNGram.java
new file mode 100644
index 0000000..6db5308
--- /dev/null
+++ b/src/main/java/org/ids_mannheim/TotalNGram.java
@@ -0,0 +1,82 @@
+package org.ids_mannheim;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.concurrent.*;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import com.vdurmont.etaprinter.ETAPrinter;
+import picocli.CommandLine;
+
+@CommandLine.Command(mixinStandardHelpOptions = true,
+        name = "totalngram", description = "add ngram counts from KorAP-XML or CoNLL-U files")
+
+public class TotalNGram implements Callable<Integer> {
+    private static final int MAX_THREADS = Runtime.getRuntime().availableProcessors() * 2 / 3;
+    private ETAPrinter etaPrinter;
+
+    @CommandLine.Option(names = {"-L",
+            "--log-file"}, defaultValue = "sum.log", description = "log file name (default: ${DEFAULT-VALUE})")
+    String logFileName;
+
+    @SuppressWarnings("CanBeFinal")
+    @CommandLine.Option(names = {"-o",
+            "--output-file"}, description = "Output file (default: -")
+    File output_file = null;
+
+    @SuppressWarnings("CanBeFinal")
+    @CommandLine.Option(names = {"-P",
+            "--max-procs"}, description = "Run up to max-procs processes at a time (default: ${DEFAULT-VALUE})")
+    int max_threads = MAX_THREADS;
+
+    @SuppressWarnings("CanBeFinal")
+    @CommandLine.Option(names = {"-f",
+            "--folds"}, description = "number of folds (default: ${DEFAULT-VALUE})")
+    int FOLDS = 10;
+
+    @CommandLine.Parameters(arity = "1..*", description = "input files")
+    private final ArrayList<String> inputFiles = new ArrayList<>();
+
+    public TotalNGram() {
+    }
+
+    @Override
+    public Integer call() throws Exception {
+        PrintStream output_stream = (output_file == null || output_file.equals("-") ? System.out : new PrintStream(output_file));
+
+        FoldedEntry.setFolds(FOLDS);
+        ConcurrentHashMap<String, FoldedEntry> map = new ConcurrentHashMap<>();
+        long totalFilesSizes = inputFiles.parallelStream().mapToLong(fname -> new File(fname).length()).sum();
+        etaPrinter = ETAPrinter.init("sum", totalFilesSizes, System.err, false);
+        BlockingQueue<Integer> queue = new LinkedBlockingQueue<>(inputFiles.size());
+        ExecutorService es = Executors.newCachedThreadPool();
+        int threads = Math.min(max_threads, inputFiles.size());
+        IntStream.range(0, threads).forEach(unused -> es.execute(new Worker(queue, inputFiles, FOLDS, map, etaPrinter)));
+        queue.addAll(IntStream.range(0, inputFiles.size()).boxed().collect(Collectors.toList()));
+        IntStream.range(0, threads).forEach(unused -> {
+            try {
+                queue.put(-1);
+            } catch (InterruptedException e) {
+                e.printStackTrace();
+            }
+        });
+        es.shutdown();
+        boolean finished = es.awaitTermination(120, TimeUnit.HOURS);
+        map.entrySet().stream()
+                .sorted(Collections.reverseOrder(new ValueThenKeyComparator<>()))
+                .forEach(entry -> output_stream.println(entry.getKey() + entry.getValue().toString()));
+        IntStream.rangeClosed(1, FOLDS)
+                .forEach(i -> output_stream.print("\t" + map.values()
+                        .parallelStream().mapToLong(e -> e.count.get(i)).sum()));
+        output_stream.println("\t" + map.values().parallelStream().mapToLong(e -> e.count.get(0)).sum());
+        return null;
+    }
+
+    public static void main(String[] args) throws FileNotFoundException {
+        System.exit(new CommandLine(new TotalNGram()).execute(args));
+    }
+}
diff --git a/src/main/java/org/ids_mannheim/ValueThenKeyComparator.java b/src/main/java/org/ids_mannheim/ValueThenKeyComparator.java
new file mode 100644
index 0000000..1623462
--- /dev/null
+++ b/src/main/java/org/ids_mannheim/ValueThenKeyComparator.java
@@ -0,0 +1,19 @@
+package org.ids_mannheim;
+
+import java.util.Comparator;
+import java.util.Map;
+
+public class ValueThenKeyComparator<K extends Comparable<? super K>,
+        V extends Comparable<? super V>>
+        implements Comparator<Map.Entry<K, V>> {
+
+    public int compare(Map.Entry<K, V> a, Map.Entry<K, V> b) {
+        int cmp1 = a.getValue().compareTo(b.getValue());
+        if (cmp1 != 0) {
+            return cmp1;
+        } else {
+            return a.getKey().compareTo(b.getKey());
+        }
+    }
+
+}
\ No newline at end of file
diff --git a/src/main/java/org/ids_mannheim/Worker.java b/src/main/java/org/ids_mannheim/Worker.java
new file mode 100644
index 0000000..0619233
--- /dev/null
+++ b/src/main/java/org/ids_mannheim/Worker.java
@@ -0,0 +1,69 @@
+package org.ids_mannheim;
+
+import com.vdurmont.etaprinter.ETAPrinter;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class Worker implements Runnable {
+    private static final Pattern new_text_pattern = Pattern.compile("^#\\s+text_id\\s*=\\s*(.+)");
+    private final ArrayList<String> fnames;
+    private final BlockingQueue<Integer> queue;
+    private final ConcurrentHashMap<String, FoldedEntry> map;
+    private final int folds;
+    private final ETAPrinter etaPrinter;
+
+    public Worker(BlockingQueue<Integer> queue, ArrayList<String> fnames, int folds, ConcurrentHashMap<String, FoldedEntry> map, ETAPrinter etaPrinter) {
+        this.queue = queue;
+        this.fnames = fnames;
+        this.map = map;
+        this.folds = folds;
+        this.etaPrinter = etaPrinter;
+    }
+
+    @Override
+    public void run() {
+        try {
+            int index = queue.take();
+            while (index >= 0) {
+                String fname = fnames.get(index);
+                long file_size = new File(fname).length();
+                System.err.println(Thread.currentThread().getName() + " - processing: " + fname);
+                String[] cmd = {
+                        "/bin/sh",
+                        "-c",
+                        "/usr/local/kl/bin/korapxml2conllu " + fname
+                };
+                Process p = Runtime.getRuntime().exec(cmd);
+                BufferedReader in = new BufferedReader(new InputStreamReader(p.getInputStream()));
+                String line;
+                int fold = -1;
+                while ((line = in.readLine()) != null) {
+                    if (line.startsWith("#")) {
+                        Matcher matcher = new_text_pattern.matcher(line);
+                        if (matcher.find()) {
+                            fold = Math.abs(matcher.group(1).hashCode()) % folds +1;
+                        }
+                        continue;
+                    }
+                    String[] strings = line.split("\\s+");
+                    if (strings.length < 4) {
+                        continue;
+                    }
+                    FoldedEntry.incr(map, strings[1], fold);
+                }
+                etaPrinter.update(file_size);
+                index = queue.take();
+            }
+        } catch (InterruptedException | IOException e) {
+            Thread.currentThread().interrupt();
+        }
+    }
+}