Efficient folded ngram frequency adder
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d69402e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+.*
+!/.gitignore
+target
+tmp
+logs
+cache_store
+*.iml
+/bin/
+*.log
+i5validation.json
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..97ca101
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,154 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <groupId>groupId</groupId>
+ <artifactId>nGrammFoldCount</artifactId>
+ <version>1.0-SNAPSHOT</version>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ <maven.compiler.release>11</maven.compiler.release>
+ <jackson.version>[2.10.3,)</jackson.version>
+ </properties>
+
+ <build>
+ <resources>
+ <resource>
+ <directory>src/main/xml</directory>
+ <includes>
+ <include>**/*.xml</include>
+ </includes>
+ </resource>
+ </resources>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-javadoc-plugin</artifactId>
+ <version>3.1.1</version>
+ <configuration>
+ <failOnError>false</failOnError>
+ <source>${maven.compiler.release}</source>
+ <javadocExecutable>${java.home}/bin/javadoc
+ </javadocExecutable>
+ </configuration>
+ <executions>
+ <execution>
+ <id>attach-javadocs</id>
+ <goals>
+ <goal>javadoc-no-fork</goal>
+ <goal>jar</goal>
+ </goals>
+ <configuration>
+ <failOnError>false</failOnError>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <version>3.2.0</version>
+ <configuration>
+ <archive>
+ <index>true</index>
+ <manifest>
+ <addClasspath>true</addClasspath>
+ <classpathPrefix>dependency/</classpathPrefix>
+ <mainClass>org.ids_mannheim.TotalNGram</mainClass>
+ </manifest>
+ </archive>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>3.8.1</version>
+ <configuration>
+ <!-- or whatever version you use -->
+ <release>${maven.compiler.release}</release>
+ <source>${maven.compiler.release}</source>
+ <target>${maven.compiler.release}</target>
+ <showDeprecation>true</showDeprecation>
+ <annotationProcessorPaths>
+ <path>
+ <groupId>info.picocli</groupId>
+ <artifactId>picocli-codegen</artifactId>
+ <version>4.2.0</version>
+ </path>
+ </annotationProcessorPaths>
+ <compilerArgs>
+ <arg>-Aproject=${project.groupId}/${project.artifactId}
+ </arg>
+ </compilerArgs>
+ </configuration>
+ <executions>
+ <!-- Replacing default-compile as it is treated specially by maven -->
+ <execution>
+ <id>default-compile</id>
+ <phase>none</phase>
+ </execution>
+ <!-- Replacing default-testCompile as it is treated specially by
+ maven -->
+ <execution>
+ <id>default-testCompile</id>
+ <phase>none</phase>
+ </execution>
+ <execution>
+ <id>java-compile</id>
+ <phase>compile</phase>
+ <goals>
+ <goal>compile</goal>
+ </goals>
+ </execution>
+ <execution>
+ <id>java-test-compile</id>
+ <phase>test-compile</phase>
+ <goals>
+ <goal>testCompile</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <executions>
+ <execution>
+ <phase>package</phase>
+ <goals>
+ <goal>single</goal>
+ </goals>
+ </execution>
+ </executions>
+ <configuration>
+ <archive>
+ <index>true</index>
+ <manifest>
+ <addClasspath>true</addClasspath>
+ <classpathPrefix>dependency/</classpathPrefix>
+ <mainClass>org.ids_mannheim.TotalNGram</mainClass>
+ </manifest>
+ </archive>
+ <descriptorRefs>
+ <descriptorRef>jar-with-dependencies</descriptorRef>
+ </descriptorRefs>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+ <dependencies>
+ <!-- https://mvnrepository.com/artifact/gnu.getopt/java-getopt -->
+ <dependency>
+ <groupId>info.picocli</groupId>
+ <artifactId>picocli</artifactId>
+ <version>4.2.0</version>
+ </dependency>
+ <dependency>
+ <groupId>com.vdurmont</groupId>
+ <artifactId>etaprinter</artifactId>
+ <version>2.0.0</version>
+ </dependency>
+ </dependencies>
+</project>
\ No newline at end of file
diff --git a/src/main/java/org/ids_mannheim/FoldedEntry.java b/src/main/java/org/ids_mannheim/FoldedEntry.java
new file mode 100644
index 0000000..ee105c6
--- /dev/null
+++ b/src/main/java/org/ids_mannheim/FoldedEntry.java
@@ -0,0 +1,46 @@
+package org.ids_mannheim;
+
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.atomic.AtomicLongArray;
+import java.util.stream.IntStream;
+
+public class FoldedEntry implements Comparable<FoldedEntry> {
+ static int FOLDS = 10;
+ final AtomicLongArray count;
+
+ public static void setFolds(int folds) {
+ FOLDS = folds;
+ }
+
+ public FoldedEntry() {
+ count = new AtomicLongArray(FOLDS + 1);
+ }
+
+ @Override
+ public int compareTo(FoldedEntry foldedEntry) {
+ if (foldedEntry == null) {
+ return -1;
+ } else {
+ return Long.compare(count.get(0), foldedEntry.count.get(0));
+ }
+ }
+
+ public static void incr(ConcurrentHashMap<String, FoldedEntry> map, String ngram, int fold) {
+ map.compute(ngram, (key, value) -> {
+ if (value == null) {
+ value = new FoldedEntry();
+ }
+ value.count.incrementAndGet(fold);
+ value.count.incrementAndGet(0);
+ return value;
+ });
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder b = new StringBuilder();
+ IntStream.rangeClosed(1, FOLDS).forEach(i -> b.append("\t").append(count.get(i)));
+ b.append("\t").append(count.get(0));
+ return b.toString();
+ }
+}
diff --git a/src/main/java/org/ids_mannheim/TotalNGram.java b/src/main/java/org/ids_mannheim/TotalNGram.java
new file mode 100644
index 0000000..6db5308
--- /dev/null
+++ b/src/main/java/org/ids_mannheim/TotalNGram.java
@@ -0,0 +1,82 @@
+package org.ids_mannheim;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.concurrent.*;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import com.vdurmont.etaprinter.ETAPrinter;
+import picocli.CommandLine;
+
+@CommandLine.Command(mixinStandardHelpOptions = true,
+ name = "totalngram", description = "add ngram counts from KorAP-XML or CoNLL-U files")
+
+public class TotalNGram implements Callable<Integer> {
+ private static final int MAX_THREADS = Runtime.getRuntime().availableProcessors() * 2 / 3;
+ private ETAPrinter etaPrinter;
+
+ @CommandLine.Option(names = {"-L",
+ "--log-file"}, defaultValue = "sum.log", description = "log file name (default: ${DEFAULT-VALUE})")
+ String logFileName;
+
+ @SuppressWarnings("CanBeFinal")
+ @CommandLine.Option(names = {"-o",
+ "--output-file"}, description = "Output file (default: -")
+ File output_file = null;
+
+ @SuppressWarnings("CanBeFinal")
+ @CommandLine.Option(names = {"-P",
+ "--max-procs"}, description = "Run up to max-procs processes at a time (default: ${DEFAULT-VALUE})")
+ int max_threads = MAX_THREADS;
+
+ @SuppressWarnings("CanBeFinal")
+ @CommandLine.Option(names = {"-f",
+ "--folds"}, description = "number of folds (default: ${DEFAULT-VALUE})")
+ int FOLDS = 10;
+
+ @CommandLine.Parameters(arity = "1..*", description = "input files")
+ private final ArrayList<String> inputFiles = new ArrayList<>();
+
+ public TotalNGram() {
+ }
+
+ @Override
+ public Integer call() throws Exception {
+ PrintStream output_stream = (output_file == null || output_file.equals("-") ? System.out : new PrintStream(output_file));
+
+ FoldedEntry.setFolds(FOLDS);
+ ConcurrentHashMap<String, FoldedEntry> map = new ConcurrentHashMap<>();
+ long totalFilesSizes = inputFiles.parallelStream().mapToLong(fname -> new File(fname).length()).sum();
+ etaPrinter = ETAPrinter.init("sum", totalFilesSizes, System.err, false);
+ BlockingQueue<Integer> queue = new LinkedBlockingQueue<>(inputFiles.size());
+ ExecutorService es = Executors.newCachedThreadPool();
+ int threads = Math.min(max_threads, inputFiles.size());
+ IntStream.range(0, threads).forEach(unused -> es.execute(new Worker(queue, inputFiles, FOLDS, map, etaPrinter)));
+ queue.addAll(IntStream.range(0, inputFiles.size()).boxed().collect(Collectors.toList()));
+ IntStream.range(0, threads).forEach(unused -> {
+ try {
+ queue.put(-1);
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ });
+ es.shutdown();
+ boolean finished = es.awaitTermination(120, TimeUnit.HOURS);
+ map.entrySet().stream()
+ .sorted(Collections.reverseOrder(new ValueThenKeyComparator<>()))
+ .forEach(entry -> output_stream.println(entry.getKey() + entry.getValue().toString()));
+ IntStream.rangeClosed(1, FOLDS)
+ .forEach(i -> output_stream.print("\t" + map.values()
+ .parallelStream().mapToLong(e -> e.count.get(i)).sum()));
+ output_stream.println("\t" + map.values().parallelStream().mapToLong(e -> e.count.get(0)).sum());
+ return null;
+ }
+
+ public static void main(String[] args) throws FileNotFoundException {
+ System.exit(new CommandLine(new TotalNGram()).execute(args));
+ }
+}
diff --git a/src/main/java/org/ids_mannheim/ValueThenKeyComparator.java b/src/main/java/org/ids_mannheim/ValueThenKeyComparator.java
new file mode 100644
index 0000000..1623462
--- /dev/null
+++ b/src/main/java/org/ids_mannheim/ValueThenKeyComparator.java
@@ -0,0 +1,19 @@
+package org.ids_mannheim;
+
+import java.util.Comparator;
+import java.util.Map;
+
+public class ValueThenKeyComparator<K extends Comparable<? super K>,
+ V extends Comparable<? super V>>
+ implements Comparator<Map.Entry<K, V>> {
+
+ public int compare(Map.Entry<K, V> a, Map.Entry<K, V> b) {
+ int cmp1 = a.getValue().compareTo(b.getValue());
+ if (cmp1 != 0) {
+ return cmp1;
+ } else {
+ return a.getKey().compareTo(b.getKey());
+ }
+ }
+
+}
\ No newline at end of file
diff --git a/src/main/java/org/ids_mannheim/Worker.java b/src/main/java/org/ids_mannheim/Worker.java
new file mode 100644
index 0000000..0619233
--- /dev/null
+++ b/src/main/java/org/ids_mannheim/Worker.java
@@ -0,0 +1,69 @@
+package org.ids_mannheim;
+
+import com.vdurmont.etaprinter.ETAPrinter;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class Worker implements Runnable {
+ private static final Pattern new_text_pattern = Pattern.compile("^#\\s+text_id\\s*=\\s*(.+)");
+ private final ArrayList<String> fnames;
+ private final BlockingQueue<Integer> queue;
+ private final ConcurrentHashMap<String, FoldedEntry> map;
+ private final int folds;
+ private final ETAPrinter etaPrinter;
+
+ public Worker(BlockingQueue<Integer> queue, ArrayList<String> fnames, int folds, ConcurrentHashMap<String, FoldedEntry> map, ETAPrinter etaPrinter) {
+ this.queue = queue;
+ this.fnames = fnames;
+ this.map = map;
+ this.folds = folds;
+ this.etaPrinter = etaPrinter;
+ }
+
+ @Override
+ public void run() {
+ try {
+ int index = queue.take();
+ while (index >= 0) {
+ String fname = fnames.get(index);
+ long file_size = new File(fname).length();
+ System.err.println(Thread.currentThread().getName() + " - processing: " + fname);
+ String[] cmd = {
+ "/bin/sh",
+ "-c",
+ "/usr/local/kl/bin/korapxml2conllu " + fname
+ };
+ Process p = Runtime.getRuntime().exec(cmd);
+ BufferedReader in = new BufferedReader(new InputStreamReader(p.getInputStream()));
+ String line;
+ int fold = -1;
+ while ((line = in.readLine()) != null) {
+ if (line.startsWith("#")) {
+ Matcher matcher = new_text_pattern.matcher(line);
+ if (matcher.find()) {
+ fold = Math.abs(matcher.group(1).hashCode()) % folds +1;
+ }
+ continue;
+ }
+ String[] strings = line.split("\\s+");
+ if (strings.length < 4) {
+ continue;
+ }
+ FoldedEntry.incr(map, strings[1], fold);
+ }
+ etaPrinter.update(file_size);
+ index = queue.take();
+ }
+ } catch (InterruptedException | IOException e) {
+ Thread.currentThread().interrupt();
+ }
+ }
+}