totalngrams: log warnings and retry on errors
diff --git a/src/main/java/org/ids_mannheim/TotalNGram.java b/src/main/java/org/ids_mannheim/TotalNGram.java
index 752307c..2d19a1f 100644
--- a/src/main/java/org/ids_mannheim/TotalNGram.java
+++ b/src/main/java/org/ids_mannheim/TotalNGram.java
@@ -7,22 +7,36 @@
import java.nio.file.AccessDeniedException;
import java.nio.file.FileAlreadyExistsException;
import java.nio.file.Files;
-import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
+import java.util.Locale;
import java.util.concurrent.*;
+import java.util.logging.FileHandler;
+import java.util.logging.Logger;
+import java.util.logging.SimpleFormatter;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
+
@CommandLine.Command(mixinStandardHelpOptions = true,
name = "totalngram", description = "add ngram counts from KorAP-XML or CoNLL-U files")
public class TotalNGram implements Callable<Integer> {
private static final int MAX_THREADS = Runtime.getRuntime().availableProcessors() * 2 / 3;
- private Progressbar etaPrinter;
+ static private final Logger logger;
+ static { // Point java.util.logging at the bundled logging.properties before this class's Logger is created.
+ java.net.URL config = TotalNGram.class.getClassLoader().getResource("logging.properties");
+ if (config != null) { // getResource returns null when the resource is missing; then keep the JDK logging defaults
+ System.setProperty("java.util.logging.config.file", config.getFile());
+ }
+ logger = Logger.getLogger(TotalNGram.class.getSimpleName());
+ }
+
+ @CommandLine.Parameters(arity = "1..*", description = "input files")
+ private final ArrayList<String> inputFiles = new ArrayList<>();
@CommandLine.Option(names = {"-L",
- "--log-file"}, defaultValue = "sum.log", description = "log file name (default: ${DEFAULT-VALUE})")
+ "--log-file"}, defaultValue = "totalngram.log", description = "log file name (default: ${DEFAULT-VALUE})")
String logFileName;
@SuppressWarnings("CanBeFinal")
@@ -53,15 +67,27 @@
@CommandLine.Option(names = {"-f",
"--folds"}, description = "number of folds (default: ${DEFAULT-VALUE})")
int FOLDS = 10;
-
- @CommandLine.Parameters(arity = "1..*", description = "input files")
- private final ArrayList<String> inputFiles = new ArrayList<>();
+ private Progressbar etaPrinter;
public TotalNGram() {
}
+ public static void main(String[] args) { // command-line entry point
+ System.exit(new CommandLine(new TotalNGram()).execute(args)); // process exit status is whatever execute(args) returns
+ }
+
@Override
public Integer call() throws Exception {
+ try {
+ // Also write log records to the file given by -L/--log-file, formatted by SimpleFormatter.
+ FileHandler fileHandler = new FileHandler(logFileName);
+ fileHandler.setFormatter(new SimpleFormatter());
+ java.util.Locale.setDefault(Locale.Category.FORMAT, Locale.ROOT); // presumably for locale-independent timestamps - confirm
+ logger.addHandler(fileHandler);
+ } catch (Exception e) {
+ logger.warning("Cannot log to " + logFileName + ": " + e); // keep running, but do not fail silently
+ }
+
PrintStream output_stream;
if ((output_fillename == null) || output_fillename.equals("-")) {
output_stream = System.out;
@@ -92,7 +118,7 @@
max_threads = workerNodePool.size;
}
int threads = Math.min(max_threads, inputFiles.size());
- IntStream.range(0, threads).forEach(unused -> es.execute(new Worker(queue, inputFiles, ngram_size, FOLDS, map, workerNodePool, etaPrinter)));
+ IntStream.range(0, threads).forEach(unused -> es.execute(new Worker(queue, inputFiles, ngram_size, FOLDS, map, workerNodePool, etaPrinter, logger)));
queue.addAll(IntStream.range(0, inputFiles.size()).boxed().collect(Collectors.toList()));
IntStream.range(0, threads).forEach(unused -> {
try {
@@ -115,8 +141,4 @@
output_stream.println("\t" + map.values().parallelStream().mapToLong(e -> e.count.get(0)).sum());
return null;
}
-
- public static void main(String[] args) {
- System.exit(new CommandLine(new TotalNGram()).execute(args));
- }
}
diff --git a/src/main/java/org/ids_mannheim/Worker.java b/src/main/java/org/ids_mannheim/Worker.java
index a82b0c7..0a62f68 100644
--- a/src/main/java/org/ids_mannheim/Worker.java
+++ b/src/main/java/org/ids_mannheim/Worker.java
@@ -7,23 +7,28 @@
import java.util.ArrayList;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.ThreadLocalRandom;
+import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import static java.lang.Thread.sleep;
+
public class Worker implements Runnable {
private static final Pattern new_text_pattern = Pattern.compile("^#\\s+text_id\\s*=\\s*(.+)");
+ private static final int MAX_RETRIES = 10;
private final ArrayList<String> fnames;
private final BlockingQueue<Integer> queue;
private final ConcurrentHashMap<String, FoldedEntry> map;
private final int folds;
private final Progressbar etaPrinter;
private final int ngram_size;
- private WorkerNodePool pool;
+ private final Logger logger;
+ private final WorkerNodePool pool;
+
public Worker(BlockingQueue<Integer> queue, ArrayList<String> fnames, int ngram_size, int folds,
ConcurrentHashMap<String, FoldedEntry> map,
WorkerNodePool pool,
- Progressbar etaPrinter) {
+ Progressbar etaPrinter, Logger logger) {
this.queue = queue;
this.fnames = fnames;
this.map = map;
@@ -31,17 +36,20 @@
this.folds = folds;
this.pool = pool;
this.etaPrinter = etaPrinter;
+ this.logger = logger;
}
@Override
public void run() {
try {
int index = queue.take();
+ int retries = MAX_RETRIES;
while (index >= 0) {
String fname = fnames.get(index);
- long file_size = new File(fname).length();
+ File current_file = new File(fname);
+ long file_size = current_file.length();
int poolIndex = index % pool.size;
- System.err.println(String.format("%4d/%4d %-10s %-10s", index, fnames.size(), pool.getHost(poolIndex), fname));
+ logger.info(String.format("%5d/%5d %-10s %-10s", index, fnames.size(), pool.getHost(poolIndex), current_file.getName()));
String[] cmd = {
"/bin/sh",
"-c",
@@ -52,26 +60,40 @@
String line;
int fold = -1;
SlidingWindowQueue slidingWindowQueue = null;
+ int texts=0;
while ((line = in.readLine()) != null) {
if (line.startsWith("#")) {
Matcher matcher = new_text_pattern.matcher(line);
if (matcher.find()) {
- fold = Math.abs(matcher.group(1).hashCode()) % folds +1;
+ fold = Math.abs(matcher.group(1).hashCode() % folds) + 1; // abs after %: Math.abs(Integer.MIN_VALUE) stays negative
}
int finalFold = fold;
- slidingWindowQueue = new SlidingWindowQueue(ngram_size, s -> {
- FoldedEntry.incr(map, s, finalFold);
- });
+ slidingWindowQueue = new SlidingWindowQueue(ngram_size, s -> FoldedEntry.incr(map, s, finalFold));
+ texts++;
continue;
}
String[] strings = line.split("\\s+");
if (strings.length < 4) {
continue;
}
+ //noinspection ConstantConditions
slidingWindowQueue.add(strings[1]);
}
- etaPrinter.update(file_size);
- index = queue.take();
+ if (texts > 0) {
+ logger.info(pool.getHost(poolIndex)+" finished " + fname + " with "+texts+ " texts");
+ etaPrinter.update(file_size);
+ retries = MAX_RETRIES;
+ index = queue.take();
+ } else if (--retries > 0) {
+ // No '#' line was read from the command output: run the same index again after a pause.
+ logger.warning("Retrying " + fname);
+ sleep(1000);
+ } else {
+ logger.severe ("Giving up " + fname);
+ retries = MAX_RETRIES; // the next file gets its full retry budget again
+ index = queue.take();
+ sleep(1000);
+ }
}
} catch (InterruptedException | IOException e) {
Thread.currentThread().interrupt();
diff --git a/src/main/resources/logging.properties b/src/main/resources/logging.properties
new file mode 100644
index 0000000..c793dcc
--- /dev/null
+++ b/src/main/resources/logging.properties
@@ -0,0 +1,9 @@
+handlers= java.util.logging.FileHandler
+.level= INFO
+java.util.logging.FileHandler.level = INFO
+java.util.logging.FileHandler.pattern = totalngrams.log
+java.util.logging.FileHandler.limit = 50000
+java.util.logging.FileHandler.count = 1
+java.util.logging.FileHandler.formatter = java.util.logging.SimpleFormatter
+# Compact alternative (property values are taken literally, so no surrounding quotes): java.util.logging.SimpleFormatter.format=%4$s: %5$s [%1$tc]%n
+java.util.logging.SimpleFormatter.format=%1$tb %1$td, %1$tY %1$tH:%1$tM:%1$tS %1$Tp %2$s%n%4$s: %5$s%n