Switch progress bar from counting texts to measuring MB/s
The byte total can be determined more quickly (from file sizes alone) and gives a much better ETA prediction.
Change-Id: If32567e14faab9e44672b96bde449f20f7942594
diff --git a/src/main/java/de/ids_mannheim/korap/index/Indexer.java b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
index 44826a5..952b120 100644
--- a/src/main/java/de/ids_mannheim/korap/index/Indexer.java
+++ b/src/main/java/de/ids_mannheim/korap/index/Indexer.java
@@ -114,15 +114,20 @@
}
private void initProgress (long total) {
- if (total > 0) {
- this.progressEnabled = true;
- this.progressBar = new SimpleProgressBar(total);
- }
+ this.progressEnabled = true;
+ this.progressBar = new SimpleProgressBar(total);
+ this.progressBar.start();
}
- private void stepProgress () {
+ private void initProgressIndeterminate () {
+ this.progressEnabled = true;
+ this.progressBar = new SimpleProgressBar(0);
+ this.progressBar.start();
+ }
+
+ private void stepProgress (long bytes) {
if (this.progressEnabled && this.progressBar != null) {
- this.progressBar.step();
+ this.progressBar.addBytes(bytes);
}
}
@@ -178,7 +183,7 @@
// autocommit initiated by KrillIndex
this.commit();
}
- this.stepProgress();
+ this.stepProgress(new File(file).length());
}
catch (FileNotFoundException e) {
log.error("File " + file + " is not found!");
@@ -201,8 +206,10 @@
*/
private void parseTar (File tarFile) {
try {
- InputStream fileInputStream = new FileInputStream(tarFile);
-
+ CountingInputStream countingStream = new CountingInputStream(new FileInputStream(tarFile));
+ InputStream fileInputStream = countingStream;
+ long prevCompressedBytes = 0;
+
// Check if it's a gzipped tar file
if (tarFile.getName().toLowerCase().endsWith(".tar.gz") ||
tarFile.getName().toLowerCase().endsWith(".tgz")) {
@@ -264,7 +271,9 @@
if ((this.count % this.commitCount) == 0) {
this.commit();
}
- this.stepProgress();
+ long nowBytes = countingStream.getBytesRead();
+ this.stepProgress(nowBytes - prevCompressedBytes);
+ prevCompressedBytes = nowBytes;
}
}
else {
@@ -335,7 +344,8 @@
if ((this.count % this.commitCount) == 0) {
this.commit();
}
- this.stepProgress();
+ long compSize = entry.getCompressedSize();
+ this.stepProgress(compSize > 0 ? compSize : entry.getSize());
}
catch (IOException e) {
log.error("Error reading entry " + entryName + " from zip file " + zipFile.getName(), e);
@@ -480,12 +490,10 @@
indexer.index.setMaxStringLength(KrillProperties.maxTextSize);
}
- // Initialize progress if requested
- if (showProgress && inputPaths != null) {
- long total = countTargetFiles(inputPaths);
- if (total > 0) {
- indexer.initProgress(total);
- }
+ // Initialize progress bar; total is computed instantly from compressed file sizes
+ if (showProgress) {
+ long totalBytes = computeTotalBytes(inputPaths);
+ indexer.initProgress(totalBytes);
}
// Iterate over list of input paths (auto-detect directories vs zip/tar files)
@@ -607,32 +615,171 @@
return total;
}
- // Simple console progress bar with ETA
- private static class SimpleProgressBar {
- private final long total;
- private long current = 0;
+ /**
+ * Compute total compressed bytes for a set of input paths (instant: uses file sizes only).
+ * For directories, sums the sizes of the .json.gz files directly inside them (non-recursive).
+ * For any other regular file (e.g. zip, tar, tar.gz archives), uses the file size directly.
+ */
+ public static long computeTotalBytes (String[] inputPaths) {
+ if (inputPaths == null) return 0;
+ Pattern gzPattern = Pattern.compile(".*\\.json\\.gz$");
+ long total = 0L;
+ for (String arg : inputPaths) {
+ File f = new File(arg);
+ if (f.isDirectory()) {
+ String[] list = f.list();
+ if (list != null) {
+ for (String name : list) {
+ if (gzPattern.matcher(name).find())
+ total += new File(f, name).length();
+ }
+ }
+ }
+ else if (f.isFile()) {
+ total += f.length();
+ }
+ }
+ return total;
+ }
+
+ /**
+ * Format a duration in seconds into a human-readable string.
+ * Durations under 1 hour display as MM:SS,
+ * durations under 1 day display as HH:MM:SS,
+ * and longer durations display as Xd HH:MM:SS.
+ *
+ * @param seconds duration in seconds
+ * @return formatted duration string
+ */
+ public static String formatDuration (long seconds) {
+ long d = seconds / 86400;
+ long h = (seconds % 86400) / 3600;
+ long m = (seconds % 3600) / 60;
+ long s = seconds % 60;
+ if (d > 0)
+ return String.format(Locale.US, "%dd %02d:%02d:%02d", d, h, m, s);
+ if (h > 0)
+ return String.format(Locale.US, "%02d:%02d:%02d", h, m, s);
+ else
+ return String.format(Locale.US, "%02d:%02d", m, s);
+ }
+
+ // Minimal counting wrapper to track compressed bytes read from a stream
+ private static class CountingInputStream extends InputStream {
+ private final InputStream wrapped;
+ private volatile long bytesRead = 0;
+
+ CountingInputStream (InputStream wrapped) {
+ this.wrapped = wrapped;
+ }
+
+ @Override
+ public int read () throws IOException {
+ int b = wrapped.read();
+ if (b != -1) bytesRead++;
+ return b;
+ }
+
+ @Override
+ public int read (byte[] buf, int off, int len) throws IOException {
+ int n = wrapped.read(buf, off, len);
+ if (n > 0) bytesRead += n;
+ return n;
+ }
+
+ long getBytesRead () { return bytesRead; }
+ }
+
+ // Simple console progress bar with ETA; supports indeterminate mode until total is known
+ private static class SimpleProgressBar implements Runnable {
+ private volatile long total; // 0 means indeterminate
+ private volatile long current = 0;
+ private volatile boolean running = false;
+ private volatile boolean finished = false;
private final long startTimeMs;
private final int barWidth = 40;
+ private final Thread thread;
+ private int slidePos = 0;
+ private int slideDir = 1; // 1 right, -1 left
SimpleProgressBar (long total) {
this.total = total;
this.startTimeMs = System.currentTimeMillis();
- render();
+ this.thread = new Thread(this, "krill-progress-bar");
+ this.thread.setDaemon(true);
}
- void step () {
- current++;
- render();
+ void start () {
+ running = true;
+ thread.start();
+ }
+
+ void addBytes (long bytes) {
+ current += bytes;
+ }
+
+ void setTotal (long total) {
+ if (total < 0) total = 0;
+ this.total = total;
}
void finish () {
- current = Math.max(current, total);
+ finished = true;
+ running = false;
+ try {
+ thread.join(500);
+ }
+ catch (InterruptedException e) {
+ // ignore
+ }
+ // Final render as completed line if determinate
+ if (total > 0 && current < total) {
+ current = total;
+ }
render();
System.err.println();
}
+ @Override
+ public void run () {
+ // periodic render loop
+ while (running && !finished) {
+ render();
+ try {
+ Thread.sleep(100);
+ }
+ catch (InterruptedException e) {
+ // ignore
+ }
+ }
+ }
+
private void render () {
- double percent = total > 0 ? (double) current / (double) total : 0d;
+ if (total <= 0) {
+ // indeterminate: sliding bar
+ slidePos += slideDir;
+ if (slidePos >= barWidth - 5) {
+ slideDir = -1;
+ }
+ else if (slidePos <= 0) {
+ slideDir = 1;
+ }
+ StringBuilder bar = new StringBuilder(barWidth);
+ for (int i = 0; i < barWidth; i++) bar.append('-');
+ // draw a 5-char slider
+ for (int i = slidePos; i < Math.min(slidePos + 5, barWidth); i++) {
+ bar.setCharAt(i, '=');
+ }
+ long now = System.currentTimeMillis();
+ double elapsedSec = (now - startTimeMs) / 1000.0;
+ double rateMBs = elapsedSec > 0 ? current / 1_000_000.0 / elapsedSec : 0.0;
+ String rateStr = rateMBs > 0 ? String.format(Locale.US, "%.2f MB/s", rateMBs) : "NA";
+ String line = String.format(Locale.US, "\r[%s] %.1f MB processed | %s | ETA calculating...", bar, current / 1_000_000.0, rateStr);
+ System.err.print(line);
+ return;
+ }
+
+ double percent = (double) current / (double) Math.max(total, 1);
int filled = (int) Math.round(percent * barWidth);
StringBuilder bar = new StringBuilder(barWidth);
for (int i = 0; i < barWidth; i++) {
@@ -641,29 +788,17 @@
long now = System.currentTimeMillis();
double elapsedSec = (now - startTimeMs) / 1000.0;
- double rate = elapsedSec > 0 ? current / elapsedSec : 0.0; // docs/sec
- long etaSec = (rate > 0 && total > current) ? (long) Math.ceil((total - current) / rate) : 0;
+ double rateBytesPerSec = elapsedSec > 0 ? current / elapsedSec : 0.0;
+ long etaSec = (rateBytesPerSec > 0 && total > current) ? (long) Math.ceil((total - current) / rateBytesPerSec) : 0;
- String etaStr = formatDuration(etaSec);
+ String etaStr = (rateBytesPerSec > 0) ? Indexer.formatDuration(etaSec) : "NA";
String pctStr = String.format(Locale.US, "%5.1f%%", percent * 100.0);
- String rateStr = String.format(Locale.US, "%.1f/s", rate);
+ String rateStr = String.format(Locale.US, "%.2f MB/s", rateBytesPerSec / 1_000_000.0);
+ double processedMB = current / 1_000_000.0;
+ double totalMB = total / 1_000_000.0;
- String line = String.format(Locale.US, "\r[%s] %s %d/%d | %s | ETA %s", bar, pctStr, current, total, rateStr, etaStr);
+ String line = String.format(Locale.US, "\r[%s] %s %.1f/%.1f MB | %s | ETA %s", bar, pctStr, processedMB, totalMB, rateStr, etaStr);
System.err.print(line);
}
-
- private static String formatDuration (long seconds) {
- long h = seconds / 3600;
- long m = (seconds % 3600) / 60;
- long s = seconds % 60;
- if (h > 99) {
- // cap to avoid silly widths
- return String.format(Locale.US, ">99h");
- }
- if (h > 0)
- return String.format(Locale.US, "%02d:%02d:%02d", h, m, s);
- else
- return String.format(Locale.US, "%02d:%02d", m, s);
- }
}
}
diff --git a/src/test/java/de/ids_mannheim/korap/TestIndexer.java b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
index 246668b..f0c8ef5 100644
--- a/src/test/java/de/ids_mannheim/korap/TestIndexer.java
+++ b/src/test/java/de/ids_mannheim/korap/TestIndexer.java
@@ -259,14 +259,30 @@
}
String progressOutput = errStream.toString();
- // Expect progress bar renders with bracketed bar, percentage, count, and ETA
+ // Expect progress bar renders with bracketed bar, percentage, MB throughput, and ETA
assertTrue(progressOutput.contains("[==="));
assertTrue(progressOutput.contains("100.0%"));
- assertTrue(progressOutput.contains("1/1"));
+ assertTrue(progressOutput.contains("MB"));
assertTrue(progressOutput.contains("ETA"));
}
@Test
+ public void testFormatDuration () {
+ // seconds only
+ assertEquals("00:45", Indexer.formatDuration(45));
+ // minutes and seconds
+ assertEquals("05:30", Indexer.formatDuration(330));
+ // hours
+ assertEquals("02:30:00", Indexer.formatDuration(9000));
+ // exactly 24h → 1 day
+ assertEquals("1d 00:00:00", Indexer.formatDuration(86400));
+ // more than 24h
+ assertEquals("2d 03:45:12", Indexer.formatDuration(2 * 86400 + 3 * 3600 + 45 * 60 + 12));
+ // large multi-day (previously capped at >99h)
+ assertEquals("10d 05:00:00", Indexer.formatDuration(10 * 86400 + 5 * 3600));
+ }
+
+ @Test
public void testCountTargetFiles () throws Exception {
long nullCount = Indexer.countTargetFiles(null);
assertEquals(0L, nullCount);