Add ETA log to corpusStats
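
With multiple virtual corpora, corpusStats() now logs per-item timing, marks
near-instant responses as likely cached, and estimates the remaining time from
the median of recent non-cached request durations.

Illustrative call and progress output (timings and the cache hit below are a
sketch derived from the new format strings, not captured log output):

    kco <- KorAPConnection(verbose = TRUE)
    corpusStats(kco, vc = c("pubDate in 2020", "pubDate in 2021",
                            "pubDate in 2022"), as.df = TRUE)
    # Processed vc 1/3: "pubDate in 2020" in  1.2s
    # Processed vc 2/3: "pubDate in 2021" in  0.0s [cached], ETA: 00:01 (<completion time>)
    # Processed vc 3/3: "pubDate in 2022" in  1.1s, ETA: N/A (<completion time>)
    # Completed processing 3 virtual corpora in 00:02 (avg:  0.8s/item, 1 cached, 2 non-cached)
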
Change-Id: I1f2b511c489e1ab6770e742baf9f76e5300a29f7
diff --git a/R/KorAPCorpusStats.R b/R/KorAPCorpusStats.R
index 8639a53..75bd6eb 100644
--- a/R/KorAPCorpusStats.R
+++ b/R/KorAPCorpusStats.R
@@ -12,14 +12,14 @@
#' @slot sentences number of sentences
#' @slot paragraphs number of paragraphs
#' @slot webUIRequestUrl link to the web user interface with the current vc definition
-setClass("KorAPCorpusStats", slots=c(vc="character", documents="numeric", tokens="numeric", sentences="numeric", paragraphs="numeric", webUIRequestUrl="character" ))
+setClass("KorAPCorpusStats", slots = c(vc = "character", documents = "numeric", tokens = "numeric", sentences = "numeric", paragraphs = "numeric", webUIRequestUrl = "character"))
-log_info <- function(v, ...) {
+log_info <- function(v, ...) {
green <- "\033[32m"
reset <- "\033[0m"
cat(ifelse(v, paste0(green, ..., reset), ""))
}
-setGeneric("corpusStats", function(kco, ...) standardGeneric("corpusStats") )
+setGeneric("corpusStats", function(kco, ...) standardGeneric("corpusStats"))
#' Fetch information about a (virtual) corpus
#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`)
@@ -30,7 +30,6 @@
#'
#' @importFrom urltools url_encode
#' @examples
-#'
#' \dontrun{
#'
#' kco <- KorAPConnection()
@@ -38,38 +37,157 @@
#' }
#'
#' @aliases corpusStats
+# Helper function to format duration with leading zeros
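+# e.g. format_duration(75) returns "01:15", format_duration(3700) returns "01:01:40"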
+format_duration <- function(seconds) {
+ if (is.na(seconds) || !is.finite(seconds) || seconds <= 0) {
+ return("N/A")
+ }
+
+  # Round first, then split, so the seconds part can never be displayed as 60
+  total_secs <- round(seconds)
+  hours <- total_secs %/% 3600
+  minutes <- (total_secs %% 3600) %/% 60
+  secs <- total_secs %% 60
+
+ if (hours > 0) {
+ return(sprintf("%02d:%02d:%02d", hours, minutes, secs))
+ } else {
+ return(sprintf("%02d:%02d", minutes, secs))
+ }
+}
+
#' @export
-setMethod("corpusStats", "KorAPConnection", function(kco,
- vc = "",
- verbose = kco@verbose,
- as.df = FALSE) {
- if (length(vc) > 1)
- do.call(rbind, Map(function(cq)
- corpusStats(kco, cq, verbose, as.df = TRUE), vc))
- else {
+setMethod("corpusStats", "KorAPConnection", function(kco,
+ vc = "",
+ verbose = kco@verbose,
+ as.df = FALSE) {
+ if (length(vc) > 1) {
+ # ETA calculation for multiple virtual corpora
+ total_items <- length(vc)
+ start_time <- Sys.time()
+ results <- list()
+ individual_times <- numeric(total_items)
+
+ for (i in seq_along(vc)) {
+ current_vc <- vc[i]
+ item_start_time <- Sys.time()
+
+ # Truncate long vc strings for display
+ vc_display <- if (nchar(current_vc) > 50) {
+ paste0(substr(current_vc, 1, 47), "...")
+ } else {
+ current_vc
+ }
+
+ # Process current virtual corpus
+ result <- corpusStats(kco, current_vc, verbose = FALSE, as.df = TRUE)
+ results[[i]] <- result
+
+ # Record individual processing time
+ item_end_time <- Sys.time()
+ individual_times[i] <- as.numeric(difftime(item_end_time, item_start_time, units = "secs"))
+
+ # Format item number with proper alignment
+ current_item_formatted <- sprintf(paste0("%", nchar(total_items), "d"), i)
+
+ # Calculate timing and ETA after first few items, using cache-aware approach
+ if (i >= 2) {
+ # Use recent non-cached times for better ETA estimates
+ # Exclude very fast responses (< 0.1s) as likely cached
+ non_cached_times <- individual_times[1:i][individual_times[1:i] >= 0.1]
+
+ if (length(non_cached_times) >= 1) {
+ # Use median of recent non-cached times for more stable estimates
+ recent_window <- min(5, length(non_cached_times))
+ recent_times <- tail(non_cached_times, recent_window)
+ time_per_item <- median(recent_times)
+
+ remaining_items <- total_items - i
+ eta_seconds <- time_per_item * remaining_items
+ estimated_completion_time <- Sys.time() + eta_seconds
+
+ # Show current item time and cache status
+ cache_indicator <- if (individual_times[i] < 0.1) " [cached]" else ""
+
+ log_info(verbose, sprintf(
+ "Processed vc %s/%d: \"%s\" in %4.1fs%s, ETA: %s (%s\n",
+ current_item_formatted,
+ total_items,
+ vc_display,
+ individual_times[i],
+ cache_indicator,
+ format_duration(eta_seconds),
+ format(estimated_completion_time, "%Y-%m-%d %H:%M:%S)")
+ ))
+ } else {
+ # All responses so far appear cached, show without ETA
+ cache_indicator <- if (individual_times[i] < 0.1) " [cached]" else ""
+ log_info(verbose, sprintf(
+ "Processed vc %s/%d: \"%s\" in %4.1fs%s\n",
+ current_item_formatted,
+ total_items,
+ vc_display,
+ individual_times[i],
+ cache_indicator
+ ))
+ }
+ } else {
+ # Log first item without ETA
+ cache_indicator <- if (individual_times[i] < 0.1) " [cached]" else ""
+ log_info(verbose, sprintf(
+ "Processed VC %s/%d: \"%s\" (%4.1fs%s)\n",
+ current_item_formatted,
+ total_items,
+ vc_display,
+ individual_times[i],
+ cache_indicator
+ ))
+ }
+ }
+
+ # Final timing summary with cache analysis
+ if (verbose && total_items > 1) {
+ total_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
+ avg_time_per_item <- total_time / total_items
+ cached_count <- sum(individual_times < 0.1)
+ non_cached_count <- total_items - cached_count
+
+ log_info(verbose, sprintf(
+ "Completed processing %d virtual corpora in %s (avg: %4.1fs/item, %d cached, %d non-cached)\n",
+ total_items,
+ format_duration(total_time),
+ avg_time_per_item,
+ cached_count,
+ non_cached_count
+ ))
+ }
+
+ do.call(rbind, results)
+ } else {
url <-
- paste0(kco@apiUrl,
- 'statistics?cq=',
- URLencode(enc2utf8(vc), reserved = TRUE))
+ paste0(
+ kco@apiUrl,
+ "statistics?cq=",
+ URLencode(enc2utf8(vc), reserved = TRUE)
+ )
log_info(verbose, "Getting size of virtual corpus \"", vc, "\"", sep = "")
res <- apiCall(kco, url)
webUIRequestUrl <- paste0(kco@KorAPUrl, sprintf("?q=<base/s=t>&cq=%s", url_encode(enc2utf8(vc))))
- if(is.null(res)) {
- res <- data.frame(documents=NA, tokens=NA, sentences=NA, paragraphs=NA)
+ if (is.null(res)) {
+ res <- data.frame(documents = NA, tokens = NA, sentences = NA, paragraphs = NA)
}
log_info(verbose, ": ", res$tokens, " tokens\n")
- if (as.df)
+ if (as.df) {
data.frame(vc = vc, webUIRequestUrl = webUIRequestUrl, res, stringsAsFactors = FALSE)
- else
+ } else {
new(
"KorAPCorpusStats",
vc = vc,
documents = ifelse(is.logical(res$documents), 0, res$documents),
tokens = ifelse(is.logical(res$tokens), 0, res$tokens),
- sentences = ifelse(is.logical(res$documents), 0,res$sentences),
+ sentences = ifelse(is.logical(res$documents), 0, res$sentences),
paragraphs = ifelse(is.logical(res$paragraphs), 0, res$paragraphs),
webUIRequestUrl = webUIRequestUrl
)
+ }
}
})
@@ -81,9 +199,11 @@
if (object@vc == "") {
cat("The whole corpus")
} else {
- cat("The virtual corpus described by \"", object@vc, "\"", sep="")
+ cat("The virtual corpus described by \"", object@vc, "\"", sep = "")
}
- cat(" contains", formatC(object@tokens, format="f", digits=0, big.mark=","), "tokens in",
- formatC(object@sentences, format="d", big.mark=","), "sentences in",
- formatC(object@documents, format="d", big.mark=","), "documents.\n")
+ cat(
+ " contains", formatC(object@tokens, format = "f", digits = 0, big.mark = ","), "tokens in",
+ formatC(object@sentences, format = "d", big.mark = ","), "sentences in",
+ formatC(object@documents, format = "d", big.mark = ","), "documents.\n"
+ )
})
diff --git a/tests/testthat/test-corpusStats-eta.R b/tests/testthat/test-corpusStats-eta.R
new file mode 100644
index 0000000..5a57079
--- /dev/null
+++ b/tests/testthat/test-corpusStats-eta.R
@@ -0,0 +1,191 @@
+test_that("corpusStats displays ETA with multiple virtual corpora", {
+ skip_if_offline()
+ kco <- KorAPConnection(verbose = TRUE, cache = FALSE)
+
+ # Use different virtual corpora to ensure varied processing times
+ vc_list <- c(
+ "pubDate in 2020",
+ "pubDate in 2021",
+ "textType = /.*zeitung.*/i"
+ )
+
+ # Capture output from corpusStats with multiple VCs
+ temp_file <- tempfile()
+ sink(temp_file)
+ result <- corpusStats(kco, vc = vc_list, as.df = TRUE)
+ cat("\n")
+ sink()
+
+ # Read the captured output
+ output <- readLines(temp_file)
+ unlink(temp_file)
+
+ # Echo the output to console for debugging
+ cat("\nCaptured output from corpusStats with multiple VCs:\n")
+ cat(paste(output, collapse = "\n"))
+
+ # Combined output string for all tests - strip ANSI color codes
+ output_str <- paste(output, collapse = "\n")
+ # Remove ANSI escape sequences - improved regex
+ output_str <- gsub("\\033\\[[0-9;]*m", "", output_str)
+
+ # Test 1: Check that VC progress is shown (format: "Processed vc X/Y" or "Processed VC X/Y")
+ expect_match(
+ output_str,
+ "Processed [vV][cC] \\d+/\\d+:",
+ info = "VC progress counter not found in output"
+ )
+
+ # Test 2: Check that individual timing is displayed (either "( X.Xs)" or "in X.Xs")
+ expect_true(
+ grepl("\\(\\s*\\d+\\.\\d+s\\)", output_str) || grepl("in\\s+\\d+\\.\\d+s", output_str),
+ info = "Individual timing format not found in output"
+ )
+
+ # Test 3: Check that ETA is displayed (format like "ETA: MM:SS" or "ETA: HH:MM:SS")
+ expect_match(
+ output_str,
+ "ETA: \\d{2}:\\d{2}",
+ info = "ETA format not found in output"
+ )
+
+ # Test 4: Check that completion time is shown (format: YYYY-MM-DD HH:MM:SS)
+ expect_match(
+ output_str,
+ "\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}",
+ info = "Completion time format not found in output"
+ )
+
+ # Test 5: Check that final summary is displayed
+ expect_match(
+ output_str,
+ "Completed processing \\d+ virtual corpora",
+ info = "Final processing summary not found in output"
+ )
+
+ # Test 6: Check that cache analysis is included in final summary
+ expect_match(
+ output_str,
+ "\\d+ cached, \\d+ non-cached",
+ info = "Cache analysis not found in final summary"
+ )
+
+ # Test 7: Verify we get results for all VCs
+ expect_equal(nrow(result), length(vc_list),
+ info = "Result should contain one row per virtual corpus"
+ )
+
+ # Test 8: Check that VC definitions are properly displayed (should show actual VC values)
+ expect_match(
+ output_str,
+ "pubDate in 2020",
+ info = "First VC definition should be visible in output"
+ )
+})
+
+test_that("corpusStats handles cache detection correctly", {
+  skip_if_offline()
+ kco <- KorAPConnection(verbose = TRUE, cache = TRUE) # Enable caching
+
+ # Use the same VC twice to test cache detection
+ vc_list <- c(
+ "pubDate in 2020",
+ "pubDate in 2020" # This should be cached on second call
+ )
+
+ # Capture output from corpusStats with repeated VCs
+ temp_file <- tempfile()
+ sink(temp_file)
+ result <- corpusStats(kco, vc = vc_list, as.df = TRUE)
+ cat("\n")
+ sink()
+
+ # Read the captured output
+ output <- readLines(temp_file)
+ unlink(temp_file)
+
+ # Echo the output to console for debugging
+ cat("\nCaptured output from corpusStats with cache test:\n")
+ cat(paste(output, collapse = "\n"))
+
+ # Combined output string for all tests - strip ANSI color codes
+ output_str <- paste(output, collapse = "\n")
+ # Remove ANSI escape sequences - improved regex
+ output_str <- gsub("\\033\\[[0-9;]*m", "", output_str)
+
+  # Test 1: If a cache indicator is shown, it should directly follow the timing
+  # (whether the repeated call is actually served from the cache depends on the
+  # connection's cache state, so the indicator itself is optional)
+  if (grepl("\\[cached\\]", output_str)) {
+    expect_match(
+      output_str,
+      "\\d+\\.\\d+s \\[cached\\]",
+      info = "Cache indicator should directly follow the timing information"
+    )
+  }
+
+ # Test 2: Check that timing is still displayed for all items (either "( X.Xs)" or "in X.Xs")
+ expect_true(
+ grepl("\\(\\s*\\d+\\.\\d+s", output_str) || grepl("in\\s+\\d+\\.\\d+s", output_str),
+ info = "Individual timing should still be displayed with caching"
+ )
+
+ # Test 3: Verify we still get correct results
+ expect_equal(nrow(result), length(vc_list),
+ info = "Result should contain one row per virtual corpus even with caching"
+ )
+})
+
+test_that("corpusStats handles long VC definitions with truncation", {
+ # skip_if_offline()
+ kco <- KorAPConnection(verbose = TRUE, cache = FALSE)
+
+ # Create a very long VC definition to test truncation
+ long_vc <- paste0(
+ "pubDate in 2020 & textType = /.*zeitung.*/ & ",
+ "textDomain = /Politik.*/ & foundries = mate/morpho & ",
+ "foundries = opennlp/sentences & textClass = /.*nachrichten.*/"
+ )
+
+ vc_list <- c("pubDate in 2020", long_vc)
+
+ # Capture output from corpusStats with long VC
+ temp_file <- tempfile()
+ sink(temp_file)
+ result <- corpusStats(kco, vc = vc_list, as.df = TRUE)
+ cat("\n")
+ sink()
+
+ # Read the captured output
+ output <- readLines(temp_file)
+ unlink(temp_file)
+
+ # Echo the output to console for debugging
+ cat("\nCaptured output from corpusStats with long VC:\n")
+ cat(paste(output, collapse = "\n"))
+
+ # Combined output string for all tests - strip ANSI color codes
+ output_str <- paste(output, collapse = "\n")
+ # Remove ANSI escape sequences - improved regex
+ output_str <- gsub("\\033\\[[0-9;]*m", "", output_str)
+
+ # Test 1: Check that long VC is truncated (should end with "...")
+ expect_match(
+ output_str,
+ "\\.\\.\\.",
+ info = "Long VC definition should be truncated with ellipsis"
+ )
+
+ # Test 2: Check that short VC is not truncated
+ expect_match(
+ output_str,
+ "\"pubDate in 2020\"",
+ info = "Short VC definition should be displayed in full"
+ )
+
+ # Test 3: Verify we still get correct results despite truncation in display
+ expect_equal(nrow(result), length(vc_list),
+ info = "Result should contain one row per virtual corpus"
+ )
+
+ # Test 4: Check that the actual VC values in results are not truncated
+ expect_true(any(nchar(result$vc) > 50),
+ info = "Actual VC values in results should not be truncated"
+ )
+})