Centralize ETA logging
Change-Id: If3661c8a9be059325d9cb86ddf4421e05ce72931
diff --git a/NAMESPACE b/NAMESPACE
index 6b69942..5e5e2be 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -111,6 +111,7 @@
importFrom(magrittr,debug_pipe)
importFrom(purrr,map)
importFrom(purrr,pmap)
+importFrom(stats,median)
importFrom(stats,prop.test)
importFrom(stringr,str_detect)
importFrom(stringr,str_match)
diff --git a/R/KorAPCorpusStats.R b/R/KorAPCorpusStats.R
index 8097637..7789945 100644
--- a/R/KorAPCorpusStats.R
+++ b/R/KorAPCorpusStats.R
@@ -70,50 +70,24 @@
# Calculate timing and ETA after first few items, using cache-aware approach
if (i >= 2) {
- # Use recent non-cached times for better ETA estimates
- # Exclude very fast responses (< 0.1s) as likely cached
- non_cached_times <- individual_times[1:i][individual_times[1:i] >= 0.1]
+ eta_info <- calculate_sophisticated_eta(individual_times, i, total_items)
+ cache_indicator <- get_cache_indicator(eta_info$is_cached)
+ eta_display <- format_eta_display(eta_info$eta_seconds, eta_info$estimated_completion_time)
- if (length(non_cached_times) >= 1) {
- # Use median of recent non-cached times for more stable estimates
- recent_window <- min(5, length(non_cached_times))
- recent_times <- tail(non_cached_times, recent_window)
- time_per_item <- median(recent_times)
-
- remaining_items <- total_items - i
- eta_seconds <- time_per_item * remaining_items
- estimated_completion_time <- Sys.time() + eta_seconds
-
- # Show current item time and cache status
- cache_indicator <- if (individual_times[i] < 0.1) " [cached]" else ""
-
- log_info(verbose, sprintf(
- "Processed vc %s/%d: \"%s\" in %4.1fs%s, ETA: %s (%s\n",
- current_item_formatted,
- total_items,
- vc_display,
- individual_times[i],
- cache_indicator,
- format_duration(eta_seconds),
- format(estimated_completion_time, "%Y-%m-%d %H:%M:%S)")
- ))
- } else {
- # All responses so far appear cached, show without ETA
- cache_indicator <- if (individual_times[i] < 0.1) " [cached]" else ""
- log_info(verbose, sprintf(
- "Processed vc %s/%d: \"%s\" in %4.1fs%s\n",
- current_item_formatted,
- total_items,
- vc_display,
- individual_times[i],
- cache_indicator
- ))
- }
- } else {
- # Log first item without ETA
- cache_indicator <- if (individual_times[i] < 0.1) " [cached]" else ""
log_info(verbose, sprintf(
- "Processed VC %s/%d: \"%s\" (%4.1fs%s)\n",
+ "Processed vc %s/%d: \"%s\" in %4.1fs%s%s\n",
+ current_item_formatted,
+ total_items,
+ vc_display,
+ individual_times[i],
+ cache_indicator,
+ eta_display
+ ))
+ } else {
+ # First item, show without ETA
+ cache_indicator <- get_cache_indicator(individual_times[i] < 0.1)
+ log_info(verbose, sprintf(
+ "Processed vc %s/%d: \"%s\" in %4.1fs%s\n",
current_item_formatted,
total_items,
vc_display,
diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index 41d8a2b..45425b6 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R
@@ -578,25 +578,7 @@
estimated_remaining_seconds <- remaining_pages * time_per_page
estimated_completion_time <- Sys.time() + estimated_remaining_seconds
- # Format time nicely
- format_duration <- function(seconds) {
- if (is.na(seconds) || seconds < 0) {
- # Instead of "N/A", return "00s" as a fallback
- return("00s")
- }
- days <- floor(seconds / (24 * 3600))
- seconds <- seconds %% (24 * 3600)
- hours <- floor(seconds / 3600)
- seconds <- seconds %% 3600
- minutes <- floor(seconds / 60)
- seconds <- floor(seconds %% 60)
- paste0(
- if (days > 0) paste0(days, "d ") else "",
- if (hours > 0 || days > 0) paste0(sprintf("%02d", hours), "h ") else "",
- if (minutes > 0 || hours > 0 || days > 0) paste0(sprintf("%02d", minutes), "m ") else "",
- paste0(sprintf("%02d", seconds), "s")
- )
- }
+ # Format time nicely using centralized function from logging.R
eta_str <- format_duration(estimated_remaining_seconds)
completion_time_str <- format(estimated_completion_time, "%Y-%m-%d %H:%M:%S")
diff --git a/R/logging.R b/R/logging.R
index 55be5cf..47c7ed5 100644
--- a/R/logging.R
+++ b/R/logging.R
@@ -4,6 +4,7 @@
#' for progress reporting and ETA calculations.
#' Log informational messages with optional coloring
+#' @importFrom stats median
#'
#' @param v logical flag indicating whether to output the message
#' @param ... message components to concatenate and display
@@ -75,3 +76,81 @@
paste0(". ETA: ", format_duration(eta_seconds), " (", completion_time_str, ")")
}
+
+#' Calculate sophisticated ETA using median of recent non-cached times
+#'
+#' Advanced ETA calculation that excludes cached responses and uses median
+#' of recent timing data for more stable estimates. This is particularly
+#' useful for operations where some responses may be cached and much faster.
+#'
+#' @param individual_times numeric vector of individual item processing times
+#' @param current_item current item number (1-based)
+#' @param total_items total number of items to process
+#' @param cache_threshold minimum time in seconds to consider as non-cached (default: 0.1)
+#' @param window_size number of recent non-cached times to use for median calculation (default: 5)
+#' @return list with eta_seconds, estimated_completion_time, and is_cached flag
+#' @keywords internal
+calculate_sophisticated_eta <- function(individual_times, current_item, total_items,
+ cache_threshold = 0.1, window_size = 5) {
+ if (current_item < 2) {
+ return(list(eta_seconds = NA, estimated_completion_time = NA, is_cached = FALSE))
+ }
+
+ # Get times up to current item
+ current_times <- individual_times[1:current_item]
+ current_time <- individual_times[current_item]
+ is_cached <- current_time < cache_threshold
+
+ # Use recent non-cached times for better ETA estimates
+ # Exclude very fast responses as likely cached
+ non_cached_times <- current_times[current_times >= cache_threshold]
+
+ if (length(non_cached_times) >= 1) {
+ # Use median of recent non-cached times for more stable estimates
+ recent_window <- min(window_size, length(non_cached_times))
+ recent_times <- tail(non_cached_times, recent_window)
+ time_per_item <- median(recent_times)
+
+ remaining_items <- total_items - current_item
+ eta_seconds <- time_per_item * remaining_items
+ estimated_completion_time <- Sys.time() + eta_seconds
+
+ return(list(
+ eta_seconds = eta_seconds,
+ estimated_completion_time = estimated_completion_time,
+ is_cached = is_cached
+ ))
+ } else {
+ # All responses so far appear cached
+ return(list(eta_seconds = NA, estimated_completion_time = NA, is_cached = is_cached))
+ }
+}
+
+#' Format ETA information for display
+#'
+#' Helper function to format ETA information consistently across different methods.
+#'
+#' @param eta_seconds numeric ETA in seconds (can be NA)
+#' @param estimated_completion_time POSIXct estimated completion time (can be NA)
+#' @return character string with formatted ETA or empty string if NA
+#' @keywords internal
+format_eta_display <- function(eta_seconds, estimated_completion_time) {
+ if (is.na(eta_seconds) || is.na(estimated_completion_time)) {
+ return("")
+ }
+
+ completion_time_str <- format(estimated_completion_time, "%Y-%m-%d %H:%M:%S")
+ paste0(", ETA: ", format_duration(eta_seconds), " (", completion_time_str, ")")
+}
+
+#' Get cache indicator string
+#'
+#' Helper function to generate cache indicator for logging.
+#'
+#' @param is_cached logical indicating if the item was cached
+#' @param cache_threshold minimum time threshold for non-cached items
+#' @return character string with cache indicator or empty string
+#' @keywords internal
+get_cache_indicator <- function(is_cached, cache_threshold = 0.1) {
+ if (is_cached) " [cached]" else ""
+}
diff --git a/tests/testthat/test-corpusStats-eta.R b/tests/testthat/test-corpusStats-eta.R
index f95ccaa..96d3c12 100644
--- a/tests/testthat/test-corpusStats-eta.R
+++ b/tests/testthat/test-corpusStats-eta.R
@@ -132,6 +132,71 @@
)
})
+test_that("fetchNext ETA calculation with offset works correctly", {
+ skip_if_offline()
+ kco <- KorAPConnection(verbose = TRUE, cache = FALSE)
+
+ # Create a query and fetchNext with offset
+ temp_file <- tempfile()
+ sink(temp_file)
+
+ kqo <- corpusQuery(kco, 'geht', metadataOnly = TRUE)
+ result <- fetchNext(kqo, offset = 1000, maxFetch = 200)
+ cat("\n")
+
+ sink()
+
+ # Read the captured output
+ output <- readLines(temp_file)
+ unlink(temp_file)
+
+ # Echo the output to console for debugging
+ cat("\nCaptured output from fetchNext with offset:\n")
+ cat(paste(output, collapse = "\n"))
+
+ # Combined output string for all tests - strip ANSI color codes
+ output_str <- paste(output, collapse = "\n")
+ # Remove ANSI escape sequences
+ output_str <- gsub("\\033\\[[0-9;]*m", "", output_str)
+
+ # Test 1: Check that page numbers are reasonable (not showing huge totals like 5504)
+ if (grepl("Retrieved page", output_str)) {
+ # Extract the denominator from "Retrieved page X/Y"
+ page_match <- regmatches(output_str, regexpr("Retrieved page \\d+/(\\d+)", output_str))
+ if (length(page_match) > 0) {
+ denominator <- as.numeric(sub("Retrieved page \\d+/(\\d+)", "\\1", page_match[1]))
+ expect_true(denominator <= 10,
+ info = paste("Page denominator should be reasonable, got:", denominator))
+ }
+ }
+
+ # Test 2: Check that ETA format is present and reasonable
+ expect_match(
+ output_str,
+ "ETA: \\d+[smhd]",
+ info = "ETA should be displayed with time unit"
+ )
+
+ # Test 3: Check that completion time format is present
+ expect_match(
+ output_str,
+ "\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}",
+ info = "Completion time should be displayed in proper format"
+ )
+
+ # Test 4: Check that completion time is reasonable (within 1 hour of current time)
+ if (grepl("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", output_str)) {
+ completion_match <- regmatches(output_str, regexpr("\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}", output_str))
+ if (length(completion_match) > 0) {
+ completion_time <- as.POSIXct(completion_match[1])
+ current_time <- Sys.time()
+ time_diff <- abs(as.numeric(difftime(completion_time, current_time, units = "hours")))
+ expect_true(time_diff <= 1,
+ info = paste("Completion time should be within 1 hour of current time, got:", time_diff, "hours"))
+ }
+ }
+})
+
test_that("corpusStats handles long VC definitions with truncation", {
# skip_if_offline()
kco <- KorAPConnection(verbose = TRUE, cache = FALSE)