Centralize ETA logging
Change-Id: If3661c8a9be059325d9cb86ddf4421e05ce72931
diff --git a/R/KorAPCorpusStats.R b/R/KorAPCorpusStats.R
index 8097637..7789945 100644
--- a/R/KorAPCorpusStats.R
+++ b/R/KorAPCorpusStats.R
@@ -70,50 +70,24 @@
# Calculate timing and ETA after first few items, using cache-aware approach
if (i >= 2) {
- # Use recent non-cached times for better ETA estimates
- # Exclude very fast responses (< 0.1s) as likely cached
- non_cached_times <- individual_times[1:i][individual_times[1:i] >= 0.1]
+ eta_info <- calculate_sophisticated_eta(individual_times, i, total_items)
+ cache_indicator <- get_cache_indicator(eta_info$is_cached)
+ eta_display <- format_eta_display(eta_info$eta_seconds, eta_info$estimated_completion_time)
- if (length(non_cached_times) >= 1) {
- # Use median of recent non-cached times for more stable estimates
- recent_window <- min(5, length(non_cached_times))
- recent_times <- tail(non_cached_times, recent_window)
- time_per_item <- median(recent_times)
-
- remaining_items <- total_items - i
- eta_seconds <- time_per_item * remaining_items
- estimated_completion_time <- Sys.time() + eta_seconds
-
- # Show current item time and cache status
- cache_indicator <- if (individual_times[i] < 0.1) " [cached]" else ""
-
- log_info(verbose, sprintf(
- "Processed vc %s/%d: \"%s\" in %4.1fs%s, ETA: %s (%s\n",
- current_item_formatted,
- total_items,
- vc_display,
- individual_times[i],
- cache_indicator,
- format_duration(eta_seconds),
- format(estimated_completion_time, "%Y-%m-%d %H:%M:%S)")
- ))
- } else {
- # All responses so far appear cached, show without ETA
- cache_indicator <- if (individual_times[i] < 0.1) " [cached]" else ""
- log_info(verbose, sprintf(
- "Processed vc %s/%d: \"%s\" in %4.1fs%s\n",
- current_item_formatted,
- total_items,
- vc_display,
- individual_times[i],
- cache_indicator
- ))
- }
- } else {
- # Log first item without ETA
- cache_indicator <- if (individual_times[i] < 0.1) " [cached]" else ""
log_info(verbose, sprintf(
- "Processed VC %s/%d: \"%s\" (%4.1fs%s)\n",
+ "Processed vc %s/%d: \"%s\" in %4.1fs%s%s\n",
+ current_item_formatted,
+ total_items,
+ vc_display,
+ individual_times[i],
+ cache_indicator,
+ eta_display
+ ))
+ } else {
+ # First item, show without ETA
+ cache_indicator <- get_cache_indicator(individual_times[i] < 0.1)
+ log_info(verbose, sprintf(
+ "Processed vc %s/%d: \"%s\" in %4.1fs%s\n",
current_item_formatted,
total_items,
vc_display,
diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index 41d8a2b..45425b6 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R
@@ -578,25 +578,7 @@
estimated_remaining_seconds <- remaining_pages * time_per_page
estimated_completion_time <- Sys.time() + estimated_remaining_seconds
- # Format time nicely
- format_duration <- function(seconds) {
- if (is.na(seconds) || seconds < 0) {
- # Instead of "N/A", return "00s" as a fallback
- return("00s")
- }
- days <- floor(seconds / (24 * 3600))
- seconds <- seconds %% (24 * 3600)
- hours <- floor(seconds / 3600)
- seconds <- seconds %% 3600
- minutes <- floor(seconds / 60)
- seconds <- floor(seconds %% 60)
- paste0(
- if (days > 0) paste0(days, "d ") else "",
- if (hours > 0 || days > 0) paste0(sprintf("%02d", hours), "h ") else "",
- if (minutes > 0 || hours > 0 || days > 0) paste0(sprintf("%02d", minutes), "m ") else "",
- paste0(sprintf("%02d", seconds), "s")
- )
- }
+ # Format time nicely using centralized function from logging.R
eta_str <- format_duration(estimated_remaining_seconds)
completion_time_str <- format(estimated_completion_time, "%Y-%m-%d %H:%M:%S")
diff --git a/R/logging.R b/R/logging.R
index 55be5cf..47c7ed5 100644
--- a/R/logging.R
+++ b/R/logging.R
@@ -4,6 +4,7 @@
#' for progress reporting and ETA calculations.
#' Log informational messages with optional coloring
+#' @importFrom stats median
#'
#' @param v logical flag indicating whether to output the message
#' @param ... message components to concatenate and display
@@ -75,3 +76,81 @@
paste0(". ETA: ", format_duration(eta_seconds), " (", completion_time_str, ")")
}
+
+#' Calculate sophisticated ETA using median of recent non-cached times
+#'
+#' Advanced ETA calculation that excludes cached responses and uses median
+#' of recent timing data for more stable estimates. This is particularly
+#' useful for operations where some responses may be cached and much faster.
+#'
+#' @param individual_times numeric vector of individual item processing times
+#' @param current_item current item number (1-based)
+#' @param total_items total number of items to process
+#' @param cache_threshold minimum time in seconds to consider as non-cached (default: 0.1)
+#' @param window_size number of recent non-cached times to use for median calculation (default: 5)
+#' @return list with eta_seconds, estimated_completion_time, and is_cached flag
+#' @keywords internal
+calculate_sophisticated_eta <- function(individual_times, current_item, total_items,
+ cache_threshold = 0.1, window_size = 5) {
+ if (current_item < 2) {
+ return(list(eta_seconds = NA, estimated_completion_time = NA, is_cached = FALSE))
+ }
+
+ # Get times up to current item
+ current_times <- individual_times[1:current_item]
+ current_time <- individual_times[current_item]
+ is_cached <- current_time < cache_threshold
+
+ # Use recent non-cached times for better ETA estimates
+ # Exclude very fast responses as likely cached
+ non_cached_times <- current_times[current_times >= cache_threshold]
+
+ if (length(non_cached_times) >= 1) {
+ # Use median of recent non-cached times for more stable estimates
+ recent_window <- min(window_size, length(non_cached_times))
+ recent_times <- tail(non_cached_times, recent_window)
+ time_per_item <- median(recent_times)
+
+ remaining_items <- total_items - current_item
+ eta_seconds <- time_per_item * remaining_items
+ estimated_completion_time <- Sys.time() + eta_seconds
+
+ return(list(
+ eta_seconds = eta_seconds,
+ estimated_completion_time = estimated_completion_time,
+ is_cached = is_cached
+ ))
+ } else {
+ # All responses so far appear cached
+ return(list(eta_seconds = NA, estimated_completion_time = NA, is_cached = is_cached))
+ }
+}
+
+#' Format ETA information for display
+#'
+#' Helper function to format ETA information consistently across different methods.
+#'
+#' @param eta_seconds numeric ETA in seconds (can be NA)
+#' @param estimated_completion_time POSIXct estimated completion time (can be NA)
+#' @return character string with formatted ETA or empty string if NA
+#' @keywords internal
+format_eta_display <- function(eta_seconds, estimated_completion_time) {
+ if (is.na(eta_seconds) || is.na(estimated_completion_time)) {
+ return("")
+ }
+
+ completion_time_str <- format(estimated_completion_time, "%Y-%m-%d %H:%M:%S")
+ paste0(", ETA: ", format_duration(eta_seconds), " (", completion_time_str, ")")
+}
+
+#' Get cache indicator string
+#'
+#' Helper function to generate cache indicator for logging.
+#'
+#' @param is_cached logical indicating if the item was cached
+#' @param cache_threshold minimum time threshold for non-cached items
+#' @return character string with cache indicator or empty string
+#' @keywords internal
+get_cache_indicator <- function(is_cached, cache_threshold = 0.1) {
+ if (is_cached) " [cached]" else ""
+}