Centralize ETA logging

Change-Id: If3661c8a9be059325d9cb86ddf4421e05ce72931
diff --git a/R/KorAPCorpusStats.R b/R/KorAPCorpusStats.R
index 8097637..7789945 100644
--- a/R/KorAPCorpusStats.R
+++ b/R/KorAPCorpusStats.R
@@ -70,50 +70,24 @@
 
       # Calculate timing and ETA after first few items, using cache-aware approach
       if (i >= 2) {
-        # Use recent non-cached times for better ETA estimates
-        # Exclude very fast responses (< 0.1s) as likely cached
-        non_cached_times <- individual_times[1:i][individual_times[1:i] >= 0.1]
+        eta_info <- calculate_sophisticated_eta(individual_times, i, total_items)
+        cache_indicator <- get_cache_indicator(eta_info$is_cached)
+        eta_display <- format_eta_display(eta_info$eta_seconds, eta_info$estimated_completion_time)
 
-        if (length(non_cached_times) >= 1) {
-          # Use median of recent non-cached times for more stable estimates
-          recent_window <- min(5, length(non_cached_times))
-          recent_times <- tail(non_cached_times, recent_window)
-          time_per_item <- median(recent_times)
-
-          remaining_items <- total_items - i
-          eta_seconds <- time_per_item * remaining_items
-          estimated_completion_time <- Sys.time() + eta_seconds
-
-          # Show current item time and cache status
-          cache_indicator <- if (individual_times[i] < 0.1) " [cached]" else ""
-
-          log_info(verbose, sprintf(
-            "Processed vc %s/%d: \"%s\" in %4.1fs%s, ETA: %s (%s\n",
-            current_item_formatted,
-            total_items,
-            vc_display,
-            individual_times[i],
-            cache_indicator,
-            format_duration(eta_seconds),
-            format(estimated_completion_time, "%Y-%m-%d %H:%M:%S)")
-          ))
-        } else {
-          # All responses so far appear cached, show without ETA
-          cache_indicator <- if (individual_times[i] < 0.1) " [cached]" else ""
-          log_info(verbose, sprintf(
-            "Processed vc %s/%d: \"%s\" in %4.1fs%s\n",
-            current_item_formatted,
-            total_items,
-            vc_display,
-            individual_times[i],
-            cache_indicator
-          ))
-        }
-      } else {
-        # Log first item without ETA
-        cache_indicator <- if (individual_times[i] < 0.1) " [cached]" else ""
         log_info(verbose, sprintf(
-          "Processed VC %s/%d: \"%s\" (%4.1fs%s)\n",
+          "Processed vc %s/%d: \"%s\" in %4.1fs%s%s\n",
+          current_item_formatted,
+          total_items,
+          vc_display,
+          individual_times[i],
+          cache_indicator,
+          eta_display
+        ))
+      } else {
+        # First item, show without ETA
+        cache_indicator <- get_cache_indicator(individual_times[i] < 0.1)
+        log_info(verbose, sprintf(
+          "Processed vc %s/%d: \"%s\" in %4.1fs%s\n",
           current_item_formatted,
           total_items,
           vc_display,
diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index 41d8a2b..45425b6 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R
@@ -578,25 +578,7 @@
         estimated_remaining_seconds <- remaining_pages * time_per_page
         estimated_completion_time <- Sys.time() + estimated_remaining_seconds
 
-        # Format time nicely
-        format_duration <- function(seconds) {
-          if (is.na(seconds) || seconds < 0) {
-            # Instead of "N/A", return "00s" as a fallback
-            return("00s")
-          }
-          days <- floor(seconds / (24 * 3600))
-          seconds <- seconds %% (24 * 3600)
-          hours <- floor(seconds / 3600)
-          seconds <- seconds %% 3600
-          minutes <- floor(seconds / 60)
-          seconds <- floor(seconds %% 60)
-          paste0(
-            if (days > 0) paste0(days, "d ") else "",
-            if (hours > 0 || days > 0) paste0(sprintf("%02d", hours), "h ") else "",
-            if (minutes > 0 || hours > 0 || days > 0) paste0(sprintf("%02d", minutes), "m ") else "",
-            paste0(sprintf("%02d", seconds), "s")
-          )
-        }
+        # Format time nicely using centralized function from logging.R
 
         eta_str <- format_duration(estimated_remaining_seconds)
         completion_time_str <- format(estimated_completion_time, "%Y-%m-%d %H:%M:%S")
diff --git a/R/logging.R b/R/logging.R
index 55be5cf..47c7ed5 100644
--- a/R/logging.R
+++ b/R/logging.R
@@ -4,6 +4,7 @@
 #' for progress reporting and ETA calculations.
 
 #' Log informational messages with optional coloring
+#' @importFrom stats median
 #'
 #' @param v logical flag indicating whether to output the message
 #' @param ... message components to concatenate and display
@@ -75,3 +76,81 @@
 
   paste0(". ETA: ", format_duration(eta_seconds), " (", completion_time_str, ")")
 }
+
+#' Calculate sophisticated ETA using median of recent non-cached times
+#'
+#' Advanced ETA calculation that excludes cached responses and uses median
+#' of recent timing data for more stable estimates. This is particularly
+#' useful for operations where some responses may be cached and much faster.
+#'
+#' @param individual_times numeric vector of individual item processing times
+#' @param current_item current item number (1-based)
+#' @param total_items total number of items to process
+#' @param cache_threshold minimum time in seconds to consider as non-cached (default: 0.1)
+#' @param window_size number of recent non-cached times to use for median calculation (default: 5)
+#' @return list with eta_seconds, estimated_completion_time, and is_cached flag
+#' @keywords internal
+calculate_sophisticated_eta <- function(individual_times, current_item, total_items, 
+                                       cache_threshold = 0.1, window_size = 5) {
+  if (current_item < 2) {
+    return(list(eta_seconds = NA, estimated_completion_time = NA, is_cached = FALSE))
+  }
+  
+  # Get times up to current item
+  current_times <- individual_times[1:current_item]
+  current_time <- individual_times[current_item]
+  is_cached <- current_time < cache_threshold
+  
+  # Use recent non-cached times for better ETA estimates
+  # Exclude very fast responses as likely cached
+  non_cached_times <- current_times[current_times >= cache_threshold]
+  
+  if (length(non_cached_times) >= 1) {
+    # Use median of recent non-cached times for more stable estimates
+    recent_window <- min(window_size, length(non_cached_times))
+    recent_times <- tail(non_cached_times, recent_window)
+    time_per_item <- median(recent_times)
+    
+    remaining_items <- total_items - current_item
+    eta_seconds <- time_per_item * remaining_items
+    estimated_completion_time <- Sys.time() + eta_seconds
+    
+    return(list(
+      eta_seconds = eta_seconds,
+      estimated_completion_time = estimated_completion_time,
+      is_cached = is_cached
+    ))
+  } else {
+    # All responses so far appear cached
+    return(list(eta_seconds = NA, estimated_completion_time = NA, is_cached = is_cached))
+  }
+}
+
+#' Format ETA information for display
+#'
+#' Helper function to format ETA information consistently across different methods.
+#'
+#' @param eta_seconds numeric ETA in seconds (can be NA)
+#' @param estimated_completion_time POSIXct estimated completion time (can be NA)
+#' @return character string with formatted ETA or empty string if NA
+#' @keywords internal
+format_eta_display <- function(eta_seconds, estimated_completion_time) {
+  if (is.na(eta_seconds) || is.na(estimated_completion_time)) {
+    return("")
+  }
+  
+  completion_time_str <- format(estimated_completion_time, "%Y-%m-%d %H:%M:%S")
+  paste0(", ETA: ", format_duration(eta_seconds), " (", completion_time_str, ")")
+}
+
+#' Get cache indicator string
+#'
+#' Helper function to generate cache indicator for logging.
+#'
+#' @param is_cached logical indicating if the item was cached
+#' @param cache_threshold minimum time threshold for non-cached items
+#' @return character string with cache indicator or empty string
+#' @keywords internal
+get_cache_indicator <- function(is_cached, cache_threshold = 0.1) {
+  if (is_cached) " [cached]" else ""
+}