Add ETA logging also for corpus/frequency query
Change-Id: I6bc9b65db23c1e35a0764053c2e28faa1f442655
diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index dc9294b..93f5458 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R
@@ -190,10 +190,127 @@
context = NULL) {
if (length(query) > 1 || length(vc) > 1) {
grid <- if (expand) expand_grid(query = query, vc = vc) else tibble(query = query, vc = vc)
- purrr::pmap(grid, function(query, vc, ...) {
- corpusQuery(kco, query = query, vc = vc, ql = ql, verbose = verbose, as.df = TRUE)
- }) %>%
- bind_rows()
+
+ # Initialize timing variables for ETA calculation
+ total_queries <- nrow(grid)
+ current_query <- 0
+ start_time <- Sys.time()
+
+ # Helper function to format duration
+ format_duration <- function(seconds) {
+ if (is.na(seconds) || seconds < 0) {
+ return("00s")
+ }
+ days <- floor(seconds / (24 * 3600))
+ seconds <- seconds %% (24 * 3600)
+ hours <- floor(seconds / 3600)
+ seconds <- seconds %% 3600
+ minutes <- floor(seconds / 60)
+ seconds <- floor(seconds %% 60)
+ paste0(
+ if (days > 0) paste0(days, "d ") else "",
+ if (hours > 0 || days > 0) paste0(sprintf("%02d", hours), "h ") else "",
+ if (minutes > 0 || hours > 0 || days > 0) paste0(sprintf("%02d", minutes), "m ") else "",
+ paste0(sprintf("%02d", seconds), "s")
+ )
+ }
+
+ results <- purrr::pmap(grid, function(query, vc, ...) {
+ current_query <<- current_query + 1
+
+ # Execute the single query directly (avoiding recursive call)
+ contentFields <- c("snippet", "tokens")
+ query_fields <- fields
+ if (metadataOnly) {
+ query_fields <- query_fields[!query_fields %in% contentFields]
+ }
+ if (!"textSigle" %in% query_fields) {
+ query_fields <- c(query_fields, "textSigle")
+ }
+ request <-
+ paste0(
+ "?q=",
+ url_encode(enc2utf8(query)),
+ ifelse(!metadataOnly && !is.null(context) && context != "", paste0("&context=", url_encode(enc2utf8(context))), ""),
+ ifelse(vc != "", paste0("&cq=", url_encode(enc2utf8(vc))), ""),
+ ifelse(!metadataOnly, "&show-tokens=true", ""),
+ "&ql=", ql
+ )
+ webUIRequestUrl <- paste0(kco@KorAPUrl, request)
+ requestUrl <- paste0(
+ kco@apiUrl,
+ "search",
+ request,
+ "&fields=",
+ paste(query_fields, collapse = ","),
+ if (metadataOnly) "&access-rewrite-disabled=true" else ""
+ )
+
+ # Show individual query progress
+ log_info(verbose, "\rSearching \"", query, "\" in \"", vc, "\"", sep = "")
+ res <- apiCall(kco, paste0(requestUrl, "&count=0"))
+ if (is.null(res)) {
+ log_info(verbose, ": API call failed\n")
+ totalResults <- 0
+ } else {
+ totalResults <- as.integer(res$meta$totalResults)
+ log_info(verbose, ": ", totalResults, " hits")
+ if (!is.null(res$meta$cached)) {
+ log_info(verbose, " [cached]")
+ } else if (!is.null(res$meta$benchmark)) {
+ if (is.character(res$meta$benchmark) && grepl("s$", res$meta$benchmark)) {
+ time_value <- as.numeric(sub("s$", "", res$meta$benchmark))
+ formatted_time <- paste0(round(time_value, 2), "s")
+ log_info(verbose, ", took ", formatted_time)
+ } else {
+ log_info(verbose, ", took ", res$meta$benchmark)
+ }
+ }
+ log_info(verbose, "\n")
+ }
+
+ result <- data.frame(
+ query = query,
+ totalResults = totalResults,
+ vc = vc,
+ webUIRequestUrl = webUIRequestUrl,
+ stringsAsFactors = FALSE
+ )
+
+ # Calculate and display ETA information if verbose and we have more than one query
+ if (verbose && total_queries > 1) {
+ elapsed_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
+
+ if (current_query > 1) { # Only calculate ETA after the first query
+ avg_time_per_query <- elapsed_time / current_query
+ remaining_queries <- total_queries - current_query
+ estimated_remaining_seconds <- remaining_queries * avg_time_per_query
+ estimated_completion_time <- Sys.time() + estimated_remaining_seconds
+
+ eta_str <- format_duration(estimated_remaining_seconds)
+ completion_time_str <- format(estimated_completion_time, "%Y-%m-%d %H:%M:%S")
+
+ # Create progress display
+ progress_display <- paste0(
+ "Query ",
+ sprintf(paste0("%", nchar(total_queries), "d"), current_query),
+ "/",
+ sprintf("%d", total_queries),
+ " completed. Avg: ",
+ sprintf("%.1f", avg_time_per_query),
+ "s/query. ETA: ",
+ eta_str,
+ " (", completion_time_str, ")"
+ )
+
+ log_info(verbose, progress_display, "\n")
+ }
+ }
+
+ return(result)
+ })
+
+ results %>% bind_rows()
} else {
contentFields <- c("snippet", "tokens")
if (metadataOnly) {
diff --git a/tests/testthat/test-corpusquery-eta.R b/tests/testthat/test-corpusquery-eta.R
new file mode 100644
index 0000000..d3f3ae1
--- /dev/null
+++ b/tests/testthat/test-corpusquery-eta.R
@@ -0,0 +1,191 @@
+test_that("corpusQuery displays ETA with multiple queries", {
+ skip_if_offline() # Commented out for testing
+ kco <- KorAPConnection(verbose = TRUE, cache = FALSE)
+
+ # Use simple queries to ensure they complete quickly
+ query <- c("Test", "der")
+ vc <- c("pubDate in 2020", "pubDate in 2021")
+
+ # Capture output from corpusQuery with multiple queries
+ temp_file <- tempfile()
+ sink(temp_file)
+ result <- corpusQuery(kco, query = query, vc = vc, metadataOnly = TRUE, as.df = TRUE, expand = TRUE)
+ cat("\n")
+ sink()
+
+ # Read the captured output
+ output <- readLines(temp_file)
+ unlink(temp_file)
+
+ # Echo the output to console for debugging
+ cat("\nCaptured output from corpusQuery with multiple queries:\n")
+ cat(paste(output, collapse = "\n"))
+
+ # Combined output string for all tests - strip ANSI color codes
+ output_str <- paste(output, collapse = "\n")
+ # Remove ANSI escape sequences
+ output_str <- gsub("\\033\\[[0-9;]*m", "", output_str)
+
+ # Test 1: Check that query progress is shown (format: "Query X/Y completed")
+ expect_match(
+ output_str,
+ "Query \\d+/\\d+ completed",
+ info = "Query progress counter not found in output"
+ )
+
+ # Test 2: Check that ETA is displayed (should contain digits followed by 's')
+ expect_match(
+ output_str,
+ "ETA: \\d+s",
+ info = "ETA format should show digits followed by 's'"
+ )
+
+ # Test 3: Check that completion time is shown
+ expect_match(
+ output_str,
+ "\\(\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\)",
+ info = "Completion time format not found in output"
+ )
+
+ # Test 4: Check that we get results for all query combinations
+ # Note: with expand=TRUE (default), we should get length(query) * length(vc) results
+ expect_equal(nrow(result), length(query) * length(vc),
+ info = paste("Should get results for all query/vc combinations. Got:", nrow(result), "Expected:", length(query) * length(vc)))
+})
+
+test_that("corpusQuery ETA works with frequencyQuery", {
+ skip_if_offline() # Commented out for testing
+ kco <- KorAPConnection(verbose = TRUE, cache = FALSE)
+
+ # Test the exact pattern from the user's example (but smaller)
+ query <- c("macht []{0,3} Sinn", "ergibt []{0,3} Sinn")
+ years <- c(2020:2021) # Just 2 years for testing
+ as.alternatives <- TRUE
+ vc <- "textType = /Zeit.*/ & pubDate in"
+
+ # Capture output from frequencyQuery which calls corpusQuery internally
+ temp_file <- tempfile()
+ sink(temp_file)
+ result <- frequencyQuery(kco, query, paste(vc, years), as.alternatives = as.alternatives)
+ cat("\n")
+ sink()
+
+ # Read the captured output
+ output <- readLines(temp_file)
+ unlink(temp_file)
+
+ # Echo the output to console for debugging
+ cat("\nCaptured output from frequencyQuery with ETA:\n")
+ cat(paste(output, collapse = "\n"))
+
+ # Combined output string for all tests - strip ANSI color codes
+ output_str <- paste(output, collapse = "\n")
+ # Remove ANSI escape sequences
+ output_str <- gsub("\\033\\[[0-9;]*m", "", output_str)
+
+ # Test 1: Check that multiple queries are processed (format: "Query X/Y completed")
+ expect_match(
+ output_str,
+ "Query \\d+/\\d+ completed",
+ info = "Query progress should be shown for multiple queries"
+ )
+
+ # Test 2: Check that ETA is displayed when processing multiple queries
+ expect_match(
+ output_str,
+ "ETA:",
+ info = "ETA should be displayed when processing multiple queries"
+ )
+
+ # Test 3: Check that we get results
+ expect_true(nrow(result) > 0,
+ info = "Should get frequency query results")
+
+ # Test 4: Check that result has expected columns
+ expect_true(all(c("query", "vc", "totalResults") %in% names(result)),
+ info = "Result should contain expected columns")
+})
+
+test_that("corpusQuery ETA only displays with verbose=TRUE and multiple queries", {
+ skip_if_offline() # Commented out for testing
+
+ # Test with verbose=FALSE - should not show ETA
+ kco_quiet <- KorAPConnection(verbose = FALSE, cache = FALSE)
+ query <- c("Test", "der")
+ vc <- c("pubDate in 2020", "pubDate in 2021")
+
+ # Capture output with verbose=FALSE
+ temp_file <- tempfile()
+ sink(temp_file)
+ result1 <- corpusQuery(kco_quiet, query = query, vc = vc, metadataOnly = TRUE, as.df = TRUE)
+ cat("\n")
+ sink()
+
+ output <- readLines(temp_file)
+ unlink(temp_file)
+ output_str <- paste(output, collapse = "\n")
+ # Remove ANSI escape sequences
+ output_str <- gsub("\\033\\[[0-9;]*m", "", output_str)
+
+ # Should not contain ETA information when verbose=FALSE
+ expect_false(grepl("ETA:", output_str),
+ info = "ETA should not be displayed when verbose=FALSE")
+
+ # Test with single query - should not show ETA even with verbose=TRUE
+ kco_verbose <- KorAPConnection(verbose = TRUE, cache = FALSE)
+ temp_file2 <- tempfile()
+ sink(temp_file2)
+ result2 <- corpusQuery(kco_verbose, query = "Test", vc = "pubDate in 2020",
+ metadataOnly = TRUE, as.df = TRUE)
+ cat("\n")
+ sink()
+
+ output2 <- readLines(temp_file2)
+ unlink(temp_file2)
+ output_str2 <- paste(output2, collapse = "\n")
+ # Remove ANSI escape sequences
+ output_str2 <- gsub("\\033\\[[0-9;]*m", "", output_str2)
+
+ # Should not contain ETA for single query
+ expect_false(grepl("ETA:", output_str2),
+ info = "ETA should not be displayed for single queries")
+})
+
+test_that("corpusQuery ETA format_duration function works correctly", {
+ # This tests the internal format_duration function indirectly
+ # by checking that ETA displays reasonable time formats
+ skip_if_offline() # Commented out for testing
+ kco <- KorAPConnection(verbose = TRUE, cache = FALSE)
+
+ # Use multiple queries to trigger ETA display
+ query <- c("Test", "der", "und")
+ vc <- c("pubDate in 2020", "pubDate in 2021")
+
+ # Capture output
+ temp_file <- tempfile()
+ sink(temp_file)
+ result <- corpusQuery(kco, query = query, vc = vc, metadataOnly = TRUE, as.df = TRUE)
+ cat("\n")
+ sink()
+
+ output <- readLines(temp_file)
+ unlink(temp_file)
+ output_str <- paste(output, collapse = "\n")
+ # Remove ANSI escape sequences
+ output_str <- gsub("\\033\\[[0-9;]*m", "", output_str)
+
+ # Check that ETA contains reasonable time format (digits followed by 's')
+ # This indirectly tests that format_duration is working
+ expect_match(
+ output_str,
+ "ETA: \\d+s",
+ info = "ETA should display time in seconds format"
+ )
+
+ # Also check for completion time format which uses the same function
+ expect_match(
+ output_str,
+ "\\(\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\)",
+ info = "Completion time should be formatted correctly"
+ )
+})