Add ETA log to corpusStats
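
With multiple virtual corpora, corpusStats() now logs per-item timing, marks
near-instant responses as likely cached, and estimates the remaining time from
the median of recent non-cached request durations.

Illustrative call and progress output (timings and the cache hit below are a
sketch derived from the new format strings, not captured log output):

    kco <- KorAPConnection(verbose = TRUE)
    corpusStats(kco, vc = c("pubDate in 2020", "pubDate in 2021",
                            "pubDate in 2022"), as.df = TRUE)
    # Processed vc 1/3: "pubDate in 2020" in  1.2s
    # Processed vc 2/3: "pubDate in 2021" in  0.0s [cached], ETA: 00:01 (<completion time>)
    # Processed vc 3/3: "pubDate in 2022" in  1.1s, ETA: N/A (<completion time>)
    # Completed processing 3 virtual corpora in 00:02 (avg:  0.8s/item, 1 cached, 2 non-cached)
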
Change-Id: I1f2b511c489e1ab6770e742baf9f76e5300a29f7
diff --git a/R/KorAPCorpusStats.R b/R/KorAPCorpusStats.R
index 8639a53..75bd6eb 100644
--- a/R/KorAPCorpusStats.R
+++ b/R/KorAPCorpusStats.R
@@ -12,14 +12,14 @@
#' @slot sentences number of sentences
#' @slot paragraphs number of paragraphs
#' @slot webUIRequestUrl link to the web user interface with the current vc definition
-setClass("KorAPCorpusStats", slots=c(vc="character", documents="numeric", tokens="numeric", sentences="numeric", paragraphs="numeric", webUIRequestUrl="character" ))
+setClass("KorAPCorpusStats", slots = c(vc = "character", documents = "numeric", tokens = "numeric", sentences = "numeric", paragraphs = "numeric", webUIRequestUrl = "character"))
-log_info <- function(v, ...) {
+log_info <- function(v, ...) {
green <- "\033[32m"
reset <- "\033[0m"
cat(ifelse(v, paste0(green, ..., reset), ""))
}
-setGeneric("corpusStats", function(kco, ...) standardGeneric("corpusStats") )
+setGeneric("corpusStats", function(kco, ...) standardGeneric("corpusStats"))
#' Fetch information about a (virtual) corpus
#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`)
@@ -30,7 +30,6 @@
#'
#' @importFrom urltools url_encode
#' @examples
-#'
#' \dontrun{
#'
#' kco <- KorAPConnection()
@@ -38,38 +37,157 @@
#' }
#'
#' @aliases corpusStats
+# Helper function to format duration with leading zeros
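+# e.g. format_duration(75) returns "01:15", format_duration(3700) returns "01:01:40"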
+format_duration <- function(seconds) {
+ if (is.na(seconds) || !is.finite(seconds) || seconds <= 0) {
+ return("N/A")
+ }
+
+  # Round first, then split, so the seconds part can never be displayed as 60
+  total_secs <- round(seconds)
+  hours <- total_secs %/% 3600
+  minutes <- (total_secs %% 3600) %/% 60
+  secs <- total_secs %% 60
+
+ if (hours > 0) {
+ return(sprintf("%02d:%02d:%02d", hours, minutes, secs))
+ } else {
+ return(sprintf("%02d:%02d", minutes, secs))
+ }
+}
+
#' @export
-setMethod("corpusStats", "KorAPConnection", function(kco,
- vc = "",
- verbose = kco@verbose,
- as.df = FALSE) {
- if (length(vc) > 1)
- do.call(rbind, Map(function(cq)
- corpusStats(kco, cq, verbose, as.df = TRUE), vc))
- else {
+setMethod("corpusStats", "KorAPConnection", function(kco,
+ vc = "",
+ verbose = kco@verbose,
+ as.df = FALSE) {
+ if (length(vc) > 1) {
+ # ETA calculation for multiple virtual corpora
+ total_items <- length(vc)
+ start_time <- Sys.time()
+ results <- list()
+ individual_times <- numeric(total_items)
+
+ for (i in seq_along(vc)) {
+ current_vc <- vc[i]
+ item_start_time <- Sys.time()
+
+ # Truncate long vc strings for display
+ vc_display <- if (nchar(current_vc) > 50) {
+ paste0(substr(current_vc, 1, 47), "...")
+ } else {
+ current_vc
+ }
+
+ # Process current virtual corpus
+ result <- corpusStats(kco, current_vc, verbose = FALSE, as.df = TRUE)
+ results[[i]] <- result
+
+ # Record individual processing time
+ item_end_time <- Sys.time()
+ individual_times[i] <- as.numeric(difftime(item_end_time, item_start_time, units = "secs"))
+
+ # Format item number with proper alignment
+ current_item_formatted <- sprintf(paste0("%", nchar(total_items), "d"), i)
+
+ # Calculate timing and ETA after first few items, using cache-aware approach
+ if (i >= 2) {
+ # Use recent non-cached times for better ETA estimates
+ # Exclude very fast responses (< 0.1s) as likely cached
+ non_cached_times <- individual_times[1:i][individual_times[1:i] >= 0.1]
+
+ if (length(non_cached_times) >= 1) {
+ # Use median of recent non-cached times for more stable estimates
+ recent_window <- min(5, length(non_cached_times))
+ recent_times <- tail(non_cached_times, recent_window)
+ time_per_item <- median(recent_times)
+
+ remaining_items <- total_items - i
+ eta_seconds <- time_per_item * remaining_items
+ estimated_completion_time <- Sys.time() + eta_seconds
+
+ # Show current item time and cache status
+ cache_indicator <- if (individual_times[i] < 0.1) " [cached]" else ""
+
+ log_info(verbose, sprintf(
+ "Processed vc %s/%d: \"%s\" in %4.1fs%s, ETA: %s (%s\n",
+ current_item_formatted,
+ total_items,
+ vc_display,
+ individual_times[i],
+ cache_indicator,
+ format_duration(eta_seconds),
+ format(estimated_completion_time, "%Y-%m-%d %H:%M:%S)")
+ ))
+ } else {
+ # All responses so far appear cached, show without ETA
+ cache_indicator <- if (individual_times[i] < 0.1) " [cached]" else ""
+ log_info(verbose, sprintf(
+ "Processed vc %s/%d: \"%s\" in %4.1fs%s\n",
+ current_item_formatted,
+ total_items,
+ vc_display,
+ individual_times[i],
+ cache_indicator
+ ))
+ }
+ } else {
+ # Log first item without ETA
+ cache_indicator <- if (individual_times[i] < 0.1) " [cached]" else ""
+ log_info(verbose, sprintf(
+ "Processed VC %s/%d: \"%s\" (%4.1fs%s)\n",
+ current_item_formatted,
+ total_items,
+ vc_display,
+ individual_times[i],
+ cache_indicator
+ ))
+ }
+ }
+
+ # Final timing summary with cache analysis
+ if (verbose && total_items > 1) {
+ total_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
+ avg_time_per_item <- total_time / total_items
+ cached_count <- sum(individual_times < 0.1)
+ non_cached_count <- total_items - cached_count
+
+ log_info(verbose, sprintf(
+ "Completed processing %d virtual corpora in %s (avg: %4.1fs/item, %d cached, %d non-cached)\n",
+ total_items,
+ format_duration(total_time),
+ avg_time_per_item,
+ cached_count,
+ non_cached_count
+ ))
+ }
+
+ do.call(rbind, results)
+ } else {
url <-
- paste0(kco@apiUrl,
- 'statistics?cq=',
- URLencode(enc2utf8(vc), reserved = TRUE))
+ paste0(
+ kco@apiUrl,
+ "statistics?cq=",
+ URLencode(enc2utf8(vc), reserved = TRUE)
+ )
log_info(verbose, "Getting size of virtual corpus \"", vc, "\"", sep = "")
res <- apiCall(kco, url)
webUIRequestUrl <- paste0(kco@KorAPUrl, sprintf("?q=<base/s=t>&cq=%s", url_encode(enc2utf8(vc))))
- if(is.null(res)) {
- res <- data.frame(documents=NA, tokens=NA, sentences=NA, paragraphs=NA)
+ if (is.null(res)) {
+ res <- data.frame(documents = NA, tokens = NA, sentences = NA, paragraphs = NA)
}
log_info(verbose, ": ", res$tokens, " tokens\n")
- if (as.df)
+ if (as.df) {
data.frame(vc = vc, webUIRequestUrl = webUIRequestUrl, res, stringsAsFactors = FALSE)
- else
+ } else {
new(
"KorAPCorpusStats",
vc = vc,
documents = ifelse(is.logical(res$documents), 0, res$documents),
tokens = ifelse(is.logical(res$tokens), 0, res$tokens),
- sentences = ifelse(is.logical(res$documents), 0,res$sentences),
+ sentences = ifelse(is.logical(res$documents), 0, res$sentences),
paragraphs = ifelse(is.logical(res$paragraphs), 0, res$paragraphs),
webUIRequestUrl = webUIRequestUrl
)
+ }
}
})
@@ -81,9 +199,11 @@
if (object@vc == "") {
cat("The whole corpus")
} else {
- cat("The virtual corpus described by \"", object@vc, "\"", sep="")
+ cat("The virtual corpus described by \"", object@vc, "\"", sep = "")
}
- cat(" contains", formatC(object@tokens, format="f", digits=0, big.mark=","), "tokens in",
- formatC(object@sentences, format="d", big.mark=","), "sentences in",
- formatC(object@documents, format="d", big.mark=","), "documents.\n")
+ cat(
+ " contains", formatC(object@tokens, format = "f", digits = 0, big.mark = ","), "tokens in",
+ formatC(object@sentences, format = "d", big.mark = ","), "sentences in",
+ formatC(object@documents, format = "d", big.mark = ","), "documents.\n"
+ )
})
diff --git a/tests/testthat/test-corpusStats-eta.R b/tests/testthat/test-corpusStats-eta.R
new file mode 100644
index 0000000..5a57079
--- /dev/null
+++ b/tests/testthat/test-corpusStats-eta.R
@@ -0,0 +1,191 @@
+test_that("corpusStats displays ETA with multiple virtual corpora", {
+ skip_if_offline()
+ kco <- KorAPConnection(verbose = TRUE, cache = FALSE)
+
+ # Use different virtual corpora to ensure varied processing times
+ vc_list <- c(
+ "pubDate in 2020",
+ "pubDate in 2021",
+ "textType = /.*zeitung.*/i"
+ )
+
+ # Capture output from corpusStats with multiple VCs
+ temp_file <- tempfile()
+ sink(temp_file)
+ result <- corpusStats(kco, vc = vc_list, as.df = TRUE)
+ cat("\n")
+ sink()
+
+ # Read the captured output
+ output <- readLines(temp_file)
+ unlink(temp_file)
+
+ # Echo the output to console for debugging
+ cat("\nCaptured output from corpusStats with multiple VCs:\n")
+ cat(paste(output, collapse = "\n"))
+
+ # Combined output string for all tests - strip ANSI color codes
+ output_str <- paste(output, collapse = "\n")
+ # Remove ANSI escape sequences - improved regex
+ output_str <- gsub("\\033\\[[0-9;]*m", "", output_str)
+
+ # Test 1: Check that VC progress is shown (format: "Processed vc X/Y" or "Processed VC X/Y")
+ expect_match(
+ output_str,
+ "Processed [vV][cC] \\d+/\\d+:",
+ info = "VC progress counter not found in output"
+ )
+
+ # Test 2: Check that individual timing is displayed (either "( X.Xs)" or "in X.Xs")
+ expect_true(
+ grepl("\\(\\s*\\d+\\.\\d+s\\)", output_str) || grepl("in\\s+\\d+\\.\\d+s", output_str),
+ info = "Individual timing format not found in output"
+ )
+
+ # Test 3: Check that ETA is displayed (format like "ETA: MM:SS" or "ETA: HH:MM:SS")
+ expect_match(
+ output_str,
+ "ETA: \\d{2}:\\d{2}",
+ info = "ETA format not found in output"
+ )
+
+ # Test 4: Check that completion time is shown (format: YYYY-MM-DD HH:MM:SS)
+ expect_match(
+ output_str,
+ "\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}",
+ info = "Completion time format not found in output"
+ )
+
+ # Test 5: Check that final summary is displayed
+ expect_match(
+ output_str,
+ "Completed processing \\d+ virtual corpora",
+ info = "Final processing summary not found in output"
+ )
+
+ # Test 6: Check that cache analysis is included in final summary
+ expect_match(
+ output_str,
+ "\\d+ cached, \\d+ non-cached",
+ info = "Cache analysis not found in final summary"
+ )
+
+ # Test 7: Verify we get results for all VCs
+ expect_equal(nrow(result), length(vc_list),
+ info = "Result should contain one row per virtual corpus"
+ )
+
+ # Test 8: Check that VC definitions are properly displayed (should show actual VC values)
+ expect_match(
+ output_str,
+ "pubDate in 2020",
+ info = "First VC definition should be visible in output"
+ )
+})
+
+test_that("corpusStats handles cache detection correctly", {
+  skip_if_offline()
+ kco <- KorAPConnection(verbose = TRUE, cache = TRUE) # Enable caching
+
+ # Use the same VC twice to test cache detection
+ vc_list <- c(
+ "pubDate in 2020",
+ "pubDate in 2020" # This should be cached on second call
+ )
+
+ # Capture output from corpusStats with repeated VCs
+ temp_file <- tempfile()
+ sink(temp_file)
+ result <- corpusStats(kco, vc = vc_list, as.df = TRUE)
+ cat("\n")
+ sink()
+
+ # Read the captured output
+ output <- readLines(temp_file)
+ unlink(temp_file)
+
+ # Echo the output to console for debugging
+ cat("\nCaptured output from corpusStats with cache test:\n")
+ cat(paste(output, collapse = "\n"))
+
+ # Combined output string for all tests - strip ANSI color codes
+ output_str <- paste(output, collapse = "\n")
+ # Remove ANSI escape sequences - improved regex
+ output_str <- gsub("\\033\\[[0-9;]*m", "", output_str)
+
+  # Test 1: If a cache indicator is shown, it should directly follow the timing
+  # (whether the repeated call is actually served from the cache depends on the
+  # connection's cache state, so the indicator itself is optional)
+  if (grepl("\\[cached\\]", output_str)) {
+    expect_match(
+      output_str,
+      "\\d+\\.\\d+s \\[cached\\]",
+      info = "Cache indicator should directly follow the timing information"
+    )
+  }
+
+ # Test 2: Check that timing is still displayed for all items (either "( X.Xs)" or "in X.Xs")
+ expect_true(
+ grepl("\\(\\s*\\d+\\.\\d+s", output_str) || grepl("in\\s+\\d+\\.\\d+s", output_str),
+ info = "Individual timing should still be displayed with caching"
+ )
+
+ # Test 3: Verify we still get correct results
+ expect_equal(nrow(result), length(vc_list),
+ info = "Result should contain one row per virtual corpus even with caching"
+ )
+})
+
+test_that("corpusStats handles long VC definitions with truncation", {
+ # skip_if_offline()
+ kco <- KorAPConnection(verbose = TRUE, cache = FALSE)
+
+ # Create a very long VC definition to test truncation
+ long_vc <- paste0(
+ "pubDate in 2020 & textType = /.*zeitung.*/ & ",
+ "textDomain = /Politik.*/ & foundries = mate/morpho & ",
+ "foundries = opennlp/sentences & textClass = /.*nachrichten.*/"
+ )
+
+ vc_list <- c("pubDate in 2020", long_vc)
+
+ # Capture output from corpusStats with long VC
+ temp_file <- tempfile()
+ sink(temp_file)
+ result <- corpusStats(kco, vc = vc_list, as.df = TRUE)
+ cat("\n")
+ sink()
+
+ # Read the captured output
+ output <- readLines(temp_file)
+ unlink(temp_file)
+
+ # Echo the output to console for debugging
+ cat("\nCaptured output from corpusStats with long VC:\n")
+ cat(paste(output, collapse = "\n"))
+
+ # Combined output string for all tests - strip ANSI color codes
+ output_str <- paste(output, collapse = "\n")
+ # Remove ANSI escape sequences - improved regex
+ output_str <- gsub("\\033\\[[0-9;]*m", "", output_str)
+
+ # Test 1: Check that long VC is truncated (should end with "...")
+ expect_match(
+ output_str,
+ "\\.\\.\\.",
+ info = "Long VC definition should be truncated with ellipsis"
+ )
+
+ # Test 2: Check that short VC is not truncated
+ expect_match(
+ output_str,
+ "\"pubDate in 2020\"",
+ info = "Short VC definition should be displayed in full"
+ )
+
+ # Test 3: Verify we still get correct results despite truncation in display
+ expect_equal(nrow(result), length(vc_list),
+ info = "Result should contain one row per virtual corpus"
+ )
+
+ # Test 4: Check that the actual VC values in results are not truncated
+ expect_true(any(nchar(result$vc) > 50),
+ info = "Actual VC values in results should not be truncated"
+ )
+})