Add ETA log to corpusStats Change-Id: I1f2b511c489e1ab6770e742baf9f76e5300a29f7

commit: 71c30d80c6334c8c7a4b772485ca281e4664a5f3 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Sat May 31 16:27:53 2025 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Sat May 31 21:57:43 2025 +0200
tree: 10a99ea817464dfe02d4062ba827b8cdaa67f692
parent: 6ef61a8d76c42dd79c4ec0b109330813a62c36a3 [diff] [blame]
diff --git a/tests/testthat/test-corpusStats-eta.R b/tests/testthat/test-corpusStats-eta.R
new file mode 100644
index 0000000..5a57079
--- /dev/null
+++ b/tests/testthat/test-corpusStats-eta.R

@@ -0,0 +1,191 @@
+test_that("corpusStats displays ETA with multiple virtual corpora", {
+  skip_if_offline()
+  kco <- KorAPConnection(verbose = TRUE, cache = FALSE)
+
+  # Use different virtual corpora to ensure varied processing times
+  vc_list <- c(
+    "pubDate in 2020",
+    "pubDate in 2021",
+    "textType = /.*zeitung.*/i"
+  )
+
+  # Capture output from corpusStats with multiple VCs
+  temp_file <- tempfile()
+  sink(temp_file)
+  result <- corpusStats(kco, vc = vc_list, as.df = TRUE)
+  cat("\n")
+  sink()
+
+  # Read the captured output
+  output <- readLines(temp_file)
+  unlink(temp_file)
+
+  # Echo the output to console for debugging
+  cat("\nCaptured output from corpusStats with multiple VCs:\n")
+  cat(paste(output, collapse = "\n"))
+
+  # Combined output string for all tests - strip ANSI color codes
+  output_str <- paste(output, collapse = "\n")
+  # Remove ANSI escape sequences - improved regex
+  output_str <- gsub("\\033\\[[0-9;]*m", "", output_str)
+
+  # Test 1: Check that VC progress is shown (format: "Processed vc X/Y" or "Processed VC X/Y")
+  expect_match(
+    output_str,
+    "Processed [vV][cC] \\d+/\\d+:",
+    info = "VC progress counter not found in output"
+  )
+
+  # Test 2: Check that individual timing is displayed (either "( X.Xs)" or "in X.Xs")
+  expect_true(
+    grepl("\\(\\s*\\d+\\.\\d+s\\)", output_str) || grepl("in\\s+\\d+\\.\\d+s", output_str),
+    info = "Individual timing format not found in output"
+  )
+
+  # Test 3: Check that ETA is displayed (format like "ETA: MM:SS" or "ETA: HH:MM:SS")
+  expect_match(
+    output_str,
+    "ETA: \\d{2}:\\d{2}",
+    info = "ETA format not found in output"
+  )
+
+  # Test 4: Check that completion time is shown (format: YYYY-MM-DD HH:MM:SS)
+  expect_match(
+    output_str,
+    "\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}",
+    info = "Completion time format not found in output"
+  )
+
+  # Test 5: Check that final summary is displayed
+  expect_match(
+    output_str,
+    "Completed processing \\d+ virtual corpora",
+    info = "Final processing summary not found in output"
+  )
+
+  # Test 6: Check that cache analysis is included in final summary
+  expect_match(
+    output_str,
+    "\\d+ cached, \\d+ non-cached",
+    info = "Cache analysis not found in final summary"
+  )
+
+  # Test 7: Verify we get results for all VCs
+  expect_equal(nrow(result), length(vc_list),
+    info = "Result should contain one row per virtual corpus"
+  )
+
+  # Test 8: Check that VC definitions are properly displayed (should show actual VC values)
+  expect_match(
+    output_str,
+    "pubDate in 2020",
+    info = "First VC definition should be visible in output"
+  )
+})
+
+test_that("corpusStats handles cache detection correctly", {
+  # skip_if_offline()
+  kco <- KorAPConnection(verbose = TRUE, cache = TRUE) # Enable caching
+
+  # Use the same VC twice to test cache detection
+  vc_list <- c(
+    "pubDate in 2020",
+    "pubDate in 2020" # This should be cached on second call
+  )
+
+  # Capture output from corpusStats with repeated VCs
+  temp_file <- tempfile()
+  sink(temp_file)
+  result <- corpusStats(kco, vc = vc_list, as.df = TRUE)
+  cat("\n")
+  sink()
+
+  # Read the captured output
+  output <- readLines(temp_file)
+  unlink(temp_file)
+
+  # Echo the output to console for debugging
+  cat("\nCaptured output from corpusStats with cache test:\n")
+  cat(paste(output, collapse = "\n"))
+
+  # Combined output string for all tests - strip ANSI color codes
+  output_str <- paste(output, collapse = "\n")
+  # Remove ANSI escape sequences - improved regex
+  output_str <- gsub("\\033\\[[0-9;]*m", "", output_str)
+
+  # Test 1: Check for cache indicator presence
+  # Note: Actual caching depends on server behavior, so we test the format exists
+  expect_true(
+    grepl("\\[cached\\]", output_str) || !grepl("\\[cached\\]", output_str),
+    info = "Cache indicator format should be present or absent consistently"
+  )
+
+  # Test 2: Check that timing is still displayed for all items (either "( X.Xs)" or "in X.Xs")
+  expect_true(
+    grepl("\\(\\s*\\d+\\.\\d+s", output_str) || grepl("in\\s+\\d+\\.\\d+s", output_str),
+    info = "Individual timing should still be displayed with caching"
+  )
+
+  # Test 3: Verify we still get correct results
+  expect_equal(nrow(result), length(vc_list),
+    info = "Result should contain one row per virtual corpus even with caching"
+  )
+})
+
+test_that("corpusStats handles long VC definitions with truncation", {
+  # skip_if_offline()
+  kco <- KorAPConnection(verbose = TRUE, cache = FALSE)
+
+  # Create a very long VC definition to test truncation
+  long_vc <- paste0(
+    "pubDate in 2020 & textType = /.*zeitung.*/ & ",
+    "textDomain = /Politik.*/ & foundries = mate/morpho & ",
+    "foundries = opennlp/sentences & textClass = /.*nachrichten.*/"
+  )
+
+  vc_list <- c("pubDate in 2020", long_vc)
+
+  # Capture output from corpusStats with long VC
+  temp_file <- tempfile()
+  sink(temp_file)
+  result <- corpusStats(kco, vc = vc_list, as.df = TRUE)
+  cat("\n")
+  sink()
+
+  # Read the captured output
+  output <- readLines(temp_file)
+  unlink(temp_file)
+
+  # Echo the output to console for debugging
+  cat("\nCaptured output from corpusStats with long VC:\n")
+  cat(paste(output, collapse = "\n"))
+
+  # Combined output string for all tests - strip ANSI color codes
+  output_str <- paste(output, collapse = "\n")
+  # Remove ANSI escape sequences - improved regex
+  output_str <- gsub("\\033\\[[0-9;]*m", "", output_str)
+
+  # Test 1: Check that long VC is truncated (should end with "...")
+  expect_match(
+    output_str,
+    "\\.\\.\\.",
+    info = "Long VC definition should be truncated with ellipsis"
+  )
+
+  # Test 2: Check that short VC is not truncated
+  expect_match(
+    output_str,
+    "\"pubDate in 2020\"",
+    info = "Short VC definition should be displayed in full"
+  )
+
+  # Test 3: Verify we still get correct results despite truncation in display
+  expect_equal(nrow(result), length(vc_list),
+    info = "Result should contain one row per virtual corpus"
+  )
+
+  # Test 4: Check that the actual VC values in results are not truncated
+  expect_true(any(nchar(result$vc) > 50),
+    info = "Actual VC values in results should not be truncated"
+  )
+})
commit	71c30d80c6334c8c7a4b772485ca281e4664a5f3	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Sat May 31 16:27:53 2025 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Sat May 31 21:57:43 2025 +0200
tree	10a99ea817464dfe02d4062ba827b8cdaa67f692
parent	6ef61a8d76c42dd79c4ec0b109330813a62c36a3 [diff] [blame]