Fix ETA logging in fetchNext
Change-Id: I8fc6047f3b5c53f00c9da61641e0983cbf24f70d
diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index df12f1f..45a8415 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R
@@ -321,19 +321,59 @@
return(kqo)
}
use_korap_api <- Sys.getenv("USE_KORAP_API", unset = NA)
- page <- kqo@nextStartIndex / maxResultsPerPage + 1
+ # Calculate the initial page number (not used directly - keeping for reference)
collectedMatches <- kqo@collectedMatches
+ # For randomized page order, generate a list of randomized page indices
if (randomizePageOrder) {
- pages <- head(sample.int(ceiling(kqo@totalResults / maxResultsPerPage)), maxFetch) - 1
+ # Calculate how many pages we need to fetch based on maxFetch
+ total_pages_to_fetch <- if (!is.na(maxFetch)) {
+ # Either limited by maxFetch or total results, whichever is smaller
+ min(ceiling(maxFetch / maxResultsPerPage), ceiling(kqo@totalResults / maxResultsPerPage))
+ } else {
+ # All pages
+ ceiling(kqo@totalResults / maxResultsPerPage)
+ }
+
+ # Generate randomized page indices (0-based for API)
+ pages <- sample.int(ceiling(kqo@totalResults / maxResultsPerPage), total_pages_to_fetch) - 1
+ page_index <- 1 # Index to track which page in the randomized list we're on
}
if (is.null(collectedMatches)) {
collectedMatches <- data.frame()
}
+
+ # Initialize the page counter properly based on nextStartIndex and any previously fetched results
+ # We add 1 to make it 1-based for display purposes since users expect page numbers to start from 1
+ # For first call, this will be 1, for subsequent calls, it will reflect our actual position
+ current_page_number <- ceiling(offset / maxResultsPerPage) + 1
+
+ # For sequential fetches, keep track of which global page we're on
+ # This is important for correctly showing page numbers in subsequent fetchNext calls
+ page_count_start <- current_page_number
+
repeat {
- page <- nrow(collectedMatches) %/% maxResultsPerPage + 1
- currentOffset <- ifelse(randomizePageOrder, pages[page], page - 1) * maxResultsPerPage
+ # Determine which page to fetch next
+ if (randomizePageOrder) {
+ # In randomized mode, get the page from our randomized list using the page_index
+ # Make sure we don't exceed the array bounds
+ if (page_index > length(pages)) {
+ break # No more pages to fetch in randomized mode
+ }
+ current_offset_page <- pages[page_index]
+ # For display purposes in randomized mode, show which page out of the total we're fetching
+ display_page_number <- page_index
+ } else {
+ # In sequential mode, use the current_page_number to calculate the offset
+ current_offset_page <- (current_page_number - 1)
+ display_page_number <- current_page_number
+ }
+
+ # Calculate the actual offset in tokens
+ currentOffset <- current_offset_page * maxResultsPerPage
+
+ # Build the query with the appropriate count and offset
query <- paste0(kqo@requestUrl, "&count=", min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage), "&offset=", currentOffset, "&cutoff=true")
res <- apiCall(kqo@korapConnection, query)
if (length(res$matches) == 0) {
@@ -378,17 +418,18 @@
collectedMatches <- bind_rows(collectedMatches, currentMatches)
}
- # Calculate page numbers regardless of ETA calculation
- items_per_page <- res$meta$itemsPerPage
- total_pages <- ceiling(kqo@totalResults / items_per_page)
- current_page_number <- ceiling(nrow(collectedMatches) / items_per_page)
+ # Get the actual items per page from the API response
+ # We now consistently use maxResultsPerPage instead
- # Determine the actual total pages to display, considering maxFetch
- actual_total_pages <- if (!is.na(maxFetch) && maxFetch < kqo@totalResults) {
- ceiling(maxFetch / items_per_page)
- } else {
- total_pages
- }
+ # Calculate total pages consistently using fixed maxResultsPerPage
+ # This ensures consistent page counting across the function
+ total_pages <- ceiling(kqo@totalResults / maxResultsPerPage)
+
+ # Calculate the total pages based on what we've already fetched plus what we'll fetch
+ # This ensures the correct denominator is displayed for subsequent fetchNext calls
+
+ # Calculate the total number of pages for the entire result set
+ # This calculation is kept for reference and for showing in parentheses
# Estimate remaining time
time_per_page <- NA
@@ -399,7 +440,37 @@
# benchmark looks like "0.123s"
time_per_page <- suppressWarnings(as.numeric(sub("s", "", res$meta$benchmark)))
if (!is.na(time_per_page)) {
- remaining_pages <- total_pages - current_page_number
+ # First determine our current global position for ETA calculation
+ current_global_position <- if (randomizePageOrder) {
+ # In randomized mode, this is how many pages we've processed so far in this batch
+ page_index - 1 # -1 because we're calculating remaining
+ } else {
+ page_count_start + (current_page_number - 1) - 1 # -1 because we're calculating remaining
+ }
+
+ # Calculate remaining pages based on maxFetch if specified
+ if (!is.na(maxFetch) && maxFetch < kqo@totalResults) {
+ # We need to fetch up to maxFetch results
+ remaining_items_to_fetch <- maxFetch - nrow(collectedMatches)
+ remaining_pages <- ceiling(remaining_items_to_fetch / maxResultsPerPage)
+ } else {
+ # We need to fetch all results - account for our actual global position
+ # For randomized order, calculate remaining pages based on the randomized list or maxFetch
+ if (randomizePageOrder) {
+ if (exists("pages") && length(pages) > 0) {
+ remaining_pages <- length(pages) - page_index
+ } else if (!is.na(maxFetch)) {
+ # If pages is not available, use maxFetch to estimate remaining pages
+ remaining_pages <- ceiling(maxFetch / maxResultsPerPage) - page_index
+ } else {
+ # Fallback to a reasonable default
+ remaining_pages <- 1
+ }
+ } else {
+ # For sequential order, use the current global position
+ remaining_pages <- total_pages - current_global_position
+ }
+ }
estimated_remaining_seconds <- remaining_pages * time_per_page
estimated_completion_time <- Sys.time() + estimated_remaining_seconds
@@ -407,7 +478,8 @@
# Format time nicely
format_duration <- function(seconds) {
if (is.na(seconds) || seconds < 0) {
- return("N/A")
+ # Instead of "N/A", return "00s" as a fallback
+ return("00s")
}
days <- floor(seconds / (24 * 3600))
seconds <- seconds %% (24 * 3600)
@@ -428,28 +500,90 @@
}
}
- log_info(verbose, paste0(
- "Retrieved page ",
- sprintf(paste0("%", nchar(actual_total_pages), "d"), current_page_number),
- "/",
- if (!is.na(maxFetch) && maxFetch < kqo@totalResults) {
- sprintf("%d (%d)", actual_total_pages, total_pages)
- } else {
- sprintf("%d", actual_total_pages)
- },
- if (!is.null(res$meta$cached)) {
- " [cached]\n"
- } else {
- paste0(
- " in ",
- if (!is.na(time_per_page)) sprintf("%4.1f", time_per_page) else "?",
- "s. ETA: ", eta_str, " (", completion_time_str, ")",
- "\n"
- )
- }
- ))
+ # Create the page display string with proper formatting
- page <- page + 1
+ # For global page tracking, calculate the absolute page number
+ actual_display_number <- if (randomizePageOrder) {
+ current_offset_page + 1 # In randomized mode, this is the actual page (0-based + 1)
+ } else {
+ # In sequential mode, the absolute page number is the actual offset page + 1 (to make it 1-based)
+ current_offset_page + 1
+ }
+
+ # For subsequent calls to fetchNext, we need to calculate the correct page numbers
+ # based on the current batch being fetched
+
+ # For each call to fetchNext, we want to show 1/2, 2/2 (not 3/4, 4/4)
+ # Simply count from 1 within the current batch
+
+ # The relative page number is simply the current position in this batch
+ if (randomizePageOrder) {
+ relative_page_number <- page_index # In randomized mode, we start from 1 in each batch
+ } else {
+ relative_page_number <- display_page_number - (page_count_start - 1)
+ }
+
+ # How many pages will we fetch in this batch?
+ # If maxFetch is specified, calculate based on it
+ pages_in_this_batch <- if (!is.na(maxFetch)) {
+ ceiling(maxFetch / maxResultsPerPage)
+ } else {
+ # Otherwise fetch all remaining pages
+ total_pages - page_count_start + 1
+ }
+
+ # The total pages to be shown in this batch
+ batch_total_pages <- pages_in_this_batch
+
+ page_display <- paste0(
+ "Retrieved page ",
+ sprintf(paste0("%", nchar(batch_total_pages), "d"), relative_page_number),
+ "/",
+ sprintf("%d", batch_total_pages)
+ )
+
+ # If randomized, also show which actual page we fetched
+ if (randomizePageOrder) {
+ # Determine the maximum width needed for page numbers (based on total pages)
+ # This ensures consistent alignment
+ max_page_width <- nchar(as.character(total_pages))
+ # Add the actual page number that was fetched (0-based + 1 for display) with proper padding
+ page_display <- paste0(page_display,
+ sprintf(" (actual page %*d)", max_page_width, current_offset_page + 1))
+ }
+ # Always show the absolute page number and total pages (for clarity)
+ else {
+ # Show the absolute page number (out of total possible pages)
+ page_display <- paste0(page_display, sprintf(
+ " (page %d of %d total)",
+ actual_display_number, total_pages
+ ))
+ }
+
+ # Add caching or timing information
+ if (!is.null(res$meta$cached)) {
+ page_display <- paste0(page_display, " [cached]")
+ } else {
+ page_display <- paste0(
+ page_display,
+ " in ",
+ if (!is.na(time_per_page)) sprintf("%4.1f", time_per_page) else "?",
+ "s. ETA: ",
+ # Display ETA for both randomized and sequential modes
+ eta_str,
+ # Show completion time for both modes
+ paste0(" (", completion_time_str, ")")
+ )
+ }
+
+ log_info(verbose, paste0(page_display, "\n"))
+
+ # Increment the appropriate counter based on mode
+ if (randomizePageOrder) {
+ page_index <- page_index + 1
+ } else {
+ current_page_number <- current_page_number + 1
+ }
results <- results + res$meta$itemsPerPage
if (nrow(collectedMatches) >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
break
diff --git a/tests/testthat/test-fetch.R b/tests/testthat/test-fetch.R
new file mode 100644
index 0000000..33d18f9
--- /dev/null
+++ b/tests/testthat/test-fetch.R
@@ -0,0 +1,22 @@
+test_that("fetchNext works with maxFetch", {
+ skip_if_offline()
+ kco <- KorAPConnection(verbose = TRUE, cache = FALSE)
+ q <- kco %>% corpusQuery("Test", "pubDate since 2014", fields = c("sigle"))
+ q <- fetchNext(q, maxFetch = 75)
+ expect_equal(nrow(q@collectedMatches), 75)
+ cat("\n")
+ q <- fetchNext(q, maxFetch = 100)
+ expect_equal(nrow(q@collectedMatches), 175)
+})
+
+test_that("fetchNext works with randomizePageOrder", {
+ skip_if_offline()
+ kco <- KorAPConnection(verbose = TRUE, cache = FALSE)
+ q <- kco %>% corpusQuery("Test", "pubDate since 2014", fields = c("sigle"))
+ q <- fetchNext(q, maxFetch = 175, randomizePageOrder = T)
+ expect_equal(nrow(q@collectedMatches), 175)
+ cat("\n")
+ q <- fetchNext(q, maxFetch = 50, randomizePageOrder = T)
+ expect_equal(nrow(q@collectedMatches), 225)
+})
+
diff --git a/tests/testthat/test-page-numbering.R b/tests/testthat/test-page-numbering.R
new file mode 100644
index 0000000..1ef2808
--- /dev/null
+++ b/tests/testthat/test-page-numbering.R
@@ -0,0 +1,145 @@
+# Page numbering and ETA tests
+
+test_that("page numbering is displayed correctly in sequential mode", {
+ skip_if_offline()
+ kco <- KorAPConnection(verbose = TRUE, cache = FALSE)
+ q <- kco %>% corpusQuery("Test", "pubDate since 2014", fields = c("sigle"))
+
+ # Capture output - we need to use sink to capture the actual console output
+ temp_file <- tempfile()
+ sink(temp_file)
+ q <- fetchNext(q, maxFetch = 75)
+ cat("\n")
+ sink()
+
+ # Read the captured output
+ output <- readLines(temp_file)
+ unlink(temp_file)
+
+ # Echo the output to console
+ cat("\nCaptured output from sequential mode:\n")
+ cat(paste(output, collapse = "\n"))
+
+ # Combined output string for all tests
+ output_str <- paste(output, collapse = "\n")
+
+ # Test 1: Check page numbering format
+ expect_match(
+ output_str,
+ "Retrieved page .+/\\d+ \\(page \\d+ of \\d+ total\\)",
+ info = "Page numbering format not found in output"
+ )
+
+ # Test 2: Check that ETA is displayed with time values (not "N/A")
+ expect_match(
+ output_str,
+ "ETA: [^N][^/][^A]", # Negative pattern to ensure "N/A" is not in the ETA
+ info = "ETA format is not correct or contains N/A"
+ )
+
+ # Test 3: Check that completion time is shown in parentheses
+ expect_match(
+ output_str,
+ "\\(\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\)",
+ info = "Completion time format not found in output"
+ )
+})
+
+test_that("page numbering and ETA are displayed correctly in randomized mode", {
+ skip_if_offline()
+ kco <- KorAPConnection(verbose = TRUE, cache = FALSE)
+ q <- kco %>% corpusQuery("Test", "pubDate since 2014", fields = c("sigle"))
+
+ # Set a fixed seed for reproducible tests
+ set.seed(123)
+
+ # Capture output - we need to use sink to capture the actual console output
+ temp_file <- tempfile()
+ sink(temp_file)
+ q <- fetchNext(q, maxFetch = 75, randomizePageOrder = TRUE)
+ cat("\n")
+ sink()
+
+ # Read the captured output
+ output <- readLines(temp_file)
+ unlink(temp_file)
+
+ # Echo the output to console
+ cat("\nCaptured output from randomized mode:\n")
+ cat(paste(output, collapse = "\n"))
+
+ # Combined output string for all tests
+ output_str <- paste(output, collapse = "\n")
+
+ # Test 1: Check page numbering format in randomized mode
+ expect_match(
+ output_str,
+ "Retrieved page .+/\\d+ \\(actual page \\d+\\)",
+ info = "Randomized page numbering format not found in output"
+ )
+
+ # Test 2: Check that ETA is displayed and doesn't contain "N/A (random order)"
+ expect_match(
+ output_str,
+ "ETA: [^N][^/][^A]", # Ensure "N/A" is not in the ETA
+ info = "ETA format is incorrect or contains N/A"
+ )
+
+ # Test 3: Check that proper time values and completion time are shown
+ expect_match(
+ output_str,
+ "ETA: \\d+s \\(\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\)",
+ info = "Time format or completion time not found in output"
+ )
+})
+
+test_that("page numbering and ETA are displayed correctly in subsequent calls with randomized mode", {
+ skip_if_offline()
+ kco <- KorAPConnection(verbose = TRUE, cache = FALSE)
+ q <- kco %>% corpusQuery("Test", "pubDate since 2014", fields = c("sigle"))
+
+ # Set a fixed seed for reproducible tests
+ set.seed(123)
+
+ # First call to fetchNext (we don't need to test this part)
+ q <- fetchNext(q, maxFetch = 75, randomizePageOrder = TRUE)
+
+ # Capture output from the subsequent call
+ temp_file <- tempfile()
+ sink(temp_file)
+ q <- fetchNext(q, maxFetch = 50, randomizePageOrder = TRUE)
+ cat("\n")
+ sink()
+
+ # Read the captured output
+ output <- readLines(temp_file)
+ unlink(temp_file)
+
+ # Echo the output to console
+ cat("\nCaptured output from subsequent call with randomized mode:\n")
+ cat(paste(output, collapse = "\n"))
+
+ # Combined output string for all tests
+ output_str <- paste(output, collapse = "\n")
+
+ # Test 1: Check that page numbering format is correct and not negative
+ expect_match(
+ output_str,
+ "Retrieved page [1-9]\\d*/\\d+ \\(actual page \\d+\\)",
+ info = "Randomized page numbering format is incorrect or negative in subsequent call"
+ )
+
+ # Test 2: Check that ETA is displayed - we're now ensuring it contains digits followed by 's'
+ expect_match(
+ output_str,
+ "ETA: \\d+s",
+ info = "ETA format should show digits followed by 's'"
+ )
+
+ # Test 3: Check that completion time is shown in parentheses
+ expect_match(
+ output_str,
+ "\\(\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\)",
+ info = "Completion time not found in subsequent call output"
+ )
+})