Factor out ETA logging
Change-Id: Ic8b1600395018aa79701e29a38c4ec22d598337a
diff --git a/DESCRIPTION b/DESCRIPTION
index a25074b..1e47e7a 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -77,4 +77,5 @@
'misc.R'
'reexports.R'
'textMetadata.R'
+ 'logging.R'
Roxygen: list(markdown = TRUE)
diff --git a/R/KorAPConnection.R b/R/KorAPConnection.R
index f2d65a8..1d567a8 100644
--- a/R/KorAPConnection.R
+++ b/R/KorAPConnection.R
@@ -13,6 +13,7 @@
#' @import R.cache
#' @import utils
#' @import methods
+#' @include logging.R
#' @slot KorAPUrl URL of the web user interface of the KorAP server used in the connection.
#' @slot apiVersion requested KorAP API version.
#' @slot indexRevision indexRevision code as reported from API via `X-Index-Revision` HTTP header.
diff --git a/R/KorAPCorpusStats.R b/R/KorAPCorpusStats.R
index 75bd6eb..8097637 100644
--- a/R/KorAPCorpusStats.R
+++ b/R/KorAPCorpusStats.R
@@ -4,6 +4,7 @@
#' `KorAPCorpusStats` objects can be obtained by the [corpusStats()] method.
#'
#' @include KorAPConnection.R
+#' @include logging.R
#'
#' @export
#' @slot vc definition of the virtual corpus
@@ -14,11 +15,6 @@
#' @slot webUIRequestUrl link to the web user interface with the current vc definition
setClass("KorAPCorpusStats", slots = c(vc = "character", documents = "numeric", tokens = "numeric", sentences = "numeric", paragraphs = "numeric", webUIRequestUrl = "character"))
-log_info <- function(v, ...) {
- green <- "\033[32m"
- reset <- "\033[0m"
- cat(ifelse(v, paste0(green, ..., reset), ""))
-}
setGeneric("corpusStats", function(kco, ...) standardGeneric("corpusStats"))
#' Fetch information about a (virtual) corpus
@@ -37,22 +33,6 @@
#' }
#'
#' @aliases corpusStats
-# Helper function to format duration with leading zeros
-format_duration <- function(seconds) {
- if (is.na(seconds) || !is.finite(seconds) || seconds <= 0) {
- return("N/A")
- }
-
- hours <- floor(seconds / 3600)
- minutes <- floor((seconds %% 3600) / 60)
- secs <- round(seconds %% 60)
-
- if (hours > 0) {
- return(sprintf("%02d:%02d:%02d", hours, minutes, secs))
- } else {
- return(sprintf("%02d:%02d", minutes, secs))
- }
-}
#' @export
setMethod("corpusStats", "KorAPConnection", function(kco,
diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index 93f5458..04d859a 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R
@@ -5,6 +5,7 @@
#' represent the current state of a query to a KorAP server.
#'
#' @include KorAPConnection.R
+#' @include logging.R
#' @import httr2
#'
#' @include RKorAPClient-package.R
@@ -196,25 +197,6 @@
current_query <- 0
start_time <- Sys.time()
- # Helper function to format duration
- format_duration <- function(seconds) {
- if (is.na(seconds) || seconds < 0) {
- return("00s")
- }
- days <- floor(seconds / (24 * 3600))
- seconds <- seconds %% (24 * 3600)
- hours <- floor(seconds / 3600)
- seconds <- seconds %% 3600
- minutes <- floor(seconds / 60)
- seconds <- floor(seconds %% 60)
- paste0(
- if (days > 0) paste0(days, "d ") else "",
- if (hours > 0 || days > 0) paste0(sprintf("%02d", hours), "h ") else "",
- if (minutes > 0 || hours > 0 || days > 0) paste0(sprintf("%02d", minutes), "m ") else "",
- paste0(sprintf("%02d", seconds), "s")
- )
- }
-
results <- purrr::pmap(grid, function(query, vc, ...) {
current_query <<- current_query + 1
@@ -279,16 +261,10 @@
# Calculate and display ETA information if verbose and we have more than one query
if (verbose && total_queries > 1) {
- elapsed_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
-
- if (current_query > 1) { # Only calculate ETA after the first query
+ eta_info <- calculate_eta(current_query, total_queries, start_time)
+ if (eta_info != "") {
+ elapsed_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
avg_time_per_query <- elapsed_time / current_query
- remaining_queries <- total_queries - current_query
- estimated_remaining_seconds <- remaining_queries * avg_time_per_query
- estimated_completion_time <- Sys.time() + estimated_remaining_seconds
-
- eta_str <- format_duration(estimated_remaining_seconds)
- completion_time_str <- format(estimated_completion_time, "%Y-%m-%d %H:%M:%S")
# Create progress display
progress_display <- paste0(
@@ -298,9 +274,8 @@
sprintf("%d", total_queries),
" completed. Avg: ",
sprintf("%.1f", avg_time_per_query),
- "s/query. ETA: ",
- eta_str,
- " (", completion_time_str, ")"
+ "s/query",
+ eta_info
)
log_info(verbose, progress_display, "\n")
diff --git a/R/collocationAnalysis.R b/R/collocationAnalysis.R
index 3d43012..16987c6 100644
--- a/R/collocationAnalysis.R
+++ b/R/collocationAnalysis.R
@@ -1,4 +1,5 @@
-setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis") )
+#' @include logging.R
+setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis"))
#' Collocation analysis
#'
@@ -52,10 +53,12 @@
#' @examples
#' \dontrun{
#'
-#' # Find top collocates of "Packung" inside and outside the sports domain.
-#' KorAPConnection(verbose = TRUE) |>
-#' collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"),
-#' leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) |>
+#' # Find top collocates of "Packung" inside and outside the sports domain.
+#' KorAPConnection(verbose = TRUE) |>
+#' collocationAnalysis("Packung",
+#' vc = c("textClass=sport", "textClass!=sport"),
+#' leftContextSize = 1, rightContextSize = 1, topCollocatesLimit = 20
+#' ) |>
#' dplyr::filter(logDice >= 5)
#' }
#'
@@ -65,150 +68,154 @@
#' # Note that, currently, the use of focus function disallows exactFrequencies.
#' KorAPConnection(verbose = TRUE) |>
#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
-#' leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20)
+#' leftContextSize = 1, rightContextSize = 0, exactFrequencies = FALSE, topCollocatesLimit = 20
+#' )
#' }
#'
#' @export
-setMethod("collocationAnalysis", "KorAPConnection",
- function(kco,
- node,
- vc = "",
- lemmatizeNodeQuery = FALSE,
- minOccur = 5,
- leftContextSize = 5,
- rightContextSize = 5,
- topCollocatesLimit = 200,
- searchHitsSampleLimit = 20000,
- ignoreCollocateCase = FALSE,
- withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
- exactFrequencies = TRUE,
- stopwords = append(RKorAPClient::synsemanticStopwords(), node),
- seed = 7,
- expand = length(vc) != length(node),
- maxRecurse = 0,
- addExamples = FALSE,
- thresholdScore = "logDice",
- threshold = 2.0,
- localStopwords = c(),
- collocateFilterRegex = '^[:alnum:]+-?[:alnum:]*$',
- ...) {
- # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
- word <- frequency <- O <- NULL
+setMethod(
+ "collocationAnalysis", "KorAPConnection",
+ function(kco,
+ node,
+ vc = "",
+ lemmatizeNodeQuery = FALSE,
+ minOccur = 5,
+ leftContextSize = 5,
+ rightContextSize = 5,
+ topCollocatesLimit = 200,
+ searchHitsSampleLimit = 20000,
+ ignoreCollocateCase = FALSE,
+ withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
+ exactFrequencies = TRUE,
+ stopwords = append(RKorAPClient::synsemanticStopwords(), node),
+ seed = 7,
+ expand = length(vc) != length(node),
+ maxRecurse = 0,
+ addExamples = FALSE,
+ thresholdScore = "logDice",
+ threshold = 2.0,
+ localStopwords = c(),
+ collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
+ ...) {
+ # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
+ word <- frequency <- O <- NULL
- if(!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan)>0 )) {
- stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
- }
+ if (!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan) > 0)) {
+ stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
+ }
- warnIfNotAuthorized(kco)
+ warnIfNotAuthorized(kco)
- if (lemmatizeNodeQuery) {
- node <- lemmatizeWordQuery(node)
- }
+ if (lemmatizeNodeQuery) {
+ node <- lemmatizeWordQuery(node)
+ }
- result <- if (length(node) > 1 || length(vc) > 1) {
- grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc)
- purrr::pmap(grid, function(node, vc, ...)
- collocationAnalysis(kco,
- node =node,
- vc = vc,
- minOccur = minOccur,
- leftContextSize = leftContextSize,
- rightContextSize = rightContextSize,
- topCollocatesLimit = topCollocatesLimit,
- searchHitsSampleLimit = searchHitsSampleLimit,
- ignoreCollocateCase = ignoreCollocateCase,
- withinSpan = withinSpan,
- exactFrequencies = exactFrequencies,
- stopwords = stopwords,
- addExamples = TRUE,
- localStopwords = localStopwords,
- seed = seed,
- expand = expand,
- ...) ) |>
- bind_rows()
- } else {
- set.seed(seed)
- candidates <- collocatesQuery(
- kco,
- node,
- vc = vc,
- minOccur = minOccur,
- leftContextSize = leftContextSize,
- rightContextSize = rightContextSize,
- searchHitsSampleLimit = searchHitsSampleLimit,
- ignoreCollocateCase = ignoreCollocateCase,
- stopwords = append(stopwords, localStopwords),
- ...
- )
+ result <- if (length(node) > 1 || length(vc) > 1) {
+ grid <- if (expand) expand_grid(node = node, vc = vc) else tibble(node = node, vc = vc)
+ purrr::pmap(grid, function(node, vc, ...) {
+ collocationAnalysis(kco,
+ node = node,
+ vc = vc,
+ minOccur = minOccur,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ topCollocatesLimit = topCollocatesLimit,
+ searchHitsSampleLimit = searchHitsSampleLimit,
+ ignoreCollocateCase = ignoreCollocateCase,
+ withinSpan = withinSpan,
+ exactFrequencies = exactFrequencies,
+ stopwords = stopwords,
+ addExamples = TRUE,
+ localStopwords = localStopwords,
+ seed = seed,
+ expand = expand,
+ ...
+ )
+ }) |>
+ bind_rows()
+ } else {
+ set.seed(seed)
+ candidates <- collocatesQuery(
+ kco,
+ node,
+ vc = vc,
+ minOccur = minOccur,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ searchHitsSampleLimit = searchHitsSampleLimit,
+ ignoreCollocateCase = ignoreCollocateCase,
+ stopwords = append(stopwords, localStopwords),
+ ...
+ )
- if (nrow(candidates) > 0) {
- candidates <- candidates |>
- filter(frequency >= minOccur) |>
- slice_head(n=topCollocatesLimit)
- collocationScoreQuery(
- kco,
- node = node,
- collocate = candidates$word,
- vc = vc,
- leftContextSize = leftContextSize,
- rightContextSize = rightContextSize,
- observed = if (exactFrequencies) NA else candidates$frequency,
- ignoreCollocateCase = ignoreCollocateCase,
- withinSpan = withinSpan,
- ...
- ) |>
- filter(O >= minOccur) |>
- dplyr::arrange(dplyr::desc(logDice))
- } else {
- tibble()
- }
- }
- if (maxRecurse > 0 & length(result) > 0 && any(!!thresholdScore >= threshold)) {
- recurseWith <- result |>
- filter(!!as.name(thresholdScore) >= threshold)
- result <- collocationAnalysis(
- kco,
- node = paste0("(", buildCollocationQuery(
- removeWithinSpan(recurseWith$node, withinSpan),
- recurseWith$collocate,
- leftContextSize = leftContextSize,
- rightContextSize = rightContextSize,
- withinSpan = ""
- ), ")"),
- vc = vc,
- minOccur = minOccur,
- leftContextSize = leftContextSize,
- rightContextSize = rightContextSize,
- withinSpan = withinSpan,
- maxRecurse = maxRecurse - 1,
- stopwords = stopwords,
- localStopwords = recurseWith$collocate,
- exactFrequencies = exactFrequencies,
- searchHitsSampleLimit = searchHitsSampleLimit,
- topCollocatesLimit = topCollocatesLimit,
- addExamples = FALSE
- ) |>
- bind_rows(result) |>
- filter(logDice >= 2) |>
- filter(.$O >= minOccur) |>
- dplyr::arrange(dplyr::desc(logDice))
- }
- if (addExamples && length(result) > 0) {
- result$query <-buildCollocationQuery(
- result$node,
- result$collocate,
- leftContextSize = leftContextSize,
- rightContextSize = rightContextSize,
- withinSpan = withinSpan
- )
- result$example <- findExample(
- kco,
- query = result$query,
- vc = result$vc
- )
- }
- result
- }
+ if (nrow(candidates) > 0) {
+ candidates <- candidates |>
+ filter(frequency >= minOccur) |>
+ slice_head(n = topCollocatesLimit)
+ collocationScoreQuery(
+ kco,
+ node = node,
+ collocate = candidates$word,
+ vc = vc,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ observed = if (exactFrequencies) NA else candidates$frequency,
+ ignoreCollocateCase = ignoreCollocateCase,
+ withinSpan = withinSpan,
+ ...
+ ) |>
+ filter(O >= minOccur) |>
+ dplyr::arrange(dplyr::desc(logDice))
+ } else {
+ tibble()
+ }
+ }
+ if (maxRecurse > 0 & length(result) > 0 && any(!!thresholdScore >= threshold)) {
+ recurseWith <- result |>
+ filter(!!as.name(thresholdScore) >= threshold)
+ result <- collocationAnalysis(
+ kco,
+ node = paste0("(", buildCollocationQuery(
+ removeWithinSpan(recurseWith$node, withinSpan),
+ recurseWith$collocate,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ withinSpan = ""
+ ), ")"),
+ vc = vc,
+ minOccur = minOccur,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ withinSpan = withinSpan,
+ maxRecurse = maxRecurse - 1,
+ stopwords = stopwords,
+ localStopwords = recurseWith$collocate,
+ exactFrequencies = exactFrequencies,
+ searchHitsSampleLimit = searchHitsSampleLimit,
+ topCollocatesLimit = topCollocatesLimit,
+ addExamples = FALSE
+ ) |>
+ bind_rows(result) |>
+ filter(logDice >= 2) |>
+ filter(.$O >= minOccur) |>
+ dplyr::arrange(dplyr::desc(logDice))
+ }
+ if (addExamples && length(result) > 0) {
+ result$query <- buildCollocationQuery(
+ result$node,
+ result$collocate,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ withinSpan = withinSpan
+ )
+ result$example <- findExample(
+ kco,
+ query = result$query,
+ vc = result$vc
+ )
+ }
+ result
+ }
)
# #' @export
@@ -217,9 +224,9 @@
return(query)
}
needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
- res <- gsub(needle, '\\1', query)
+ res <- gsub(needle, "\\1", query)
needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
- res <- gsub(needle, '\\1', res)
+ res <- gsub(needle, "\\1", res)
return(res)
}
@@ -234,20 +241,21 @@
rightContextSize = 5,
ignoreCollocateCase = FALSE,
stopwords = c(),
- collocateFilterRegex = '^[:alnum:]+-?[:alnum:]*$',
+ collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
verbose = TRUE) {
word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
frequency <- NULL
if (nrow(matches) < 1) {
- dplyr::tibble(word=c(), frequency=c())
+ dplyr::tibble(word = c(), frequency = c())
} else if (index == 0) {
- if (! "tokens" %in% colnames(matches) || ! is.list(matches$tokens)) {
+ if (!"tokens" %in% colnames(matches) || !is.list(matches$tokens)) {
log_info(verbose, "Outdated KorAP server: Falling back to client side tokenization.\n")
- return(snippet2FreqTable(matches$snippet, minOccur, leftContextSize, rightContextSize, ignoreCollocateCase = ignoreCollocateCase,
- stopwords = stopwords, oldTable = oldTable, verbose = verbose))
-
+ return(snippet2FreqTable(matches$snippet, minOccur, leftContextSize, rightContextSize,
+ ignoreCollocateCase = ignoreCollocateCase,
+ stopwords = stopwords, oldTable = oldTable, verbose = verbose
+ ))
}
log_info(verbose, paste("Joining", nrow(matches), "kwics\n"))
for (i in 1:nrow(matches)) {
@@ -262,30 +270,30 @@
)
}
log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
- oldTable |>
+ oldTable |>
group_by(word) |>
mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
- summarise(frequency=sum(frequency), .groups = "drop") |>
+ summarise(frequency = sum(frequency), .groups = "drop") |>
arrange(desc(frequency))
} else {
- stopwordsTable <- dplyr::tibble(word=stopwords)
+ stopwordsTable <- dplyr::tibble(word = stopwords)
left <- tail(unlist(matches$tokens$left[index]), leftContextSize)
-# cat(paste("left:", left, "\n", collapse=" "))
+ # cat(paste("left:", left, "\n", collapse=" "))
right <- head(unlist(matches$tokens$right[index]), rightContextSize)
-# cat(paste("right:", right, "\n", collapse=" "))
+ # cat(paste("right:", right, "\n", collapse=" "))
- if(length(left) + length(right) == 0) {
+ if (length(left) + length(right) == 0) {
oldTable
} else {
table(c(left, right)) |>
dplyr::as_tibble(.name_repair = "minimal") |>
dplyr::rename(word = 1, frequency = 2) |>
dplyr::filter(str_detect(word, collocateFilterRegex)) |>
- dplyr::anti_join(stopwordsTable, by="word") |>
+ dplyr::anti_join(stopwordsTable, by = "word") |>
dplyr::bind_rows(oldTable)
}
}
@@ -302,14 +310,14 @@
ignoreCollocateCase = FALSE,
stopwords = c(),
tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|")",
- collocateFilterRegex = '^[:alnum:]+-?[:alnum:]*$',
+ collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
verbose = TRUE) {
word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
frequency <- NULL
if (length(snippet) < 1) {
- dplyr::tibble(word=c(), frequency=c())
+ dplyr::tibble(word = c(), frequency = c())
} else if (length(snippet) > 1) {
log_info(verbose, paste("Joining", length(snippet), "kwics\n"))
for (s in snippet) {
@@ -323,39 +331,41 @@
)
}
log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
- oldTable |>
+ oldTable |>
group_by(word) |>
mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
- summarise(frequency=sum(frequency), .groups = "drop") |>
+ summarise(frequency = sum(frequency), .groups = "drop") |>
arrange(desc(frequency))
} else {
- stopwordsTable <- dplyr::tibble(word=stopwords)
+ stopwordsTable <- dplyr::tibble(word = stopwords)
match <-
str_match(
snippet,
'<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
)
- left <- if(leftContextSize > 0)
+ left <- if (leftContextSize > 0) {
tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
- else
+ } else {
""
-# cat(paste("left:", left, "\n", collapse=" "))
+ }
+ # cat(paste("left:", left, "\n", collapse=" "))
- right <- if(rightContextSize > 0)
+ right <- if (rightContextSize > 0) {
head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
- else
- ""
-# cat(paste("right:", right, "\n", collapse=" "))
+ } else {
+ ""
+ }
+ # cat(paste("right:", right, "\n", collapse=" "))
- if(is.na(left[1]) || is.na(right[1]) || length(left) + length(right) == 0) {
+ if (is.na(left[1]) || is.na(right[1]) || length(left) + length(right) == 0) {
oldTable
} else {
table(c(left, right)) |>
dplyr::as_tibble(.name_repair = "minimal") |>
dplyr::rename(word = 1, frequency = 2) |>
dplyr::filter(str_detect(word, collocateFilterRegex)) |>
- dplyr::anti_join(stopwordsTable, by="word") |>
+ dplyr::anti_join(stopwordsTable, by = "word") |>
dplyr::bind_rows(oldTable)
}
}
@@ -442,21 +452,22 @@
matchOnly = TRUE) {
out <- character(length = length(query))
- if (length(vc) < length(query))
+ if (length(vc) < length(query)) {
vc <- rep(vc, length(query))
+ }
for (i in seq_along(query)) {
q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
if (q@totalResults > 0) {
- q <- fetchNext(q, maxFetch=50, randomizePageOrder=F)
+ q <- fetchNext(q, maxFetch = 50, randomizePageOrder = F)
example <- as.character((q@collectedMatches)$snippet[1])
- out[i] <- if(matchOnly) {
- gsub('.*<mark>(.+)</mark>.*', '\\1', example)
+ out[i] <- if (matchOnly) {
+ gsub(".*<mark>(.+)</mark>.*", "\\1", example)
} else {
- stringr::str_replace(example, '<[^>]*>', '')
+ stringr::str_replace(example, "<[^>]*>", "")
}
} else {
- out[i] = ""
+ out[i] <- ""
}
}
out
@@ -475,19 +486,20 @@
...) {
frequency <- NULL
q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
- if(q@totalResults == 0) {
- tibble(word=c(), frequency=c())
+ if (q@totalResults == 0) {
+ tibble(word = c(), frequency = c())
} else {
- q <- fetchNext(q, maxFetch=searchHitsSampleLimit, randomizePageOrder=TRUE)
- matches2FreqTable (q@collectedMatches,
- 0,
- minOccur = minOccur,
- leftContextSize = leftContextSize,
- rightContextSize = rightContextSize,
- ignoreCollocateCase = ignoreCollocateCase,
- stopwords = stopwords,
- ...,
- verbose = kco@verbose) |>
+ q <- fetchNext(q, maxFetch = searchHitsSampleLimit, randomizePageOrder = TRUE)
+ matches2FreqTable(q@collectedMatches,
+ 0,
+ minOccur = minOccur,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ ignoreCollocateCase = ignoreCollocateCase,
+ stopwords = stopwords,
+ ...,
+ verbose = kco@verbose
+ ) |>
mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) |>
filter(frequency >= minOccur)
}
diff --git a/R/logging.R b/R/logging.R
new file mode 100644
index 0000000..55be5cf
--- /dev/null
+++ b/R/logging.R
@@ -0,0 +1,77 @@
+#' Logging utilities for RKorAPClient
+#'
+#' This module provides centralized logging functions used throughout the package
+#' for progress reporting and ETA calculations.
+
+#' Log informational messages with optional coloring
+#'
+#' @param v logical flag indicating whether to output the message
+#' @param ... message components to concatenate and display
+#' @keywords internal
+log_info <- function(v, ...) {
+ green <- "\033[32m"
+ reset <- "\033[0m"
+ cat(ifelse(v, paste0(green, ..., reset), ""))
+}
+
+#' Format duration in seconds to human-readable format
+#'
+#' Converts a duration in seconds to a formatted string with days, hours, minutes, and seconds.
+#' Used for ETA calculations and progress reporting.
+#'
+#' @param seconds numeric duration in seconds
+#' @return character string with formatted duration
+#' @keywords internal
+#' @examples
+#' \dontrun{
+#' format_duration(3661) # "01h 01m 01s"
+#' format_duration(86461) # "1d 00h 01m 01s"
+#' }
+format_duration <- function(seconds) {
+ if (is.na(seconds) || !is.finite(seconds) || seconds < 0) {
+ return("00s")
+ }
+
+ days <- floor(seconds / (24 * 3600))
+ seconds <- seconds %% (24 * 3600)
+ hours <- floor(seconds / 3600)
+ seconds <- seconds %% 3600
+ minutes <- floor(seconds / 60)
+ seconds <- floor(seconds %% 60)
+
+ paste0(
+ if (days > 0) paste0(days, "d ") else "",
+ if (hours > 0 || days > 0) paste0(sprintf("%02d", hours), "h ") else "",
+ if (minutes > 0 || hours > 0 || days > 0) paste0(sprintf("%02d", minutes), "m ") else "",
+ paste0(sprintf("%02d", seconds), "s")
+ )
+}
+
+#' Calculate and format ETA for batch operations
+#'
+#' Helper function to calculate estimated time of arrival based on elapsed time
+#' and progress through a batch operation.
+#'
+#' @param current_item current item number (1-based)
+#' @param total_items total number of items to process
+#' @param start_time POSIXct start time of the operation
+#' @return character string with formatted ETA and completion time or empty string if not calculable
+#' @keywords internal
+calculate_eta <- function(current_item, total_items, start_time) {
+ if (current_item <= 1 || total_items <= 1) {
+ return("")
+ }
+
+ elapsed_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
+ if (elapsed_time <= 0) {
+ return("")
+ }
+
+ avg_time_per_item <- elapsed_time / (current_item - 1)
+ remaining_items <- total_items - current_item + 1
+ eta_seconds <- avg_time_per_item * remaining_items
+ estimated_completion_time <- Sys.time() + eta_seconds
+ completion_time_str <- format(estimated_completion_time, "%Y-%m-%d %H:%M:%S")
+
+ paste0(". ETA: ", format_duration(eta_seconds), " (", completion_time_str, ")")
+}
diff --git a/R/textMetadata.R b/R/textMetadata.R
index a6f00ce..8984967 100644
--- a/R/textMetadata.R
+++ b/R/textMetadata.R
@@ -1,4 +1,5 @@
-setGeneric("textMetadata", function(kco, ...) standardGeneric("textMetadata") )
+#' @include logging.R
+setGeneric("textMetadata", function(kco, ...) standardGeneric("textMetadata"))
#' Retrieve metadata for a text, identified by its sigle (id)
#'
@@ -27,48 +28,53 @@
#' }
#'
#' @export
-setMethod("textMetadata", "KorAPConnection",
- function(kco, textSigle, verbose = kco@verbose) {
- # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
- key <- 0
- if (length(textSigle) > 1)
- do.call(bind_rows, Map(function(atomicSigle)
- textMetadata(kco, atomicSigle), textSigle))
- else {
- url <-
- paste0(kco@apiUrl, 'corpus/',
- URLencode(enc2utf8(textSigle), reserved = TRUE))
- log_info(verbose, "Getting metadata for ", textSigle, sep = "")
- res <- apiCall(kco, url)
- log_info(verbose, ifelse(is.null(res) || "errors" %in% names(res), " [error]\n", "\n"))
-
- if(is.null(res)) {
- res <- tibble(errors="API request failed")
+setMethod(
+ "textMetadata", "KorAPConnection",
+ function(kco, textSigle, verbose = kco@verbose) {
+ # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
+ key <- 0
+ if (length(textSigle) > 1) {
+ do.call(bind_rows, Map(function(atomicSigle) {
+ textMetadata(kco, atomicSigle)
+ }, textSigle))
} else {
- if ("document" %in% names(res) & "fields" %in% names(res$document) && length(res$document$fields) > 0) {
- res <- as_tibble(res$document$fields) %>%
- dplyr::mutate(across(where(is.list), ~ purrr::map(.x, ~ if (length(.x) < 2) unlist(.x) else paste(.x, collapse = "\\t")))) %>%
- select(key, value) %>%
- tidyr::pivot_wider(names_from = key, values_from = value, names_repair = "unique") %>%
- mutate(
- textSigle = as.character(textSigle),
- requestUrl = url,
- webUIRequestUrl = paste0(kco@KorAPUrl, sprintf('?q=<base/s=t>&cq=textSigle+%%3D+"%s"', url_encode(enc2utf8(textSigle))))) %>%
- mutate(across(everything(), as.character)) %>%
- relocate(textSigle)
+ url <-
+ paste0(
+ kco@apiUrl, "corpus/",
+ URLencode(enc2utf8(textSigle), reserved = TRUE)
+ )
+ log_info(verbose, "Getting metadata for ", textSigle, sep = "")
+ res <- apiCall(kco, url)
+ log_info(verbose, ifelse(is.null(res) || "errors" %in% names(res), " [error]\n", "\n"))
+
+ if (is.null(res)) {
+ res <- tibble(errors = "API request failed")
} else {
- res <- lapply(res, function(x) paste0(x, collapse = "\\t")) # flatten list
- res <- as_tibble(res) %>%
- head(n=1) %>%
- mutate(
- requestUrl = url,
- textSigle = textSigle,
- webUIRequestUrl = paste0(kco@KorAPUrl, sprintf('?q=<base/s=t>&cq=textSigle+%%3D+"%s"', url_encode(enc2utf8(textSigle))))) %>%
- relocate(textSigle)
+ if ("document" %in% names(res) & "fields" %in% names(res$document) && length(res$document$fields) > 0) {
+ res <- as_tibble(res$document$fields) %>%
+ dplyr::mutate(across(where(is.list), ~ purrr::map(.x, ~ if (length(.x) < 2) unlist(.x) else paste(.x, collapse = "\\t")))) %>%
+ select(key, value) %>%
+ tidyr::pivot_wider(names_from = key, values_from = value, names_repair = "unique") %>%
+ mutate(
+ textSigle = as.character(textSigle),
+ requestUrl = url,
+ webUIRequestUrl = paste0(kco@KorAPUrl, sprintf('?q=<base/s=t>&cq=textSigle+%%3D+"%s"', url_encode(enc2utf8(textSigle))))
+ ) %>%
+ mutate(across(everything(), as.character)) %>%
+ relocate(textSigle)
+ } else {
+ res <- lapply(res, function(x) paste0(x, collapse = "\\t")) # flatten list
+ res <- as_tibble(res) %>%
+ head(n = 1) %>%
+ mutate(
+ requestUrl = url,
+ textSigle = textSigle,
+ webUIRequestUrl = paste0(kco@KorAPUrl, sprintf('?q=<base/s=t>&cq=textSigle+%%3D+"%s"', url_encode(enc2utf8(textSigle))))
+ ) %>%
+ relocate(textSigle)
+ }
}
+ res
}
- res
}
-})
-
-
+)
diff --git a/tests/testthat/test-corpusStats-eta.R b/tests/testthat/test-corpusStats-eta.R
index 5a57079..f95ccaa 100644
--- a/tests/testthat/test-corpusStats-eta.R
+++ b/tests/testthat/test-corpusStats-eta.R
@@ -45,7 +45,7 @@
# Test 3: Check that ETA is displayed (format like "ETA: MM:SS" or "ETA: HH:MM:SS")
expect_match(
output_str,
- "ETA: \\d{2}:\\d{2}",
+ "ETA: \\d{2}",
info = "ETA format not found in output"
)