blob: 75bd6eb1f842b2435ee1b10a46e4cb12acb77427 [file] [log] [blame]
#' Class KorAPCorpusStats
#'
#' `KorAPCorpusStats` objects can hold information about a corpus or virtual corpus.
#' `KorAPCorpusStats` objects can be obtained by the [corpusStats()] method.
#'
#' @include KorAPConnection.R
#'
#' @export
#' @slot vc definition of the virtual corpus
#' @slot tokens number of tokens
#' @slot documents number of documents
#' @slot sentences number of sentences
#' @slot paragraphs number of paragraphs
#' @slot webUIRequestUrl link to the web user interface with the current vc definition
setClass("KorAPCorpusStats", slots = c(vc = "character", documents = "numeric", tokens = "numeric", sentences = "numeric", paragraphs = "numeric", webUIRequestUrl = "character"))
log_info <- function(v, ...) {
green <- "\033[32m"
reset <- "\033[0m"
cat(ifelse(v, paste0(green, ..., reset), ""))
}
setGeneric("corpusStats", function(kco, ...) standardGeneric("corpusStats"))
#' Fetch information about a (virtual) corpus
#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`
#' @param vc string describing the virtual corpus. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
#' @param verbose logical. If `TRUE`, additional diagnostics are printed.
#' @param as.df return result as data frame instead of as S4 object?
#' @return `KorAPCorpusStats` object with the slots `documents`, `tokens`, `sentences`, `paragraphs`
#'
#' @importFrom urltools url_encode
#' @examples
#' \dontrun{
#'
#' kco <- KorAPConnection()
#' corpusStats(kco, "pubDate in 2017 & textType=/Zeitung.*/")
#' }
#'
#' @aliases corpusStats
# Helper function to format duration with leading zeros
format_duration <- function(seconds) {
if (is.na(seconds) || !is.finite(seconds) || seconds <= 0) {
return("N/A")
}
hours <- floor(seconds / 3600)
minutes <- floor((seconds %% 3600) / 60)
secs <- round(seconds %% 60)
if (hours > 0) {
return(sprintf("%02d:%02d:%02d", hours, minutes, secs))
} else {
return(sprintf("%02d:%02d", minutes, secs))
}
}
#' @export
setMethod("corpusStats", "KorAPConnection", function(kco,
vc = "",
verbose = kco@verbose,
as.df = FALSE) {
if (length(vc) > 1) {
# ETA calculation for multiple virtual corpora
total_items <- length(vc)
start_time <- Sys.time()
results <- list()
individual_times <- numeric(total_items)
for (i in seq_along(vc)) {
current_vc <- vc[i]
item_start_time <- Sys.time()
# Truncate long vc strings for display
vc_display <- if (nchar(current_vc) > 50) {
paste0(substr(current_vc, 1, 47), "...")
} else {
current_vc
}
# Process current virtual corpus
result <- corpusStats(kco, current_vc, verbose = FALSE, as.df = TRUE)
results[[i]] <- result
# Record individual processing time
item_end_time <- Sys.time()
individual_times[i] <- as.numeric(difftime(item_end_time, item_start_time, units = "secs"))
# Format item number with proper alignment
current_item_formatted <- sprintf(paste0("%", nchar(total_items), "d"), i)
# Calculate timing and ETA after first few items, using cache-aware approach
if (i >= 2) {
# Use recent non-cached times for better ETA estimates
# Exclude very fast responses (< 0.1s) as likely cached
non_cached_times <- individual_times[1:i][individual_times[1:i] >= 0.1]
if (length(non_cached_times) >= 1) {
# Use median of recent non-cached times for more stable estimates
recent_window <- min(5, length(non_cached_times))
recent_times <- tail(non_cached_times, recent_window)
time_per_item <- median(recent_times)
remaining_items <- total_items - i
eta_seconds <- time_per_item * remaining_items
estimated_completion_time <- Sys.time() + eta_seconds
# Show current item time and cache status
cache_indicator <- if (individual_times[i] < 0.1) " [cached]" else ""
log_info(verbose, sprintf(
"Processed vc %s/%d: \"%s\" in %4.1fs%s, ETA: %s (%s\n",
current_item_formatted,
total_items,
vc_display,
individual_times[i],
cache_indicator,
format_duration(eta_seconds),
format(estimated_completion_time, "%Y-%m-%d %H:%M:%S)")
))
} else {
# All responses so far appear cached, show without ETA
cache_indicator <- if (individual_times[i] < 0.1) " [cached]" else ""
log_info(verbose, sprintf(
"Processed vc %s/%d: \"%s\" in %4.1fs%s\n",
current_item_formatted,
total_items,
vc_display,
individual_times[i],
cache_indicator
))
}
} else {
# Log first item without ETA
cache_indicator <- if (individual_times[i] < 0.1) " [cached]" else ""
log_info(verbose, sprintf(
"Processed VC %s/%d: \"%s\" (%4.1fs%s)\n",
current_item_formatted,
total_items,
vc_display,
individual_times[i],
cache_indicator
))
}
}
# Final timing summary with cache analysis
if (verbose && total_items > 1) {
total_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
avg_time_per_item <- total_time / total_items
cached_count <- sum(individual_times < 0.1)
non_cached_count <- total_items - cached_count
log_info(verbose, sprintf(
"Completed processing %d virtual corpora in %s (avg: %4.1fs/item, %d cached, %d non-cached)\n",
total_items,
format_duration(total_time),
avg_time_per_item,
cached_count,
non_cached_count
))
}
do.call(rbind, results)
} else {
url <-
paste0(
kco@apiUrl,
"statistics?cq=",
URLencode(enc2utf8(vc), reserved = TRUE)
)
log_info(verbose, "Getting size of virtual corpus \"", vc, "\"", sep = "")
res <- apiCall(kco, url)
webUIRequestUrl <- paste0(kco@KorAPUrl, sprintf("?q=<base/s=t>&cq=%s", url_encode(enc2utf8(vc))))
if (is.null(res)) {
res <- data.frame(documents = NA, tokens = NA, sentences = NA, paragraphs = NA)
}
log_info(verbose, ": ", res$tokens, " tokens\n")
if (as.df) {
data.frame(vc = vc, webUIRequestUrl = webUIRequestUrl, res, stringsAsFactors = FALSE)
} else {
new(
"KorAPCorpusStats",
vc = vc,
documents = ifelse(is.logical(res$documents), 0, res$documents),
tokens = ifelse(is.logical(res$tokens), 0, res$tokens),
sentences = ifelse(is.logical(res$documents), 0, res$sentences),
paragraphs = ifelse(is.logical(res$paragraphs), 0, res$paragraphs),
webUIRequestUrl = webUIRequestUrl
)
}
}
})
#' @rdname KorAPCorpusStats-class
#' @param object KorAPCorpusStats object
#' @export
setMethod("show", "KorAPCorpusStats", function(object) {
cat("<KorAPCorpusStats>", "\n")
if (object@vc == "") {
cat("The whole corpus")
} else {
cat("The virtual corpus described by \"", object@vc, "\"", sep = "")
}
cat(
" contains", formatC(object@tokens, format = "f", digits = 0, big.mark = ","), "tokens in",
formatC(object@sentences, format = "d", big.mark = ","), "sentences in",
formatC(object@documents, format = "d", big.mark = ","), "documents.\n"
)
})