blob: 7789945deb59fdfa39c010f7f60411f36e009240 [file] [log] [blame]
#' Class KorAPCorpusStats
#'
#' A `KorAPCorpusStats` object stores size information about a corpus or
#' virtual corpus. Instances are returned by the [corpusStats()] method.
#'
#' @include KorAPConnection.R
#' @include logging.R
#'
#' @export
#' @slot vc definition of the virtual corpus
#' @slot tokens number of tokens
#' @slot documents number of documents
#' @slot sentences number of sentences
#' @slot paragraphs number of paragraphs
#' @slot webUIRequestUrl link to the web user interface with the current vc definition
setClass(
  "KorAPCorpusStats",
  slots = c(
    vc = "character",
    documents = "numeric",
    tokens = "numeric",
    sentences = "numeric",
    paragraphs = "numeric",
    webUIRequestUrl = "character"
  )
)
Marc Kupietz632cbd42019-09-06 16:04:51 +020017
# S4 generic for fetching corpus statistics; methods dispatch on `kco`.
# The definition body is kept as a bare standardGeneric() call (no braces).
setGeneric(
  name = "corpusStats",
  def = function(kco, ...) standardGeneric("corpusStats")
)
Marc Kupietz636fd392019-09-12 17:58:23 +020019
#' Fetch information about a (virtual) corpus
#'
#' If `vc` contains more than one element, each virtual corpus is queried in
#' turn and the results are combined row-wise into a single data frame
#' (regardless of `as.df`). With `verbose`, per-item timing is logged.
#'
#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`)
#' @param vc string describing the virtual corpus. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
#' @param verbose logical. If `TRUE`, additional diagnostics are printed.
#' @param as.df return result as data frame instead of as S4 object?
#' @return `KorAPCorpusStats` object with the slots `documents`, `tokens`, `sentences`, `paragraphs`;
#'   a data frame if `as.df` is `TRUE` or if `vc` has more than one element.
#'
#' @importFrom urltools url_encode
#' @examples
#' \dontrun{
#'
#' kco <- KorAPConnection()
#' corpusStats(kco, "pubDate in 2017 & textType=/Zeitung.*/")
#' }
#'
#' @aliases corpusStats
#' @export
setMethod("corpusStats", "KorAPConnection", function(kco,
                                                     vc = "",
                                                     verbose = kco@verbose,
                                                     as.df = FALSE) {
  if (length(vc) > 1) {
    # Several virtual corpora: query them one by one and row-bind the results.
    total_items <- length(vc)
    start_time <- Sys.time()
    results <- vector("list", total_items)
    individual_times <- numeric(total_items)
    # Right-align the running index to the width of the total count.
    index_format <- paste0("%", nchar(total_items), "d")

    for (i in seq_along(vc)) {
      current_vc <- vc[i]
      item_start_time <- Sys.time()

      # Truncate long vc strings for display
      vc_display <- if (nchar(current_vc) > 50) {
        paste0(substr(current_vc, 1, 47), "...")
      } else {
        current_vc
      }

      # Process current virtual corpus; the recursive call logs nothing itself
      # because progress is reported here.
      results[[i]] <- corpusStats(kco, current_vc, verbose = FALSE, as.df = TRUE)

      # Record individual processing time
      individual_times[i] <- as.numeric(difftime(Sys.time(), item_start_time, units = "secs"))

      if (i >= 2) {
        # From the second item on, an ETA is derived from the timings so far.
        eta_info <- calculate_sophisticated_eta(individual_times, i, total_items)
        cache_indicator <- get_cache_indicator(eta_info$is_cached)
        eta_display <- format_eta_display(eta_info$eta_seconds, eta_info$estimated_completion_time)
      } else {
        # First item: no ETA yet; responses faster than 0.1 s count as cached.
        cache_indicator <- get_cache_indicator(individual_times[i] < 0.1)
        eta_display <- ""
      }

      log_info(verbose, sprintf(
        "Processed vc %s/%d: \"%s\" in %4.1fs%s%s\n",
        sprintf(index_format, i),
        total_items,
        vc_display,
        individual_times[i],
        cache_indicator,
        eta_display
      ))
    }

    # Final timing summary with cache analysis
    if (verbose) {
      total_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
      avg_time_per_item <- total_time / total_items
      cached_count <- sum(individual_times < 0.1)
      non_cached_count <- total_items - cached_count

      log_info(verbose, sprintf(
        "Completed processing %d virtual corpora in %s (avg: %4.1fs/item, %d cached, %d non-cached)\n",
        total_items,
        format_duration(total_time),
        avg_time_per_item,
        cached_count,
        non_cached_count
      ))
    }

    do.call(rbind, results)
  } else {
    url <-
      paste0(
        kco@apiUrl,
        "statistics?cq=",
        URLencode(enc2utf8(vc), reserved = TRUE)
      )
    log_info(verbose, "Getting size of virtual corpus \"", vc, "\"", sep = "")
    res <- apiCall(kco, url)
    webUIRequestUrl <- paste0(kco@KorAPUrl, sprintf("?q=<base/s=t>&cq=%s", url_encode(enc2utf8(vc))))
    if (is.null(res)) {
      # No result from the API call: fall back to NA placeholders.
      res <- data.frame(documents = NA, tokens = NA, sentences = NA, paragraphs = NA)
    }
    log_info(verbose, ": ", res$tokens, " tokens\n")
    if (as.df) {
      data.frame(vc = vc, webUIRequestUrl = webUIRequestUrl, res, stringsAsFactors = FALSE)
    } else {
      # Logical values (such as the NA placeholders above) are replaced by 0 so
      # that the numeric slots of the S4 object stay valid. Each field is
      # checked on its own value.
      zero_if_logical <- function(value) {
        if (is.logical(value)) 0 else value
      }
      new(
        "KorAPCorpusStats",
        vc = vc,
        documents = zero_if_logical(res$documents),
        tokens = zero_if_logical(res$tokens),
        sentences = zero_if_logical(res$sentences),
        paragraphs = zero_if_logical(res$paragraphs),
        webUIRequestUrl = webUIRequestUrl
      )
    }
  }
})
147
#' @rdname KorAPCorpusStats-class
#' @param object KorAPCorpusStats object
#' @export
setMethod("show", "KorAPCorpusStats", function(object) {
  # Header line, then a one-sentence summary of the corpus size.
  cat("<KorAPCorpusStats>", "\n")

  corpus_label <- if (object@vc == "") {
    "The whole corpus"
  } else {
    paste0("The virtual corpus described by \"", object@vc, "\"")
  }
  cat(corpus_label)

  # Counts are printed with thousands separators.
  tokens_text <- formatC(object@tokens, format = "f", digits = 0, big.mark = ",")
  sentences_text <- formatC(object@sentences, format = "d", big.mark = ",")
  documents_text <- formatC(object@documents, format = "d", big.mark = ",")

  cat(
    " contains", tokens_text, "tokens in",
    sentences_text, "sentences in",
    documents_text, "documents.\n"
  )
})