#' Class KorAPCorpusStats
#'
#' A `KorAPCorpusStats` object stores size information about a corpus or a
#' virtual corpus. Objects of this class are returned by the [corpusStats()]
#' method.
#'
#' @include KorAPConnection.R
#' @include logging.R
#'
#' @export
#' @slot vc definition of the virtual corpus
#' @slot tokens number of tokens
#' @slot documents number of documents
#' @slot sentences number of sentences
#' @slot paragraphs number of paragraphs
#' @slot webUIRequestUrl link to the web user interface with the current vc definition
setClass(
  "KorAPCorpusStats",
  slots = c(
    vc = "character",
    documents = "numeric",
    tokens = "numeric",
    sentences = "numeric",
    paragraphs = "numeric",
    webUIRequestUrl = "character"
  )
)

# S4 generic for corpusStats(); dispatches on the class of `kco`. Further
# arguments are passed on to the selected method through `...`.
setGeneric(
  name = "corpusStats",
  def = function(kco, ...) standardGeneric("corpusStats")
)

#' Fetch information about a (virtual) corpus
#'
#' Requests the corpus statistics for `vc` from the API. If `vc` contains more
#' than one element, every element is requested separately and the results are
#' combined into one data frame; with `verbose = TRUE` the progress, the time
#' needed per item and an estimated time of arrival are logged.
#'
#' @param kco [KorAPConnection()] object (obtained e.g. from `KorAPConnection()`)
#' @param vc string describing the virtual corpus. An empty string (default) means the whole corpus, as far as it is license-wise accessible. May also be a character vector of several virtual corpus definitions.
#' @param verbose logical. If `TRUE`, additional diagnostics are printed.
#' @param as.df return result as data frame instead of as S4 object? Has no effect if `vc` has more than one element, in which case a data frame is always returned.
#' @return `KorAPCorpusStats` object with the slots `documents`, `tokens`, `sentences`, `paragraphs`.
#'   If `as.df = TRUE`, or if `vc` has more than one element, a data frame with
#'   the columns `vc` and `webUIRequestUrl` followed by the statistics columns
#'   is returned instead (row-bound over all elements of `vc`).
#'
#' @importFrom urltools url_encode
#' @examples
#' \dontrun{
#'
#' kco <- KorAPConnection()
#' corpusStats(kco, "pubDate in 2017 & textType=/Zeitung.*/")
#' }
#'
#' @aliases corpusStats
#' @export
setMethod("corpusStats", "KorAPConnection", function(kco,
                                                     vc = "",
                                                     verbose = kco@verbose,
                                                     as.df = FALSE) {
  if (length(vc) > 1) {
    # Duration (in seconds) below which the first item is flagged as cached and
    # below which an item is counted as cached in the final summary.
    cache_threshold_secs <- 0.1

    total_items <- length(vc)
    start_time <- Sys.time()
    results <- vector("list", total_items)
    individual_times <- numeric(total_items)

    # Right-align the running item number to the width of the total count.
    item_number_format <- paste0("%", nchar(total_items), "d")

    for (i in seq_along(vc)) {
      current_vc <- vc[i]
      item_start_time <- Sys.time()

      # Truncate long vc strings for display
      vc_display <- if (nchar(current_vc) > 50) {
        paste0(substr(current_vc, 1, 47), "...")
      } else {
        current_vc
      }

      # Process the current virtual corpus; per-item logging is done here, so
      # the nested call is kept quiet.
      results[[i]] <- corpusStats(kco, current_vc, verbose = FALSE, as.df = TRUE)

      # Record individual processing time
      individual_times[i] <- as.numeric(
        difftime(Sys.time(), item_start_time, units = "secs")
      )

      current_item_formatted <- sprintf(item_number_format, i)

      if (i >= 2) {
        # From the second item on, an ETA is derived from the times seen so far
        eta_info <- calculate_sophisticated_eta(individual_times, i, total_items)
        cache_indicator <- get_cache_indicator(eta_info$is_cached)
        eta_display <- format_eta_display(eta_info$eta_seconds, eta_info$estimated_completion_time)

        log_info(verbose, sprintf(
          "Processed vc %s/%d: \"%s\" in %4.1fs%s%s\n",
          current_item_formatted,
          total_items,
          vc_display,
          individual_times[i],
          cache_indicator,
          eta_display
        ))
      } else {
        # First item, show without ETA
        cache_indicator <- get_cache_indicator(individual_times[i] < cache_threshold_secs)
        log_info(verbose, sprintf(
          "Processed vc %s/%d: \"%s\" in %4.1fs%s\n",
          current_item_formatted,
          total_items,
          vc_display,
          individual_times[i],
          cache_indicator
        ))
      }
    }

    # Final timing summary with cache analysis
    if (verbose) {
      total_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
      avg_time_per_item <- total_time / total_items
      cached_count <- sum(individual_times < cache_threshold_secs)
      non_cached_count <- total_items - cached_count

      log_info(verbose, sprintf(
        "Completed processing %d virtual corpora in %s (avg: %4.1fs/item, %d cached, %d non-cached)\n",
        total_items,
        format_duration(total_time),
        avg_time_per_item,
        cached_count,
        non_cached_count
      ))
    }

    do.call(rbind, results)
  } else {
    url <-
      paste0(
        kco@apiUrl,
        "statistics?cq=",
        URLencode(enc2utf8(vc), reserved = TRUE)
      )
    log_info(verbose, "Getting size of virtual corpus \"", vc, "\"", sep = "")
    res <- apiCall(kco, url)
    webUIRequestUrl <- paste0(kco@KorAPUrl, sprintf("?q=<base/s=t>&cq=%s", url_encode(enc2utf8(vc))))
    if (is.null(res)) {
      # No result from the API: fall back to a row of (logical) NAs
      res <- data.frame(documents = NA, tokens = NA, sentences = NA, paragraphs = NA)
    }
    log_info(verbose, ": ", res$tokens, " tokens\n")
    if (as.df) {
      data.frame(vc = vc, webUIRequestUrl = webUIRequestUrl, res, stringsAsFactors = FALSE)
    } else {
      # The slots are numeric, so a logical value (such as the NA placeholder
      # above) is replaced by 0. Each field is checked on its own.
      count_or_zero <- function(x) if (is.logical(x)) 0 else x
      new(
        "KorAPCorpusStats",
        vc = vc,
        documents = count_or_zero(res$documents),
        tokens = count_or_zero(res$tokens),
        sentences = count_or_zero(res$sentences),
        paragraphs = count_or_zero(res$paragraphs),
        webUIRequestUrl = webUIRequestUrl
      )
    }
  }
})

#' @rdname KorAPCorpusStats-class
#' @param object KorAPCorpusStats object
#' @export
setMethod("show", "KorAPCorpusStats", function(object) {
  # The token, sentence and document counts are all numeric slots, so they are
  # all printed the same way: fixed notation, no decimals, "," as thousands
  # separator. Previously only the tokens used this format while the other two
  # used format = "d"; presumably the "f" format was chosen for tokens to cope
  # with counts beyond the integer range -- the same can apply to the others.
  format_count <- function(x) {
    formatC(x, format = "f", digits = 0, big.mark = ",")
  }

  cat("<KorAPCorpusStats>", "\n")
  if (object@vc == "") {
    # An empty vc definition stands for the whole corpus
    cat("The whole corpus")
  } else {
    cat("The virtual corpus described by \"", object@vc, "\"", sep = "")
  }
  cat(
    " contains", format_count(object@tokens), "tokens in",
    format_count(object@sentences), "sentences in",
    format_count(object@documents), "documents.\n"
  )
})