blob: 9446a4bc8cfa04618533d6e044c081bfd500755f [file] [log] [blame]
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001#' @include logging.R
2setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02003
4#' Collocation analysis
5#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02006#' @family collocation analysis functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +02007#' @aliases collocationAnalysis
8#'
9#' @description
Marc Kupietzdbd431a2021-08-29 12:17:45 +020010#'
11#' Performs a collocation analysis for the given node (or query)
12#' in the given virtual corpus.
13#'
14#' @details
15#' The collocation analysis is currently implemented on the client side, as some of the
16#' functionality is not yet provided by the KorAP backend. Mainly for this reason
17#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
18#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
19#' and look for expression-internal collocates using the focus function (see examples and demo).
20#'
21#' To increase speed at the cost of accuracy and possible false negatives,
22#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
23#'
Marc Kupietze7f0d682025-02-19 10:50:59 +010024#' Note that some outdated non-DeReKo back-ends might not yet support returning tokenized matches (warning issued).
25#' In this case, the client library will fall back to client-side tokenization which might be slightly less accurate.
26#' This might lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
Marc Kupietzdbd431a2021-08-29 12:17:45 +020027#' user interface.
28#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020029#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020030#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietzdbd431a2021-08-29 12:17:45 +020031#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
32#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
33#' @param searchHitsSampleLimit limit the size of the search hits sample
34#' @param stopwords vector of stopwords not to be considered as collocates
Marc Kupietz6bd9cad2024-12-18 15:57:26 +010035#' @param withinSpan KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`.
Marc Kupietzdbd431a2021-08-29 12:17:45 +020036#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
37#' @param seed seed for random page collecting order
Marc Kupietz67edcb52021-09-20 21:54:24 +020038#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
Marc Kupietz7d400e02021-12-19 16:39:36 +010039#' @param maxRecurse apply collocation analysis recursively `maxRecurse` times
40#' @param addExamples If TRUE, examples for instances of collocations will be added in a column `example`. This makes a difference in particular if `node` is given as a lemma query.
41#' @param thresholdScore association score function (see \code{\link{association-score-functions}}) to use for computing the threshold that is applied for recursive collocation analysis calls
42#' @param threshold minimum value of `thresholdScore` function call to apply collocation analysis recursively
43#' @param localStopwords vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010044#' @param collocateFilterRegex allow only collocates matching the regular expression
Marc Kupietzc4540a22025-10-14 17:39:53 +020045#' @param multiVcMissingScoreFactor factor that is multiplied with the minimum observed score when imputing missing scores for delta computations between virtual corpora
Marc Kupietze34a8be2025-10-17 20:13:42 +020046#' @param vcLabel optional label override for the current virtual corpus (used internally when named VC collections are expanded)
Marc Kupietz67edcb52021-09-20 21:54:24 +020047#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietzdbd431a2021-08-29 12:17:45 +020048#' @inheritParams collocationScoreQuery,KorAPConnection-method
49#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
50#'
Marc Kupietzc4540a22025-10-14 17:39:53 +020051#' @importFrom dplyr arrange desc slice_head bind_rows group_by mutate ungroup left_join select row_number all_of first
Marc Kupietzdbd431a2021-08-29 12:17:45 +020052#' @importFrom purrr pmap
Marc Kupietzc4540a22025-10-14 17:39:53 +020053#' @importFrom tidyr expand_grid pivot_wider
54#' @importFrom rlang sym
Marc Kupietzdbd431a2021-08-29 12:17:45 +020055#'
56#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020057#' \dontrun{
58#'
Marc Kupietz6dfeed92025-06-03 11:58:06 +020059#' # Find top collocates of "Packung" inside and outside the sports domain.
60#' KorAPConnection(verbose = TRUE) |>
61#' collocationAnalysis("Packung",
62#' vc = c("textClass=sport", "textClass!=sport"),
63#' leftContextSize = 1, rightContextSize = 1, topCollocatesLimit = 20
64#' ) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020065#' dplyr::filter(logDice >= 5)
66#' }
67#'
Marc Kupietz6ae76052021-09-21 10:34:00 +020068#' \dontrun{
69#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020070#' # Identify the most prominent light verb construction with "in ... setzen".
71#' # Note that, currently, the use of focus function disallows exactFrequencies.
Marc Kupietz4cd066d2025-02-28 15:48:23 +010072#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020073#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
Marc Kupietz6dfeed92025-06-03 11:58:06 +020074#' leftContextSize = 1, rightContextSize = 0, exactFrequencies = FALSE, topCollocatesLimit = 20
75#' )
Marc Kupietzdbd431a2021-08-29 12:17:45 +020076#' }
77#'
78#' @export
Marc Kupietz6dfeed92025-06-03 11:58:06 +020079setMethod(
80 "collocationAnalysis", "KorAPConnection",
81 function(kco,
82 node,
83 vc = "",
84 lemmatizeNodeQuery = FALSE,
85 minOccur = 5,
86 leftContextSize = 5,
87 rightContextSize = 5,
88 topCollocatesLimit = 200,
89 searchHitsSampleLimit = 20000,
90 ignoreCollocateCase = FALSE,
91 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
92 exactFrequencies = TRUE,
93 stopwords = append(RKorAPClient::synsemanticStopwords(), node),
94 seed = 7,
95 expand = length(vc) != length(node),
96 maxRecurse = 0,
97 addExamples = FALSE,
98 thresholdScore = "logDice",
99 threshold = 2.0,
100 localStopwords = c(),
101 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzc4540a22025-10-14 17:39:53 +0200102 multiVcMissingScoreFactor = 0.9,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200103 vcLabel = NA_character_,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200104 ...) {
Marc Kupietzb2862d42025-10-18 10:17:49 +0200105 word <- frequency <- O <- NULL
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200106
Marc Kupietzb2862d42025-10-18 10:17:49 +0200107 if (!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nzchar(withinSpan))) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200108 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
109 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200110
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200111 warnIfNotAuthorized(kco)
Marc Kupietz581a29b2021-09-04 20:51:04 +0200112
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200113 if (lemmatizeNodeQuery) {
114 node <- lemmatizeWordQuery(node)
115 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200116
Marc Kupietze34a8be2025-10-17 20:13:42 +0200117 vcNames <- names(vc)
Marc Kupietze34a8be2025-10-17 20:13:42 +0200118 if (is.null(vcNames)) {
119 vcNames <- rep(NA_character_, length(vc))
Marc Kupietze34a8be2025-10-17 20:13:42 +0200120 }
121
122 label_lookup <- NULL
Marc Kupietzb2862d42025-10-18 10:17:49 +0200123 if (!is.null(names(vc)) && length(vc) > 0) {
124 raw_names <- names(vc)
125 if (any(!is.na(raw_names) & raw_names != "")) {
126 label_lookup <- stats::setNames(raw_names, vc)
127 }
Marc Kupietze34a8be2025-10-17 20:13:42 +0200128 }
129
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200130 result <- if (length(node) > 1 || length(vc) > 1) {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200131 grid <- if (expand) {
Marc Kupietzb2862d42025-10-18 10:17:49 +0200132 tmp_grid <- tidyr::expand_grid(node = node, idx = seq_along(vc))
133 tmp_grid$vc <- vc[tmp_grid$idx]
134 tmp_grid$vcLabel <- vcNames[tmp_grid$idx]
135 tmp_grid[, c("node", "vc", "vcLabel"), drop = FALSE]
Marc Kupietze34a8be2025-10-17 20:13:42 +0200136 } else {
137 tibble(node = node, vc = vc, vcLabel = vcNames)
138 }
139
140 multi_result <- purrr::pmap(grid, function(node, vc, vcLabel, ...) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200141 collocationAnalysis(kco,
142 node = node,
143 vc = vc,
144 minOccur = minOccur,
145 leftContextSize = leftContextSize,
146 rightContextSize = rightContextSize,
147 topCollocatesLimit = topCollocatesLimit,
148 searchHitsSampleLimit = searchHitsSampleLimit,
149 ignoreCollocateCase = ignoreCollocateCase,
150 withinSpan = withinSpan,
151 exactFrequencies = exactFrequencies,
152 stopwords = stopwords,
153 addExamples = TRUE,
154 localStopwords = localStopwords,
155 seed = seed,
156 expand = expand,
Marc Kupietzc4540a22025-10-14 17:39:53 +0200157 multiVcMissingScoreFactor = multiVcMissingScoreFactor,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200158 collocateFilterRegex = collocateFilterRegex,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200159 vcLabel = vcLabel,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200160 ...
161 )
162 }) |>
Marc Kupietze31322e2025-10-17 18:55:36 +0200163 bind_rows()
164
165 if (!"vc" %in% names(multi_result) || nrow(multi_result) == 0) {
166 multi_result
167 } else {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200168 if (!"label" %in% names(multi_result)) {
169 multi_result$label <- NA_character_
170 }
171
172 if (!is.null(label_lookup)) {
173 override <- unname(label_lookup[multi_result$vc])
174 missing_idx <- is.na(multi_result$label) | multi_result$label == ""
175 if (any(missing_idx)) {
176 multi_result$label[missing_idx] <- override[missing_idx]
177 }
178 }
179
180 missing_idx <- is.na(multi_result$label) | multi_result$label == ""
181 if (any(missing_idx)) {
182 multi_result$label[missing_idx] <- queryStringToLabel(multi_result$vc[missing_idx])
183 }
184
Marc Kupietze31322e2025-10-17 18:55:36 +0200185 multi_result |>
Marc Kupietze31322e2025-10-17 18:55:36 +0200186 add_multi_vc_comparisons(thresholdScore = thresholdScore, missingScoreFactor = multiVcMissingScoreFactor)
187 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200188 } else {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200189 if ((is.na(vcLabel) || vcLabel == "") && length(vcNames) >= 1) {
190 vcLabel <- vcNames[1]
191 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200192
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200193 set.seed(seed)
194 candidates <- collocatesQuery(
195 kco,
196 node,
197 vc = vc,
198 minOccur = minOccur,
199 leftContextSize = leftContextSize,
200 rightContextSize = rightContextSize,
201 searchHitsSampleLimit = searchHitsSampleLimit,
202 ignoreCollocateCase = ignoreCollocateCase,
203 stopwords = append(stopwords, localStopwords),
Marc Kupietzb2862d42025-10-18 10:17:49 +0200204 collocateFilterRegex = collocateFilterRegex,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200205 ...
206 )
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200207
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200208 if (nrow(candidates) > 0) {
209 candidates <- candidates |>
210 filter(frequency >= minOccur) |>
211 slice_head(n = topCollocatesLimit)
212 collocationScoreQuery(
213 kco,
214 node = node,
215 collocate = candidates$word,
216 vc = vc,
217 leftContextSize = leftContextSize,
218 rightContextSize = rightContextSize,
219 observed = if (exactFrequencies) NA else candidates$frequency,
220 ignoreCollocateCase = ignoreCollocateCase,
221 withinSpan = withinSpan,
222 ...
223 ) |>
224 filter(O >= minOccur) |>
225 dplyr::arrange(dplyr::desc(logDice))
226 } else {
227 tibble()
228 }
229 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200230
231 if (!is.na(vcLabel) && vcLabel != "" && "label" %in% names(result)) {
232 result$label <- rep(vcLabel, nrow(result))
233 }
234
235 threshold_col <- thresholdScore
236 if (maxRecurse > 0 && nrow(result) > 0 && threshold_col %in% names(result)) {
237 threshold_values <- result[[threshold_col]]
238 eligible_idx <- which(!is.na(threshold_values) & threshold_values >= threshold)
239 if (length(eligible_idx) > 0) {
240 recurseWith <- result[eligible_idx, , drop = FALSE]
241 result <- collocationAnalysis(
242 kco,
243 node = paste0("(", buildCollocationQuery(
244 removeWithinSpan(recurseWith$node, withinSpan),
245 recurseWith$collocate,
246 leftContextSize = leftContextSize,
247 rightContextSize = rightContextSize,
248 withinSpan = ""
249 ), ")"),
250 vc = vc,
251 minOccur = minOccur,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200252 leftContextSize = leftContextSize,
253 rightContextSize = rightContextSize,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200254 withinSpan = withinSpan,
255 maxRecurse = maxRecurse - 1,
256 stopwords = stopwords,
257 localStopwords = recurseWith$collocate,
258 exactFrequencies = exactFrequencies,
259 searchHitsSampleLimit = searchHitsSampleLimit,
260 topCollocatesLimit = topCollocatesLimit,
261 addExamples = FALSE,
262 multiVcMissingScoreFactor = multiVcMissingScoreFactor,
263 collocateFilterRegex = collocateFilterRegex,
264 vcLabel = vcLabel
265 ) |>
266 bind_rows(result) |>
267 filter(logDice >= 2) |>
268 filter(O >= minOccur) |>
269 dplyr::arrange(dplyr::desc(logDice))
270 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200271 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200272
273 if (addExamples && nrow(result) > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200274 result$query <- buildCollocationQuery(
275 result$node,
276 result$collocate,
277 leftContextSize = leftContextSize,
278 rightContextSize = rightContextSize,
279 withinSpan = withinSpan
280 )
281 result$example <- findExample(
282 kco,
283 query = result$query,
284 vc = result$vc
285 )
286 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200287
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200288 result
289 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200290)
291
Marc Kupietz76b05592021-12-19 16:26:15 +0100292# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100293removeWithinSpan <- function(query, withinSpan) {
294 if (withinSpan == "") {
295 return(query)
296 }
297 needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200298 res <- gsub(needle, "\\1", query)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100299 needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200300 res <- gsub(needle, "\\1", res)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100301 return(res)
302}
303
Marc Kupietzc4540a22025-10-14 17:39:53 +0200304add_multi_vc_comparisons <- function(result, thresholdScore, missingScoreFactor) {
Marc Kupietz28a29842025-10-18 12:25:09 +0200305 label <- node <- collocate <- NULL
Marc Kupietzc4540a22025-10-14 17:39:53 +0200306
307 if (!"label" %in% names(result) || dplyr::n_distinct(result$label) < 2) {
308 return(result)
309 }
310
311 numeric_cols <- names(result)[vapply(result, is.numeric, logical(1))]
312 non_score_cols <- c("N", "O", "O1", "O2", "E", "w", "leftContextSize", "rightContextSize", "frequency")
313 score_cols <- setdiff(numeric_cols, non_score_cols)
314
315 if (length(score_cols) == 0) {
316 return(result)
317 }
318
Marc Kupietzc4540a22025-10-14 17:39:53 +0200319 comparison <- result |>
Marc Kupietz28a29842025-10-18 12:25:09 +0200320 dplyr::select(node, collocate, label, dplyr::all_of(score_cols)) |>
321 tidyr::pivot_wider(
Marc Kupietzc4540a22025-10-14 17:39:53 +0200322 names_from = label,
Marc Kupietz28a29842025-10-18 12:25:09 +0200323 values_from = dplyr::all_of(score_cols),
Marc Kupietzc4540a22025-10-14 17:39:53 +0200324 names_glue = "{.value}_{make.names(label)}",
325 values_fn = dplyr::first
326 )
327
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200328 raw_labels <- unique(result$label)
329 labels <- make.names(raw_labels)
330 label_map <- stats::setNames(raw_labels, labels)
Marc Kupietzc4540a22025-10-14 17:39:53 +0200331
Marc Kupietz28a29842025-10-18 12:25:09 +0200332 rank_data <- result |>
333 dplyr::distinct(node, collocate)
334
335 for (i in seq_along(raw_labels)) {
336 raw_lab <- raw_labels[i]
337 safe_lab <- labels[i]
338 label_df <- result[result$label == raw_lab, c("node", "collocate", score_cols), drop = FALSE]
339 if (nrow(label_df) == 0) {
340 next
341 }
342 label_df <- dplyr::distinct(label_df)
343 rank_tbl <- label_df[, c("node", "collocate"), drop = FALSE]
344 for (col in score_cols) {
345 rank_col_name <- paste0("rank_", safe_lab, "_", col)
346 values <- label_df[[col]]
347 ranks <- rep(NA_real_, length(values))
348 valid_idx <- which(!is.na(values))
349 if (length(valid_idx) > 0) {
350 ranks[valid_idx] <- rank(-values[valid_idx], ties.method = "first")
351 }
352 rank_tbl[[rank_col_name]] <- ranks
353 }
354 rank_data <- dplyr::left_join(rank_data, rank_tbl, by = c("node", "collocate"))
355 }
356
357 comparison <- dplyr::left_join(comparison, rank_data, by = c("node", "collocate"))
358
359 rank_replacements <- numeric(0)
360 rank_column_names <- grep("^rank_", names(comparison), value = TRUE)
361 if (length(rank_column_names) > 0) {
362 rank_replacements <- stats::setNames(
363 vapply(rank_column_names, function(col) {
364 col_values <- comparison[[col]]
365 valid_values <- col_values[!is.na(col_values)]
366 if (length(valid_values) == 0) {
367 nrow(comparison) + 1
368 } else {
369 suppressWarnings(max(valid_values, na.rm = TRUE)) + 1
370 }
371 }, numeric(1)),
372 rank_column_names
373 )
374 }
375
376 collapse_label_values <- function(indices, safe_labels_vec) {
377 if (length(indices) == 0) {
378 return(NA_character_)
379 }
380 labs <- label_map[safe_labels_vec[indices]]
381 fallback <- safe_labels_vec[indices]
382 labs[is.na(labs) | labs == ""] <- fallback[is.na(labs) | labs == ""]
383 labs <- labs[!is.na(labs) & labs != ""]
384 if (length(labs) == 0) {
385 return(NA_character_)
386 }
387 paste(unique(labs), collapse = ", ")
388 }
389
Marc Kupietzc4540a22025-10-14 17:39:53 +0200390 if (length(labels) == 2) {
391 fill_scores <- function(x, y) {
392 min_val <- suppressWarnings(min(c(x, y), na.rm = TRUE))
393 if (!is.finite(min_val)) {
394 min_val <- 0
395 }
396 x[is.na(x)] <- missingScoreFactor * min_val
397 y[is.na(y)] <- missingScoreFactor * min_val
398 list(x = x, y = y)
399 }
400
Marc Kupietz28a29842025-10-18 12:25:09 +0200401 fill_ranks <- function(x, y, left_rank_col, right_rank_col) {
402 fallback <- nrow(comparison) + 1
403 replacement_left <- rank_replacements[[left_rank_col]]
404 if (is.null(replacement_left) || !is.finite(replacement_left)) {
405 replacement_left <- fallback
Marc Kupietzc4540a22025-10-14 17:39:53 +0200406 }
Marc Kupietz28a29842025-10-18 12:25:09 +0200407 replacement_right <- rank_replacements[[right_rank_col]]
408 if (is.null(replacement_right) || !is.finite(replacement_right)) {
409 replacement_right <- fallback
410 }
411 if (any(is.na(x))) {
412 x[is.na(x)] <- replacement_left
413 }
414 if (any(is.na(y))) {
415 y[is.na(y)] <- replacement_right
416 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200417 list(x = x, y = y)
418 }
419
420 left_label <- labels[1]
421 right_label <- labels[2]
422
423 for (col in score_cols) {
424 left_col <- paste0(col, "_", left_label)
425 right_col <- paste0(col, "_", right_label)
426 if (!all(c(left_col, right_col) %in% names(comparison))) {
427 next
428 }
429 filled <- fill_scores(comparison[[left_col]], comparison[[right_col]])
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200430 comparison[[left_col]] <- filled$x
431 comparison[[right_col]] <- filled$y
Marc Kupietzc4540a22025-10-14 17:39:53 +0200432 comparison[[paste0("delta_", col)]] <- filled$x - filled$y
Marc Kupietz28a29842025-10-18 12:25:09 +0200433 rank_left <- paste0("rank_", left_label, "_", col)
434 rank_right <- paste0("rank_", right_label, "_", col)
435 if (all(c(rank_left, rank_right) %in% names(comparison))) {
436 filled_rank <- fill_ranks(
437 comparison[[rank_left]],
438 comparison[[rank_right]],
439 rank_left,
440 rank_right
441 )
442 comparison[[paste0("delta_rank_", col)]] <- filled_rank$x - filled_rank$y
443 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200444 }
445 }
446
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200447 for (col in score_cols) {
448 value_cols <- paste0(col, "_", labels)
449 existing <- value_cols %in% names(comparison)
450 if (!any(existing)) {
451 next
452 }
453 value_cols <- value_cols[existing]
454 safe_labels <- labels[existing]
455
456 score_values <- comparison[, value_cols, drop = FALSE]
457
458 winner_label_col <- paste0("winner_", col)
459 winner_value_col <- paste0("winner_", col, "_value")
460 runner_label_col <- paste0("runner_up_", col)
461 runner_value_col <- paste0("runner_up_", col, "_value")
Marc Kupietzb2862d42025-10-18 10:17:49 +0200462 loser_label_col <- paste0("loser_", col)
463 loser_value_col <- paste0("loser_", col, "_value")
464 max_delta_col <- paste0("max_delta_", col)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200465
466 if (nrow(score_values) == 0) {
467 comparison[[winner_label_col]] <- character(0)
468 comparison[[winner_value_col]] <- numeric(0)
469 comparison[[runner_label_col]] <- character(0)
470 comparison[[runner_value_col]] <- numeric(0)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200471 comparison[[loser_label_col]] <- character(0)
472 comparison[[loser_value_col]] <- numeric(0)
473 comparison[[max_delta_col]] <- numeric(0)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200474 next
475 }
476
477 score_matrix <- as.matrix(score_values)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200478 storage.mode(score_matrix) <- "numeric"
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200479
Marc Kupietzb2862d42025-10-18 10:17:49 +0200480 n_rows <- nrow(score_matrix)
481 winner_labels <- rep(NA_character_, n_rows)
482 winner_values <- rep(NA_real_, n_rows)
483 runner_labels <- rep(NA_character_, n_rows)
484 runner_values <- rep(NA_real_, n_rows)
485 loser_labels <- rep(NA_character_, n_rows)
486 loser_values <- rep(NA_real_, n_rows)
487 max_deltas <- rep(NA_real_, n_rows)
488
Marc Kupietzb2862d42025-10-18 10:17:49 +0200489 if (n_rows > 0) {
490 for (i in seq_len(n_rows)) {
491 numeric_row <- as.numeric(score_matrix[i, ])
492 if (all(is.na(numeric_row))) {
493 next
494 }
495
496 min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
497 if (!is.finite(min_val)) {
498 min_val <- 0
499 }
500 numeric_row[is.na(numeric_row)] <- missingScoreFactor * min_val
501 score_matrix[i, ] <- numeric_row
502
503 max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
504 max_idx <- which(numeric_row == max_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200505 winner_labels[i] <- collapse_label_values(max_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200506 winner_values[i] <- max_val
507
508 unique_vals <- sort(unique(numeric_row), decreasing = TRUE)
509 if (length(unique_vals) >= 2) {
510 runner_val <- unique_vals[2]
511 runner_idx <- which(numeric_row == runner_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200512 runner_labels[i] <- collapse_label_values(runner_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200513 runner_values[i] <- runner_val
514 }
515
516 min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
517 min_idx <- which(numeric_row == min_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200518 loser_labels[i] <- collapse_label_values(min_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200519 loser_values[i] <- min_val
520
521 if (is.finite(max_val) && is.finite(min_val)) {
522 max_deltas[i] <- max_val - min_val
523 }
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200524 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200525 }
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200526
Marc Kupietzb2862d42025-10-18 10:17:49 +0200527 comparison[, value_cols] <- score_matrix
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200528 comparison[[winner_label_col]] <- winner_labels
529 comparison[[winner_value_col]] <- winner_values
530 comparison[[runner_label_col]] <- runner_labels
531 comparison[[runner_value_col]] <- runner_values
Marc Kupietzb2862d42025-10-18 10:17:49 +0200532 comparison[[loser_label_col]] <- loser_labels
533 comparison[[loser_value_col]] <- loser_values
534 comparison[[max_delta_col]] <- max_deltas
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200535 }
536
Marc Kupietz28a29842025-10-18 12:25:09 +0200537 for (col in score_cols) {
538 rank_cols <- paste0("rank_", labels, "_", col)
539 existing <- rank_cols %in% names(comparison)
540 if (!any(existing)) {
541 next
542 }
543 rank_cols <- rank_cols[existing]
544 safe_labels <- labels[existing]
545 rank_values <- comparison[, rank_cols, drop = FALSE]
546
547 winner_rank_label_col <- paste0("winner_rank_", col)
548 winner_rank_value_col <- paste0("winner_rank_", col, "_value")
549 runner_rank_label_col <- paste0("runner_up_rank_", col)
550 runner_rank_value_col <- paste0("runner_up_rank_", col, "_value")
551 loser_rank_label_col <- paste0("loser_rank_", col)
552 loser_rank_value_col <- paste0("loser_rank_", col, "_value")
553 max_delta_rank_col <- paste0("max_delta_rank_", col)
554
555 if (nrow(rank_values) == 0) {
556 comparison[[winner_rank_label_col]] <- character(0)
557 comparison[[winner_rank_value_col]] <- numeric(0)
558 comparison[[runner_rank_label_col]] <- character(0)
559 comparison[[runner_rank_value_col]] <- numeric(0)
560 comparison[[loser_rank_label_col]] <- character(0)
561 comparison[[loser_rank_value_col]] <- numeric(0)
562 comparison[[max_delta_rank_col]] <- numeric(0)
563 next
564 }
565
566 rank_matrix <- as.matrix(rank_values)
567 storage.mode(rank_matrix) <- "numeric"
568
569 n_rows <- nrow(rank_matrix)
570 winner_labels <- rep(NA_character_, n_rows)
571 winner_values <- rep(NA_real_, n_rows)
572 runner_labels <- rep(NA_character_, n_rows)
573 runner_values <- rep(NA_real_, n_rows)
574 loser_labels <- rep(NA_character_, n_rows)
575 loser_values <- rep(NA_real_, n_rows)
576 max_deltas <- rep(NA_real_, n_rows)
577
578 for (i in seq_len(n_rows)) {
579 numeric_row <- as.numeric(rank_matrix[i, ])
580 if (all(is.na(numeric_row))) {
581 next
582 }
583
584 if (length(rank_cols) > 0) {
585 replacement_vec <- rank_replacements[rank_cols]
586 replacement_vec[is.na(replacement_vec)] <- nrow(comparison) + 1
587 missing_idx <- which(is.na(numeric_row))
588 if (length(missing_idx) > 0) {
589 numeric_row[missing_idx] <- replacement_vec[missing_idx]
590 }
591 }
592
593 valid_idx <- seq_along(numeric_row)
594 valid_values <- numeric_row[valid_idx]
595 min_val <- suppressWarnings(min(valid_values, na.rm = TRUE))
596 min_positions <- valid_idx[which(valid_values == min_val)]
597 winner_labels[i] <- collapse_label_values(min_positions, safe_labels)
598 winner_values[i] <- min_val
599
600 ordered_vals <- sort(unique(valid_values), decreasing = FALSE)
601 if (length(ordered_vals) >= 2) {
602 runner_val <- ordered_vals[2]
603 runner_positions <- valid_idx[which(valid_values == runner_val)]
604 runner_labels[i] <- collapse_label_values(runner_positions, safe_labels)
605 runner_values[i] <- runner_val
606 }
607
608 max_val <- suppressWarnings(max(valid_values, na.rm = TRUE))
609 max_positions <- valid_idx[which(valid_values == max_val)]
610 loser_labels[i] <- collapse_label_values(max_positions, safe_labels)
611 loser_values[i] <- max_val
612
613 if (is.finite(max_val) && is.finite(min_val)) {
614 max_deltas[i] <- max_val - min_val
615 }
616 }
617
618 comparison[[winner_rank_label_col]] <- winner_labels
619 comparison[[winner_rank_value_col]] <- winner_values
620 comparison[[runner_rank_label_col]] <- runner_labels
621 comparison[[runner_rank_value_col]] <- runner_values
622 comparison[[loser_rank_label_col]] <- loser_labels
623 comparison[[loser_rank_value_col]] <- loser_values
624 comparison[[max_delta_rank_col]] <- max_deltas
625 }
626
Marc Kupietzc4540a22025-10-14 17:39:53 +0200627 dplyr::left_join(result, comparison, by = c("node", "collocate"))
628}
629
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200630#' @importFrom magrittr debug_pipe
Marc Kupietz2b17b212023-08-27 17:47:26 +0200631#' @importFrom stringr str_detect
632#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
633#'
634matches2FreqTable <- function(matches,
635 index = 0,
636 minOccur = 5,
637 leftContextSize = 5,
638 rightContextSize = 5,
639 ignoreCollocateCase = FALSE,
640 stopwords = c(),
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200641 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietz2b17b212023-08-27 17:47:26 +0200642 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
643 verbose = TRUE) {
644 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
645 frequency <- NULL
646
647 if (nrow(matches) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200648 dplyr::tibble(word = c(), frequency = c())
Marc Kupietz2b17b212023-08-27 17:47:26 +0200649 } else if (index == 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200650 if (!"tokens" %in% colnames(matches) || !is.list(matches$tokens)) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200651 log_info(verbose, "Outdated KorAP server: Falling back to client side tokenization.\n")
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200652 return(snippet2FreqTable(matches$snippet, minOccur, leftContextSize, rightContextSize,
653 ignoreCollocateCase = ignoreCollocateCase,
654 stopwords = stopwords, oldTable = oldTable, verbose = verbose
655 ))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200656 }
657 log_info(verbose, paste("Joining", nrow(matches), "kwics\n"))
Marc Kupietza25fbd92025-10-14 17:38:09 +0200658 for (i in seq_len(nrow(matches))) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200659 oldTable <- matches2FreqTable(
660 matches,
661 i,
662 leftContextSize = leftContextSize,
663 rightContextSize = rightContextSize,
664 collocateFilterRegex = collocateFilterRegex,
665 oldTable = oldTable,
666 stopwords = stopwords
667 )
668 }
669 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200670 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100671 group_by(word) |>
672 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200673 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +0200674 arrange(desc(frequency))
675 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200676 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietz2b17b212023-08-27 17:47:26 +0200677
678 left <- tail(unlist(matches$tokens$left[index]), leftContextSize)
679
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200680 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200681
682 right <- head(unlist(matches$tokens$right[index]), rightContextSize)
683
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200684 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200685
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200686 if (length(left) + length(right) == 0) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200687 oldTable
688 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100689 table(c(left, right)) |>
690 dplyr::as_tibble(.name_repair = "minimal") |>
691 dplyr::rename(word = 1, frequency = 2) |>
692 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200693 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +0200694 dplyr::bind_rows(oldTable)
695 }
696 }
697}
698
699#' @importFrom magrittr debug_pipe
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200700#' @importFrom stringr str_match str_split str_detect
701#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
702#'
703snippet2FreqTable <- function(snippet,
704 minOccur = 5,
705 leftContextSize = 5,
706 rightContextSize = 5,
707 ignoreCollocateCase = FALSE,
708 stopwords = c(),
709 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200710 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200711 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
712 verbose = TRUE) {
713 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
714 frequency <- NULL
715
716 if (length(snippet) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200717 dplyr::tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200718 } else if (length(snippet) > 1) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200719 log_info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200720 for (s in snippet) {
721 oldTable <- snippet2FreqTable(
722 s,
723 leftContextSize = leftContextSize,
724 rightContextSize = rightContextSize,
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100725 collocateFilterRegex = collocateFilterRegex,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200726 oldTable = oldTable,
727 stopwords = stopwords
728 )
729 }
Marc Kupietza47d1502023-04-18 15:26:47 +0200730 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200731 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100732 group_by(word) |>
733 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200734 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200735 arrange(desc(frequency))
736 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200737 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200738 match <-
739 str_match(
740 snippet,
741 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
742 )
743
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200744 left <- if (leftContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200745 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200746 } else {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200747 ""
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200748 }
749 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200750
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200751 right <- if (rightContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200752 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200753 } else {
754 ""
755 }
756 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200757
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200758 if (is.na(left[1]) || is.na(right[1]) || length(left) + length(right) == 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200759 oldTable
760 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100761 table(c(left, right)) |>
762 dplyr::as_tibble(.name_repair = "minimal") |>
763 dplyr::rename(word = 1, frequency = 2) |>
764 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200765 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200766 dplyr::bind_rows(oldTable)
767 }
768 }
769}
770
771#' Preliminary synsemantic stopwords function
772#'
773#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +0200774#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200775#'
776#' Preliminary synsemantic stopwords function to be used in collocation analysis.
777#'
778#' @details
779#' Currently only suitable for German. See stopwords package for other languages.
780#'
781#' @param ... future arguments for language detection
782#'
783#' @family collocation analysis functions
784#' @return Vector of synsemantic stopwords.
785#' @export
786synsemanticStopwords <- function(...) {
787 res <- c(
788 "der",
789 "die",
790 "und",
791 "in",
792 "den",
793 "von",
794 "mit",
795 "das",
796 "zu",
797 "im",
798 "ist",
799 "auf",
800 "sich",
801 "Die",
802 "des",
803 "dem",
804 "nicht",
805 "ein",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100806 "Ein",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200807 "eine",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100808 "Eine",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200809 "es",
810 "auch",
811 "an",
812 "als",
813 "am",
814 "aus",
815 "Der",
816 "bei",
817 "er",
818 "dass",
819 "sie",
820 "nach",
821 "um",
822 "Das",
823 "zum",
824 "noch",
825 "war",
826 "einen",
827 "einer",
828 "wie",
829 "einem",
830 "vor",
831 "bis",
832 "\u00fcber",
833 "so",
834 "aber",
835 "Eine",
836 "diese",
837 "Diese",
838 "oder"
839 )
840 return(res)
841}
842
Marc Kupietz5a336b62021-11-27 17:51:35 +0100843
Marc Kupietz76b05592021-12-19 16:26:15 +0100844# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100845findExample <-
846 function(kco,
847 query,
848 vc = "",
849 matchOnly = TRUE) {
850 out <- character(length = length(query))
851
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200852 if (length(vc) < length(query)) {
Marc Kupietz5a336b62021-11-27 17:51:35 +0100853 vc <- rep(vc, length(query))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200854 }
Marc Kupietz5a336b62021-11-27 17:51:35 +0100855
856 for (i in seq_along(query)) {
857 q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100858 if (q@totalResults > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200859 q <- fetchNext(q, maxFetch = 50, randomizePageOrder = F)
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100860 example <- as.character((q@collectedMatches)$snippet[1])
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200861 out[i] <- if (matchOnly) {
862 gsub(".*<mark>(.+)</mark>.*", "\\1", example)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100863 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200864 stringr::str_replace(example, "<[^>]*>", "")
Marc Kupietz5a336b62021-11-27 17:51:35 +0100865 }
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100866 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200867 out[i] <- ""
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100868 }
Marc Kupietz5a336b62021-11-27 17:51:35 +0100869 }
870 out
871 }
872
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200873collocatesQuery <-
874 function(kco,
875 query,
876 vc = "",
877 minOccur = 5,
878 leftContextSize = 5,
879 rightContextSize = 5,
880 searchHitsSampleLimit = 20000,
881 ignoreCollocateCase = FALSE,
882 stopwords = c(),
Marc Kupietzb2862d42025-10-18 10:17:49 +0200883 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200884 ...) {
885 frequency <- NULL
886 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200887 if (q@totalResults == 0) {
888 tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200889 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200890 q <- fetchNext(q, maxFetch = searchHitsSampleLimit, randomizePageOrder = TRUE)
891 matches2FreqTable(q@collectedMatches,
892 0,
893 minOccur = minOccur,
894 leftContextSize = leftContextSize,
895 rightContextSize = rightContextSize,
896 ignoreCollocateCase = ignoreCollocateCase,
897 stopwords = stopwords,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200898 collocateFilterRegex = collocateFilterRegex,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200899 ...,
900 verbose = kco@verbose
901 ) |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100902 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200903 filter(frequency >= minOccur)
904 }
905 }