blob: 67a25cbbcfb62bc293aad7776b75a772b14417cd [file] [log] [blame]
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001#' @include logging.R
2setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02003
4#' Collocation analysis
5#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02006#' @family collocation analysis functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +02007#' @aliases collocationAnalysis
8#'
9#' @description
Marc Kupietzdbd431a2021-08-29 12:17:45 +020010#'
11#' Performs a collocation analysis for the given node (or query)
12#' in the given virtual corpus.
13#'
14#' @details
15#' The collocation analysis is currently implemented on the client side, as some of the
16#' functionality is not yet provided by the KorAP backend. Mainly for this reason
17#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
18#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
19#' and look for expression-internal collocates using the focus function (see examples and demo).
20#'
21#' To increase speed at the cost of accuracy and possible false negatives,
22#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
23#'
Marc Kupietze7f0d682025-02-19 10:50:59 +010024#' Note that some outdated non-DeReKo back-ends might not yet support returning tokenized matches (warning issued).
25#' In this case, the client library will fall back to client-side tokenization which might be slightly less accurate.
26#' This might lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
Marc Kupietzdbd431a2021-08-29 12:17:45 +020027#' user interface.
28#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020029#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020030#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietzdbd431a2021-08-29 12:17:45 +020031#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
32#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
33#' @param searchHitsSampleLimit limit the size of the search hits sample
34#' @param stopwords vector of stopwords not to be considered as collocates
Marc Kupietz6bd9cad2024-12-18 15:57:26 +010035#' @param withinSpan KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`.
Marc Kupietzdbd431a2021-08-29 12:17:45 +020036#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
37#' @param seed seed for random page collecting order
Marc Kupietz67edcb52021-09-20 21:54:24 +020038#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
Marc Kupietz7d400e02021-12-19 16:39:36 +010039#' @param maxRecurse apply collocation analysis recursively `maxRecurse` times
40#' @param addExamples If TRUE, examples for instances of collocations will be added in a column `example`. This makes a difference in particular if `node` is given as a lemma query.
41#' @param thresholdScore association score function (see \code{\link{association-score-functions}}) to use for computing the threshold that is applied for recursive collocation analysis calls
42#' @param threshold minimum value of `thresholdScore` function call to apply collocation analysis recursively
43#' @param localStopwords vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010044#' @param collocateFilterRegex allow only collocates matching the regular expression
Marc Kupietz9894a372025-10-18 14:51:29 +020045#' @param missingScoreQuantile lower quantile (evaluated per association measure) that anchors the adaptive floor used for imputing missing scores between virtual corpora
Marc Kupietze34a8be2025-10-17 20:13:42 +020046#' @param vcLabel optional label override for the current virtual corpus (used internally when named VC collections are expanded)
Marc Kupietz67edcb52021-09-20 21:54:24 +020047#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietzdbd431a2021-08-29 12:17:45 +020048#' @inheritParams collocationScoreQuery,KorAPConnection-method
49#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
50#'
Marc Kupietzc4540a22025-10-14 17:39:53 +020051#' @importFrom dplyr arrange desc slice_head bind_rows group_by mutate ungroup left_join select row_number all_of first
Marc Kupietzdbd431a2021-08-29 12:17:45 +020052#' @importFrom purrr pmap
Marc Kupietzc4540a22025-10-14 17:39:53 +020053#' @importFrom tidyr expand_grid pivot_wider
54#' @importFrom rlang sym
Marc Kupietzdbd431a2021-08-29 12:17:45 +020055#'
56#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020057#' \dontrun{
58#'
Marc Kupietz6dfeed92025-06-03 11:58:06 +020059#' # Find top collocates of "Packung" inside and outside the sports domain.
60#' KorAPConnection(verbose = TRUE) |>
61#' collocationAnalysis("Packung",
62#' vc = c("textClass=sport", "textClass!=sport"),
63#' leftContextSize = 1, rightContextSize = 1, topCollocatesLimit = 20
64#' ) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020065#' dplyr::filter(logDice >= 5)
66#' }
67#'
Marc Kupietz6ae76052021-09-21 10:34:00 +020068#' \dontrun{
69#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020070#' # Identify the most prominent light verb construction with "in ... setzen".
71#' # Note that, currently, the use of focus function disallows exactFrequencies.
Marc Kupietz4cd066d2025-02-28 15:48:23 +010072#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020073#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
Marc Kupietz6dfeed92025-06-03 11:58:06 +020074#' leftContextSize = 1, rightContextSize = 0, exactFrequencies = FALSE, topCollocatesLimit = 20
75#' )
Marc Kupietzdbd431a2021-08-29 12:17:45 +020076#' }
77#'
78#' @export
Marc Kupietz6dfeed92025-06-03 11:58:06 +020079setMethod(
80 "collocationAnalysis", "KorAPConnection",
81 function(kco,
82 node,
83 vc = "",
84 lemmatizeNodeQuery = FALSE,
85 minOccur = 5,
86 leftContextSize = 5,
87 rightContextSize = 5,
88 topCollocatesLimit = 200,
89 searchHitsSampleLimit = 20000,
90 ignoreCollocateCase = FALSE,
91 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
92 exactFrequencies = TRUE,
93 stopwords = append(RKorAPClient::synsemanticStopwords(), node),
94 seed = 7,
95 expand = length(vc) != length(node),
96 maxRecurse = 0,
97 addExamples = FALSE,
98 thresholdScore = "logDice",
99 threshold = 2.0,
100 localStopwords = c(),
101 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietz9894a372025-10-18 14:51:29 +0200102 missingScoreQuantile = 0.05,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200103 vcLabel = NA_character_,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200104 ...) {
Marc Kupietzb2862d42025-10-18 10:17:49 +0200105 word <- frequency <- O <- NULL
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200106
Marc Kupietzb2862d42025-10-18 10:17:49 +0200107 if (!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nzchar(withinSpan))) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200108 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
109 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200110
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200111 warnIfNotAuthorized(kco)
Marc Kupietz581a29b2021-09-04 20:51:04 +0200112
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200113 if (lemmatizeNodeQuery) {
114 node <- lemmatizeWordQuery(node)
115 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200116
Marc Kupietze34a8be2025-10-17 20:13:42 +0200117 vcNames <- names(vc)
Marc Kupietze34a8be2025-10-17 20:13:42 +0200118 if (is.null(vcNames)) {
119 vcNames <- rep(NA_character_, length(vc))
Marc Kupietze34a8be2025-10-17 20:13:42 +0200120 }
121
122 label_lookup <- NULL
Marc Kupietzb2862d42025-10-18 10:17:49 +0200123 if (!is.null(names(vc)) && length(vc) > 0) {
124 raw_names <- names(vc)
125 if (any(!is.na(raw_names) & raw_names != "")) {
126 label_lookup <- stats::setNames(raw_names, vc)
127 }
Marc Kupietze34a8be2025-10-17 20:13:42 +0200128 }
129
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200130 result <- if (length(node) > 1 || length(vc) > 1) {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200131 grid <- if (expand) {
Marc Kupietzb2862d42025-10-18 10:17:49 +0200132 tmp_grid <- tidyr::expand_grid(node = node, idx = seq_along(vc))
133 tmp_grid$vc <- vc[tmp_grid$idx]
134 tmp_grid$vcLabel <- vcNames[tmp_grid$idx]
135 tmp_grid[, c("node", "vc", "vcLabel"), drop = FALSE]
Marc Kupietze34a8be2025-10-17 20:13:42 +0200136 } else {
137 tibble(node = node, vc = vc, vcLabel = vcNames)
138 }
139
140 multi_result <- purrr::pmap(grid, function(node, vc, vcLabel, ...) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200141 collocationAnalysis(kco,
142 node = node,
143 vc = vc,
144 minOccur = minOccur,
145 leftContextSize = leftContextSize,
146 rightContextSize = rightContextSize,
147 topCollocatesLimit = topCollocatesLimit,
148 searchHitsSampleLimit = searchHitsSampleLimit,
149 ignoreCollocateCase = ignoreCollocateCase,
150 withinSpan = withinSpan,
151 exactFrequencies = exactFrequencies,
152 stopwords = stopwords,
153 addExamples = TRUE,
154 localStopwords = localStopwords,
155 seed = seed,
156 expand = expand,
Marc Kupietz9894a372025-10-18 14:51:29 +0200157 missingScoreQuantile = missingScoreQuantile,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200158 collocateFilterRegex = collocateFilterRegex,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200159 vcLabel = vcLabel,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200160 ...
161 )
162 }) |>
Marc Kupietze31322e2025-10-17 18:55:36 +0200163 bind_rows()
164
165 if (!"vc" %in% names(multi_result) || nrow(multi_result) == 0) {
166 multi_result
167 } else {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200168 if (!"label" %in% names(multi_result)) {
169 multi_result$label <- NA_character_
170 }
171
172 if (!is.null(label_lookup)) {
173 override <- unname(label_lookup[multi_result$vc])
174 missing_idx <- is.na(multi_result$label) | multi_result$label == ""
175 if (any(missing_idx)) {
176 multi_result$label[missing_idx] <- override[missing_idx]
177 }
178 }
179
180 missing_idx <- is.na(multi_result$label) | multi_result$label == ""
181 if (any(missing_idx)) {
182 multi_result$label[missing_idx] <- queryStringToLabel(multi_result$vc[missing_idx])
183 }
184
Marc Kupietze31322e2025-10-17 18:55:36 +0200185 multi_result |>
Marc Kupietz9894a372025-10-18 14:51:29 +0200186 add_multi_vc_comparisons(
187 thresholdScore = thresholdScore,
188 missingScoreQuantile = missingScoreQuantile
189 )
Marc Kupietze31322e2025-10-17 18:55:36 +0200190 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200191 } else {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200192 if ((is.na(vcLabel) || vcLabel == "") && length(vcNames) >= 1) {
193 vcLabel <- vcNames[1]
194 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200195
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200196 set.seed(seed)
197 candidates <- collocatesQuery(
198 kco,
199 node,
200 vc = vc,
201 minOccur = minOccur,
202 leftContextSize = leftContextSize,
203 rightContextSize = rightContextSize,
204 searchHitsSampleLimit = searchHitsSampleLimit,
205 ignoreCollocateCase = ignoreCollocateCase,
206 stopwords = append(stopwords, localStopwords),
Marc Kupietzb2862d42025-10-18 10:17:49 +0200207 collocateFilterRegex = collocateFilterRegex,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200208 ...
209 )
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200210
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200211 if (nrow(candidates) > 0) {
212 candidates <- candidates |>
213 filter(frequency >= minOccur) |>
214 slice_head(n = topCollocatesLimit)
215 collocationScoreQuery(
216 kco,
217 node = node,
218 collocate = candidates$word,
219 vc = vc,
220 leftContextSize = leftContextSize,
221 rightContextSize = rightContextSize,
222 observed = if (exactFrequencies) NA else candidates$frequency,
223 ignoreCollocateCase = ignoreCollocateCase,
224 withinSpan = withinSpan,
225 ...
226 ) |>
227 filter(O >= minOccur) |>
228 dplyr::arrange(dplyr::desc(logDice))
229 } else {
230 tibble()
231 }
232 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200233
234 if (!is.na(vcLabel) && vcLabel != "" && "label" %in% names(result)) {
235 result$label <- rep(vcLabel, nrow(result))
236 }
237
238 threshold_col <- thresholdScore
239 if (maxRecurse > 0 && nrow(result) > 0 && threshold_col %in% names(result)) {
240 threshold_values <- result[[threshold_col]]
241 eligible_idx <- which(!is.na(threshold_values) & threshold_values >= threshold)
242 if (length(eligible_idx) > 0) {
243 recurseWith <- result[eligible_idx, , drop = FALSE]
244 result <- collocationAnalysis(
245 kco,
246 node = paste0("(", buildCollocationQuery(
247 removeWithinSpan(recurseWith$node, withinSpan),
248 recurseWith$collocate,
249 leftContextSize = leftContextSize,
250 rightContextSize = rightContextSize,
251 withinSpan = ""
252 ), ")"),
253 vc = vc,
254 minOccur = minOccur,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200255 leftContextSize = leftContextSize,
256 rightContextSize = rightContextSize,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200257 withinSpan = withinSpan,
258 maxRecurse = maxRecurse - 1,
259 stopwords = stopwords,
260 localStopwords = recurseWith$collocate,
261 exactFrequencies = exactFrequencies,
262 searchHitsSampleLimit = searchHitsSampleLimit,
263 topCollocatesLimit = topCollocatesLimit,
264 addExamples = FALSE,
Marc Kupietz9894a372025-10-18 14:51:29 +0200265 missingScoreQuantile = missingScoreQuantile,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200266 collocateFilterRegex = collocateFilterRegex,
267 vcLabel = vcLabel
268 ) |>
269 bind_rows(result) |>
270 filter(logDice >= 2) |>
271 filter(O >= minOccur) |>
272 dplyr::arrange(dplyr::desc(logDice))
273 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200274 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200275
276 if (addExamples && nrow(result) > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200277 result$query <- buildCollocationQuery(
278 result$node,
279 result$collocate,
280 leftContextSize = leftContextSize,
281 rightContextSize = rightContextSize,
282 withinSpan = withinSpan
283 )
284 result$example <- findExample(
285 kco,
286 query = result$query,
287 vc = result$vc
288 )
289 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200290
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200291 result
292 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200293)
294
Marc Kupietz76b05592021-12-19 16:26:15 +0100295# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100296removeWithinSpan <- function(query, withinSpan) {
297 if (withinSpan == "") {
298 return(query)
299 }
300 needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200301 res <- gsub(needle, "\\1", query)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100302 needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200303 res <- gsub(needle, "\\1", res)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100304 return(res)
305}
306
Marc Kupietz9894a372025-10-18 14:51:29 +0200307add_multi_vc_comparisons <- function(result, thresholdScore, missingScoreQuantile = 0.05) {
Marc Kupietz28a29842025-10-18 12:25:09 +0200308 label <- node <- collocate <- NULL
Marc Kupietzc4540a22025-10-14 17:39:53 +0200309
310 if (!"label" %in% names(result) || dplyr::n_distinct(result$label) < 2) {
311 return(result)
312 }
313
314 numeric_cols <- names(result)[vapply(result, is.numeric, logical(1))]
315 non_score_cols <- c("N", "O", "O1", "O2", "E", "w", "leftContextSize", "rightContextSize", "frequency")
316 score_cols <- setdiff(numeric_cols, non_score_cols)
317
318 if (length(score_cols) == 0) {
319 return(result)
320 }
321
Marc Kupietz9894a372025-10-18 14:51:29 +0200322 compute_score_floor <- function(values) {
323 finite_values <- values[is.finite(values)]
324 if (length(finite_values) == 0) {
325 return(0)
326 }
327
328 prob <- min(max(missingScoreQuantile, 0), 0.5)
329 q_val <- suppressWarnings(stats::quantile(finite_values,
330 probs = prob,
331 names = FALSE,
332 type = 7
333 ))
334
335 if (!is.finite(q_val)) {
336 q_val <- suppressWarnings(min(finite_values, na.rm = TRUE))
337 }
338
339 min_val <- suppressWarnings(min(finite_values, na.rm = TRUE))
340 if (!is.finite(min_val)) {
341 min_val <- 0
342 }
343
344 spread_candidates <- c(
345 suppressWarnings(stats::IQR(finite_values, na.rm = TRUE, type = 7)),
346 stats::sd(finite_values, na.rm = TRUE),
347 abs(q_val) * 0.1,
348 abs(min_val - q_val)
349 )
350 spread_candidates <- spread_candidates[is.finite(spread_candidates)]
351
352 spread <- 0
353 if (length(spread_candidates) > 0) {
354 spread <- max(spread_candidates)
355 }
356 if (!is.finite(spread) || spread == 0) {
357 spread <- max(abs(q_val), abs(min_val), 1e-06)
358 }
359
360 candidate <- q_val - spread
361 if (!is.finite(candidate)) {
362 candidate <- min_val
363 }
364
365 floor_value <- suppressWarnings(min(c(candidate, min_val), na.rm = TRUE))
366 if (!is.finite(floor_value)) {
367 floor_value <- min_val
368 }
369 if (!is.finite(floor_value)) {
370 floor_value <- 0
371 }
372
373 floor_value
374 }
375
376 score_replacements <- stats::setNames(
377 vapply(score_cols, function(col) {
378 compute_score_floor(result[[col]])
379 }, numeric(1)),
380 score_cols
381 )
382
Marc Kupietzc4540a22025-10-14 17:39:53 +0200383 comparison <- result |>
Marc Kupietz28a29842025-10-18 12:25:09 +0200384 dplyr::select(node, collocate, label, dplyr::all_of(score_cols)) |>
385 tidyr::pivot_wider(
Marc Kupietzc4540a22025-10-14 17:39:53 +0200386 names_from = label,
Marc Kupietz28a29842025-10-18 12:25:09 +0200387 values_from = dplyr::all_of(score_cols),
Marc Kupietzc4540a22025-10-14 17:39:53 +0200388 names_glue = "{.value}_{make.names(label)}",
389 values_fn = dplyr::first
390 )
391
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200392 raw_labels <- unique(result$label)
393 labels <- make.names(raw_labels)
394 label_map <- stats::setNames(raw_labels, labels)
Marc Kupietzc4540a22025-10-14 17:39:53 +0200395
Marc Kupietz28a29842025-10-18 12:25:09 +0200396 rank_data <- result |>
397 dplyr::distinct(node, collocate)
398
399 for (i in seq_along(raw_labels)) {
400 raw_lab <- raw_labels[i]
401 safe_lab <- labels[i]
402 label_df <- result[result$label == raw_lab, c("node", "collocate", score_cols), drop = FALSE]
403 if (nrow(label_df) == 0) {
404 next
405 }
406 label_df <- dplyr::distinct(label_df)
407 rank_tbl <- label_df[, c("node", "collocate"), drop = FALSE]
408 for (col in score_cols) {
409 rank_col_name <- paste0("rank_", safe_lab, "_", col)
410 values <- label_df[[col]]
411 ranks <- rep(NA_real_, length(values))
412 valid_idx <- which(!is.na(values))
413 if (length(valid_idx) > 0) {
414 ranks[valid_idx] <- rank(-values[valid_idx], ties.method = "first")
415 }
416 rank_tbl[[rank_col_name]] <- ranks
417 }
418 rank_data <- dplyr::left_join(rank_data, rank_tbl, by = c("node", "collocate"))
419 }
420
421 comparison <- dplyr::left_join(comparison, rank_data, by = c("node", "collocate"))
422
423 rank_replacements <- numeric(0)
424 rank_column_names <- grep("^rank_", names(comparison), value = TRUE)
425 if (length(rank_column_names) > 0) {
426 rank_replacements <- stats::setNames(
427 vapply(rank_column_names, function(col) {
428 col_values <- comparison[[col]]
429 valid_values <- col_values[!is.na(col_values)]
430 if (length(valid_values) == 0) {
431 nrow(comparison) + 1
432 } else {
433 suppressWarnings(max(valid_values, na.rm = TRUE)) + 1
434 }
435 }, numeric(1)),
436 rank_column_names
437 )
438 }
439
440 collapse_label_values <- function(indices, safe_labels_vec) {
441 if (length(indices) == 0) {
442 return(NA_character_)
443 }
444 labs <- label_map[safe_labels_vec[indices]]
445 fallback <- safe_labels_vec[indices]
446 labs[is.na(labs) | labs == ""] <- fallback[is.na(labs) | labs == ""]
447 labs <- labs[!is.na(labs) & labs != ""]
448 if (length(labs) == 0) {
449 return(NA_character_)
450 }
451 paste(unique(labs), collapse = ", ")
452 }
453
Marc Kupietzc4540a22025-10-14 17:39:53 +0200454 if (length(labels) == 2) {
Marc Kupietz9894a372025-10-18 14:51:29 +0200455 fill_scores <- function(x, y, measure_col) {
456 replacement <- score_replacements[[measure_col]]
457 fallback_min <- suppressWarnings(min(c(x, y), na.rm = TRUE))
458 if (!is.finite(fallback_min)) {
459 fallback_min <- 0
Marc Kupietzc4540a22025-10-14 17:39:53 +0200460 }
Marc Kupietz9894a372025-10-18 14:51:29 +0200461 if (!is.null(replacement) && is.finite(replacement)) {
462 replacement <- min(replacement, fallback_min)
463 } else {
464 replacement <- fallback_min
465 }
466 if (!is.finite(replacement)) {
467 replacement <- 0
468 }
469 if (any(is.na(x))) {
470 x[is.na(x)] <- replacement
471 }
472 if (any(is.na(y))) {
473 y[is.na(y)] <- replacement
474 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200475 list(x = x, y = y)
476 }
477
Marc Kupietz28a29842025-10-18 12:25:09 +0200478 fill_ranks <- function(x, y, left_rank_col, right_rank_col) {
479 fallback <- nrow(comparison) + 1
480 replacement_left <- rank_replacements[[left_rank_col]]
481 if (is.null(replacement_left) || !is.finite(replacement_left)) {
482 replacement_left <- fallback
Marc Kupietzc4540a22025-10-14 17:39:53 +0200483 }
Marc Kupietz28a29842025-10-18 12:25:09 +0200484 replacement_right <- rank_replacements[[right_rank_col]]
485 if (is.null(replacement_right) || !is.finite(replacement_right)) {
486 replacement_right <- fallback
487 }
488 if (any(is.na(x))) {
489 x[is.na(x)] <- replacement_left
490 }
491 if (any(is.na(y))) {
492 y[is.na(y)] <- replacement_right
493 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200494 list(x = x, y = y)
495 }
496
497 left_label <- labels[1]
498 right_label <- labels[2]
499
500 for (col in score_cols) {
501 left_col <- paste0(col, "_", left_label)
502 right_col <- paste0(col, "_", right_label)
503 if (!all(c(left_col, right_col) %in% names(comparison))) {
504 next
505 }
Marc Kupietz9894a372025-10-18 14:51:29 +0200506 filled <- fill_scores(comparison[[left_col]], comparison[[right_col]], col)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200507 comparison[[left_col]] <- filled$x
508 comparison[[right_col]] <- filled$y
Marc Kupietzc4540a22025-10-14 17:39:53 +0200509 comparison[[paste0("delta_", col)]] <- filled$x - filled$y
Marc Kupietz28a29842025-10-18 12:25:09 +0200510 rank_left <- paste0("rank_", left_label, "_", col)
511 rank_right <- paste0("rank_", right_label, "_", col)
512 if (all(c(rank_left, rank_right) %in% names(comparison))) {
513 filled_rank <- fill_ranks(
514 comparison[[rank_left]],
515 comparison[[rank_right]],
516 rank_left,
517 rank_right
518 )
519 comparison[[paste0("delta_rank_", col)]] <- filled_rank$x - filled_rank$y
520 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200521 }
522 }
523
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200524 for (col in score_cols) {
525 value_cols <- paste0(col, "_", labels)
526 existing <- value_cols %in% names(comparison)
527 if (!any(existing)) {
528 next
529 }
530 value_cols <- value_cols[existing]
531 safe_labels <- labels[existing]
532
533 score_values <- comparison[, value_cols, drop = FALSE]
534
535 winner_label_col <- paste0("winner_", col)
536 winner_value_col <- paste0("winner_", col, "_value")
537 runner_label_col <- paste0("runner_up_", col)
538 runner_value_col <- paste0("runner_up_", col, "_value")
Marc Kupietzb2862d42025-10-18 10:17:49 +0200539 loser_label_col <- paste0("loser_", col)
540 loser_value_col <- paste0("loser_", col, "_value")
541 max_delta_col <- paste0("max_delta_", col)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200542
543 if (nrow(score_values) == 0) {
544 comparison[[winner_label_col]] <- character(0)
545 comparison[[winner_value_col]] <- numeric(0)
546 comparison[[runner_label_col]] <- character(0)
547 comparison[[runner_value_col]] <- numeric(0)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200548 comparison[[loser_label_col]] <- character(0)
549 comparison[[loser_value_col]] <- numeric(0)
550 comparison[[max_delta_col]] <- numeric(0)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200551 next
552 }
553
554 score_matrix <- as.matrix(score_values)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200555 storage.mode(score_matrix) <- "numeric"
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200556
Marc Kupietzb2862d42025-10-18 10:17:49 +0200557 n_rows <- nrow(score_matrix)
558 winner_labels <- rep(NA_character_, n_rows)
559 winner_values <- rep(NA_real_, n_rows)
560 runner_labels <- rep(NA_character_, n_rows)
561 runner_values <- rep(NA_real_, n_rows)
562 loser_labels <- rep(NA_character_, n_rows)
563 loser_values <- rep(NA_real_, n_rows)
564 max_deltas <- rep(NA_real_, n_rows)
565
Marc Kupietzb2862d42025-10-18 10:17:49 +0200566 if (n_rows > 0) {
567 for (i in seq_len(n_rows)) {
568 numeric_row <- as.numeric(score_matrix[i, ])
569 if (all(is.na(numeric_row))) {
570 next
571 }
572
Marc Kupietz9894a372025-10-18 14:51:29 +0200573 replacement <- score_replacements[[col]]
574 fallback_min <- suppressWarnings(min(numeric_row, na.rm = TRUE))
575 if (!is.finite(fallback_min)) {
576 fallback_min <- 0
Marc Kupietzb2862d42025-10-18 10:17:49 +0200577 }
Marc Kupietz9894a372025-10-18 14:51:29 +0200578 if (!is.null(replacement) && is.finite(replacement)) {
579 replacement <- min(replacement, fallback_min)
580 } else {
581 replacement <- fallback_min
582 }
583 if (!is.finite(replacement)) {
584 replacement <- 0
585 }
586 if (any(is.na(numeric_row))) {
587 numeric_row[is.na(numeric_row)] <- replacement
588 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200589 score_matrix[i, ] <- numeric_row
590
591 max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
592 max_idx <- which(numeric_row == max_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200593 winner_labels[i] <- collapse_label_values(max_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200594 winner_values[i] <- max_val
595
596 unique_vals <- sort(unique(numeric_row), decreasing = TRUE)
597 if (length(unique_vals) >= 2) {
598 runner_val <- unique_vals[2]
599 runner_idx <- which(numeric_row == runner_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200600 runner_labels[i] <- collapse_label_values(runner_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200601 runner_values[i] <- runner_val
602 }
603
604 min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
605 min_idx <- which(numeric_row == min_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200606 loser_labels[i] <- collapse_label_values(min_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200607 loser_values[i] <- min_val
608
609 if (is.finite(max_val) && is.finite(min_val)) {
610 max_deltas[i] <- max_val - min_val
611 }
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200612 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200613 }
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200614
Marc Kupietzb2862d42025-10-18 10:17:49 +0200615 comparison[, value_cols] <- score_matrix
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200616 comparison[[winner_label_col]] <- winner_labels
617 comparison[[winner_value_col]] <- winner_values
618 comparison[[runner_label_col]] <- runner_labels
619 comparison[[runner_value_col]] <- runner_values
Marc Kupietzb2862d42025-10-18 10:17:49 +0200620 comparison[[loser_label_col]] <- loser_labels
621 comparison[[loser_value_col]] <- loser_values
622 comparison[[max_delta_col]] <- max_deltas
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200623 }
624
Marc Kupietz28a29842025-10-18 12:25:09 +0200625 for (col in score_cols) {
626 rank_cols <- paste0("rank_", labels, "_", col)
627 existing <- rank_cols %in% names(comparison)
628 if (!any(existing)) {
629 next
630 }
631 rank_cols <- rank_cols[existing]
632 safe_labels <- labels[existing]
633 rank_values <- comparison[, rank_cols, drop = FALSE]
634
635 winner_rank_label_col <- paste0("winner_rank_", col)
636 winner_rank_value_col <- paste0("winner_rank_", col, "_value")
637 runner_rank_label_col <- paste0("runner_up_rank_", col)
638 runner_rank_value_col <- paste0("runner_up_rank_", col, "_value")
639 loser_rank_label_col <- paste0("loser_rank_", col)
640 loser_rank_value_col <- paste0("loser_rank_", col, "_value")
641 max_delta_rank_col <- paste0("max_delta_rank_", col)
642
643 if (nrow(rank_values) == 0) {
644 comparison[[winner_rank_label_col]] <- character(0)
645 comparison[[winner_rank_value_col]] <- numeric(0)
646 comparison[[runner_rank_label_col]] <- character(0)
647 comparison[[runner_rank_value_col]] <- numeric(0)
648 comparison[[loser_rank_label_col]] <- character(0)
649 comparison[[loser_rank_value_col]] <- numeric(0)
650 comparison[[max_delta_rank_col]] <- numeric(0)
651 next
652 }
653
654 rank_matrix <- as.matrix(rank_values)
655 storage.mode(rank_matrix) <- "numeric"
656
657 n_rows <- nrow(rank_matrix)
658 winner_labels <- rep(NA_character_, n_rows)
659 winner_values <- rep(NA_real_, n_rows)
660 runner_labels <- rep(NA_character_, n_rows)
661 runner_values <- rep(NA_real_, n_rows)
662 loser_labels <- rep(NA_character_, n_rows)
663 loser_values <- rep(NA_real_, n_rows)
664 max_deltas <- rep(NA_real_, n_rows)
665
666 for (i in seq_len(n_rows)) {
667 numeric_row <- as.numeric(rank_matrix[i, ])
668 if (all(is.na(numeric_row))) {
669 next
670 }
671
672 if (length(rank_cols) > 0) {
673 replacement_vec <- rank_replacements[rank_cols]
674 replacement_vec[is.na(replacement_vec)] <- nrow(comparison) + 1
675 missing_idx <- which(is.na(numeric_row))
676 if (length(missing_idx) > 0) {
677 numeric_row[missing_idx] <- replacement_vec[missing_idx]
678 }
679 }
680
681 valid_idx <- seq_along(numeric_row)
682 valid_values <- numeric_row[valid_idx]
683 min_val <- suppressWarnings(min(valid_values, na.rm = TRUE))
684 min_positions <- valid_idx[which(valid_values == min_val)]
685 winner_labels[i] <- collapse_label_values(min_positions, safe_labels)
686 winner_values[i] <- min_val
687
688 ordered_vals <- sort(unique(valid_values), decreasing = FALSE)
689 if (length(ordered_vals) >= 2) {
690 runner_val <- ordered_vals[2]
691 runner_positions <- valid_idx[which(valid_values == runner_val)]
692 runner_labels[i] <- collapse_label_values(runner_positions, safe_labels)
693 runner_values[i] <- runner_val
694 }
695
696 max_val <- suppressWarnings(max(valid_values, na.rm = TRUE))
697 max_positions <- valid_idx[which(valid_values == max_val)]
698 loser_labels[i] <- collapse_label_values(max_positions, safe_labels)
699 loser_values[i] <- max_val
700
701 if (is.finite(max_val) && is.finite(min_val)) {
702 max_deltas[i] <- max_val - min_val
703 }
704 }
705
706 comparison[[winner_rank_label_col]] <- winner_labels
707 comparison[[winner_rank_value_col]] <- winner_values
708 comparison[[runner_rank_label_col]] <- runner_labels
709 comparison[[runner_rank_value_col]] <- runner_values
710 comparison[[loser_rank_label_col]] <- loser_labels
711 comparison[[loser_rank_value_col]] <- loser_values
712 comparison[[max_delta_rank_col]] <- max_deltas
713 }
714
Marc Kupietzc4540a22025-10-14 17:39:53 +0200715 dplyr::left_join(result, comparison, by = c("node", "collocate"))
716}
717
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200718#' @importFrom magrittr debug_pipe
Marc Kupietz2b17b212023-08-27 17:47:26 +0200719#' @importFrom stringr str_detect
720#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
721#'
722matches2FreqTable <- function(matches,
723 index = 0,
724 minOccur = 5,
725 leftContextSize = 5,
726 rightContextSize = 5,
727 ignoreCollocateCase = FALSE,
728 stopwords = c(),
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200729 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietz2b17b212023-08-27 17:47:26 +0200730 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
731 verbose = TRUE) {
732 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
733 frequency <- NULL
734
735 if (nrow(matches) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200736 dplyr::tibble(word = c(), frequency = c())
Marc Kupietz2b17b212023-08-27 17:47:26 +0200737 } else if (index == 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200738 if (!"tokens" %in% colnames(matches) || !is.list(matches$tokens)) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200739 log_info(verbose, "Outdated KorAP server: Falling back to client side tokenization.\n")
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200740 return(snippet2FreqTable(matches$snippet, minOccur, leftContextSize, rightContextSize,
741 ignoreCollocateCase = ignoreCollocateCase,
742 stopwords = stopwords, oldTable = oldTable, verbose = verbose
743 ))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200744 }
745 log_info(verbose, paste("Joining", nrow(matches), "kwics\n"))
Marc Kupietza25fbd92025-10-14 17:38:09 +0200746 for (i in seq_len(nrow(matches))) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200747 oldTable <- matches2FreqTable(
748 matches,
749 i,
750 leftContextSize = leftContextSize,
751 rightContextSize = rightContextSize,
752 collocateFilterRegex = collocateFilterRegex,
753 oldTable = oldTable,
754 stopwords = stopwords
755 )
756 }
757 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200758 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100759 group_by(word) |>
760 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200761 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +0200762 arrange(desc(frequency))
763 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200764 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietz2b17b212023-08-27 17:47:26 +0200765
766 left <- tail(unlist(matches$tokens$left[index]), leftContextSize)
767
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200768 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200769
770 right <- head(unlist(matches$tokens$right[index]), rightContextSize)
771
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200772 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200773
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200774 if (length(left) + length(right) == 0) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200775 oldTable
776 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100777 table(c(left, right)) |>
778 dplyr::as_tibble(.name_repair = "minimal") |>
779 dplyr::rename(word = 1, frequency = 2) |>
780 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200781 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +0200782 dplyr::bind_rows(oldTable)
783 }
784 }
785}
786
787#' @importFrom magrittr debug_pipe
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200788#' @importFrom stringr str_match str_split str_detect
789#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
790#'
791snippet2FreqTable <- function(snippet,
792 minOccur = 5,
793 leftContextSize = 5,
794 rightContextSize = 5,
795 ignoreCollocateCase = FALSE,
796 stopwords = c(),
797 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200798 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200799 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
800 verbose = TRUE) {
801 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
802 frequency <- NULL
803
804 if (length(snippet) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200805 dplyr::tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200806 } else if (length(snippet) > 1) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200807 log_info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200808 for (s in snippet) {
809 oldTable <- snippet2FreqTable(
810 s,
811 leftContextSize = leftContextSize,
812 rightContextSize = rightContextSize,
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100813 collocateFilterRegex = collocateFilterRegex,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200814 oldTable = oldTable,
815 stopwords = stopwords
816 )
817 }
Marc Kupietza47d1502023-04-18 15:26:47 +0200818 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200819 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100820 group_by(word) |>
821 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200822 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200823 arrange(desc(frequency))
824 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200825 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200826 match <-
827 str_match(
828 snippet,
829 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
830 )
831
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200832 left <- if (leftContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200833 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200834 } else {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200835 ""
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200836 }
837 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200838
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200839 right <- if (rightContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200840 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200841 } else {
842 ""
843 }
844 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200845
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200846 if (is.na(left[1]) || is.na(right[1]) || length(left) + length(right) == 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200847 oldTable
848 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100849 table(c(left, right)) |>
850 dplyr::as_tibble(.name_repair = "minimal") |>
851 dplyr::rename(word = 1, frequency = 2) |>
852 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200853 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200854 dplyr::bind_rows(oldTable)
855 }
856 }
857}
858
859#' Preliminary synsemantic stopwords function
860#'
861#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +0200862#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200863#'
864#' Preliminary synsemantic stopwords function to be used in collocation analysis.
865#'
866#' @details
867#' Currently only suitable for German. See stopwords package for other languages.
868#'
869#' @param ... future arguments for language detection
870#'
871#' @family collocation analysis functions
872#' @return Vector of synsemantic stopwords.
873#' @export
874synsemanticStopwords <- function(...) {
875 res <- c(
876 "der",
877 "die",
878 "und",
879 "in",
880 "den",
881 "von",
882 "mit",
883 "das",
884 "zu",
885 "im",
886 "ist",
887 "auf",
888 "sich",
889 "Die",
890 "des",
891 "dem",
892 "nicht",
893 "ein",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100894 "Ein",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200895 "eine",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100896 "Eine",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200897 "es",
898 "auch",
899 "an",
900 "als",
901 "am",
902 "aus",
903 "Der",
904 "bei",
905 "er",
906 "dass",
907 "sie",
908 "nach",
909 "um",
910 "Das",
911 "zum",
912 "noch",
913 "war",
914 "einen",
915 "einer",
916 "wie",
917 "einem",
918 "vor",
919 "bis",
920 "\u00fcber",
921 "so",
922 "aber",
923 "Eine",
924 "diese",
925 "Diese",
926 "oder"
927 )
928 return(res)
929}
930
Marc Kupietz5a336b62021-11-27 17:51:35 +0100931
Marc Kupietz76b05592021-12-19 16:26:15 +0100932# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100933findExample <-
934 function(kco,
935 query,
936 vc = "",
937 matchOnly = TRUE) {
938 out <- character(length = length(query))
939
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200940 if (length(vc) < length(query)) {
Marc Kupietz5a336b62021-11-27 17:51:35 +0100941 vc <- rep(vc, length(query))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200942 }
Marc Kupietz5a336b62021-11-27 17:51:35 +0100943
944 for (i in seq_along(query)) {
945 q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100946 if (q@totalResults > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200947 q <- fetchNext(q, maxFetch = 50, randomizePageOrder = F)
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100948 example <- as.character((q@collectedMatches)$snippet[1])
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200949 out[i] <- if (matchOnly) {
950 gsub(".*<mark>(.+)</mark>.*", "\\1", example)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100951 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200952 stringr::str_replace(example, "<[^>]*>", "")
Marc Kupietz5a336b62021-11-27 17:51:35 +0100953 }
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100954 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200955 out[i] <- ""
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100956 }
Marc Kupietz5a336b62021-11-27 17:51:35 +0100957 }
958 out
959 }
960
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200961collocatesQuery <-
962 function(kco,
963 query,
964 vc = "",
965 minOccur = 5,
966 leftContextSize = 5,
967 rightContextSize = 5,
968 searchHitsSampleLimit = 20000,
969 ignoreCollocateCase = FALSE,
970 stopwords = c(),
Marc Kupietzb2862d42025-10-18 10:17:49 +0200971 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200972 ...) {
973 frequency <- NULL
974 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200975 if (q@totalResults == 0) {
976 tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200977 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200978 q <- fetchNext(q, maxFetch = searchHitsSampleLimit, randomizePageOrder = TRUE)
979 matches2FreqTable(q@collectedMatches,
980 0,
981 minOccur = minOccur,
982 leftContextSize = leftContextSize,
983 rightContextSize = rightContextSize,
984 ignoreCollocateCase = ignoreCollocateCase,
985 stopwords = stopwords,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200986 collocateFilterRegex = collocateFilterRegex,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200987 ...,
988 verbose = kco@verbose
989 ) |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100990 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200991 filter(frequency >= minOccur)
992 }
993 }