blob: bf4318ff4fac441d48a50dd9a7bd6ae0aefb4507 [file] [log] [blame]
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001#' @include logging.R
2setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02003
4#' Collocation analysis
5#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02006#' @family collocation analysis functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +02007#' @aliases collocationAnalysis
8#'
9#' @description
Marc Kupietzdbd431a2021-08-29 12:17:45 +020010#'
11#' Performs a collocation analysis for the given node (or query)
12#' in the given virtual corpus.
13#'
14#' @details
15#' The collocation analysis is currently implemented on the client side, as some of the
16#' functionality is not yet provided by the KorAP backend. Mainly for this reason
17#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
18#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
19#' and look for expression-internal collocates using the focus function (see examples and demo).
20#'
21#' To increase speed at the cost of accuracy and possible false negatives,
22#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
23#'
Marc Kupietze7f0d682025-02-19 10:50:59 +010024#' Note that some outdated non-DeReKo back-ends might not yet support returning tokenized matches (warning issued).
25#' In this case, the client library will fall back to client-side tokenization which might be slightly less accurate.
26#' This might lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
Marc Kupietzdbd431a2021-08-29 12:17:45 +020027#' user interface.
28#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020029#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020030#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietzdbd431a2021-08-29 12:17:45 +020031#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
32#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
33#' @param searchHitsSampleLimit limit the size of the search hits sample
34#' @param stopwords vector of stopwords not to be considered as collocates
Marc Kupietz6bd9cad2024-12-18 15:57:26 +010035#' @param withinSpan KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`.
Marc Kupietzdbd431a2021-08-29 12:17:45 +020036#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
37#' @param seed seed for random page collecting order
Marc Kupietz67edcb52021-09-20 21:54:24 +020038#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
Marc Kupietz7d400e02021-12-19 16:39:36 +010039#' @param maxRecurse apply collocation analysis recursively `maxRecurse` times
40#' @param addExamples If TRUE, examples for instances of collocations will be added in a column `example`. This makes a difference in particular if `node` is given as a lemma query.
41#' @param thresholdScore association score function (see \code{\link{association-score-functions}}) to use for computing the threshold that is applied for recursive collocation analysis calls
42#' @param threshold minimum value of `thresholdScore` function call to apply collocation analysis recursively
43#' @param localStopwords vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010044#' @param collocateFilterRegex allow only collocates matching the regular expression
Marc Kupietzc4540a22025-10-14 17:39:53 +020045#' @param multiVcMissingScoreFactor factor that is multiplied with the minimum observed score when imputing missing scores for delta computations between virtual corpora
Marc Kupietze34a8be2025-10-17 20:13:42 +020046#' @param vcLabel optional label override for the current virtual corpus (used internally when named VC collections are expanded)
Marc Kupietz67edcb52021-09-20 21:54:24 +020047#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietzdbd431a2021-08-29 12:17:45 +020048#' @inheritParams collocationScoreQuery,KorAPConnection-method
49#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
50#'
Marc Kupietzc4540a22025-10-14 17:39:53 +020051#' @importFrom dplyr arrange desc slice_head bind_rows group_by mutate ungroup left_join select row_number all_of first
Marc Kupietzdbd431a2021-08-29 12:17:45 +020052#' @importFrom purrr pmap
Marc Kupietzc4540a22025-10-14 17:39:53 +020053#' @importFrom tidyr expand_grid pivot_wider
54#' @importFrom rlang sym
Marc Kupietzdbd431a2021-08-29 12:17:45 +020055#'
56#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020057#' \dontrun{
58#'
Marc Kupietz6dfeed92025-06-03 11:58:06 +020059#' # Find top collocates of "Packung" inside and outside the sports domain.
60#' KorAPConnection(verbose = TRUE) |>
61#' collocationAnalysis("Packung",
62#' vc = c("textClass=sport", "textClass!=sport"),
63#' leftContextSize = 1, rightContextSize = 1, topCollocatesLimit = 20
64#' ) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020065#' dplyr::filter(logDice >= 5)
66#' }
67#'
Marc Kupietz6ae76052021-09-21 10:34:00 +020068#' \dontrun{
69#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020070#' # Identify the most prominent light verb construction with "in ... setzen".
71#' # Note that, currently, the use of focus function disallows exactFrequencies.
Marc Kupietz4cd066d2025-02-28 15:48:23 +010072#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020073#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
Marc Kupietz6dfeed92025-06-03 11:58:06 +020074#' leftContextSize = 1, rightContextSize = 0, exactFrequencies = FALSE, topCollocatesLimit = 20
75#' )
Marc Kupietzdbd431a2021-08-29 12:17:45 +020076#' }
77#'
78#' @export
Marc Kupietz6dfeed92025-06-03 11:58:06 +020079setMethod(
80 "collocationAnalysis", "KorAPConnection",
81 function(kco,
82 node,
83 vc = "",
84 lemmatizeNodeQuery = FALSE,
85 minOccur = 5,
86 leftContextSize = 5,
87 rightContextSize = 5,
88 topCollocatesLimit = 200,
89 searchHitsSampleLimit = 20000,
90 ignoreCollocateCase = FALSE,
91 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
92 exactFrequencies = TRUE,
93 stopwords = append(RKorAPClient::synsemanticStopwords(), node),
94 seed = 7,
95 expand = length(vc) != length(node),
96 maxRecurse = 0,
97 addExamples = FALSE,
98 thresholdScore = "logDice",
99 threshold = 2.0,
100 localStopwords = c(),
101 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzc4540a22025-10-14 17:39:53 +0200102 multiVcMissingScoreFactor = 0.9,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200103 vcLabel = NA_character_,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200104 ...) {
Marc Kupietzb2862d42025-10-18 10:17:49 +0200105 word <- frequency <- O <- NULL
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200106
Marc Kupietzb2862d42025-10-18 10:17:49 +0200107 if (!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nzchar(withinSpan))) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200108 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
109 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200110
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200111 warnIfNotAuthorized(kco)
Marc Kupietz581a29b2021-09-04 20:51:04 +0200112
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200113 if (lemmatizeNodeQuery) {
114 node <- lemmatizeWordQuery(node)
115 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200116
Marc Kupietze34a8be2025-10-17 20:13:42 +0200117 vcNames <- names(vc)
Marc Kupietze34a8be2025-10-17 20:13:42 +0200118 if (is.null(vcNames)) {
119 vcNames <- rep(NA_character_, length(vc))
Marc Kupietze34a8be2025-10-17 20:13:42 +0200120 }
121
122 label_lookup <- NULL
Marc Kupietzb2862d42025-10-18 10:17:49 +0200123 if (!is.null(names(vc)) && length(vc) > 0) {
124 raw_names <- names(vc)
125 if (any(!is.na(raw_names) & raw_names != "")) {
126 label_lookup <- stats::setNames(raw_names, vc)
127 }
Marc Kupietze34a8be2025-10-17 20:13:42 +0200128 }
129
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200130 result <- if (length(node) > 1 || length(vc) > 1) {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200131 grid <- if (expand) {
Marc Kupietzb2862d42025-10-18 10:17:49 +0200132 tmp_grid <- tidyr::expand_grid(node = node, idx = seq_along(vc))
133 tmp_grid$vc <- vc[tmp_grid$idx]
134 tmp_grid$vcLabel <- vcNames[tmp_grid$idx]
135 tmp_grid[, c("node", "vc", "vcLabel"), drop = FALSE]
Marc Kupietze34a8be2025-10-17 20:13:42 +0200136 } else {
137 tibble(node = node, vc = vc, vcLabel = vcNames)
138 }
139
140 multi_result <- purrr::pmap(grid, function(node, vc, vcLabel, ...) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200141 collocationAnalysis(kco,
142 node = node,
143 vc = vc,
144 minOccur = minOccur,
145 leftContextSize = leftContextSize,
146 rightContextSize = rightContextSize,
147 topCollocatesLimit = topCollocatesLimit,
148 searchHitsSampleLimit = searchHitsSampleLimit,
149 ignoreCollocateCase = ignoreCollocateCase,
150 withinSpan = withinSpan,
151 exactFrequencies = exactFrequencies,
152 stopwords = stopwords,
153 addExamples = TRUE,
154 localStopwords = localStopwords,
155 seed = seed,
156 expand = expand,
Marc Kupietzc4540a22025-10-14 17:39:53 +0200157 multiVcMissingScoreFactor = multiVcMissingScoreFactor,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200158 collocateFilterRegex = collocateFilterRegex,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200159 vcLabel = vcLabel,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200160 ...
161 )
162 }) |>
Marc Kupietze31322e2025-10-17 18:55:36 +0200163 bind_rows()
164
165 if (!"vc" %in% names(multi_result) || nrow(multi_result) == 0) {
166 multi_result
167 } else {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200168 if (!"label" %in% names(multi_result)) {
169 multi_result$label <- NA_character_
170 }
171
172 if (!is.null(label_lookup)) {
173 override <- unname(label_lookup[multi_result$vc])
174 missing_idx <- is.na(multi_result$label) | multi_result$label == ""
175 if (any(missing_idx)) {
176 multi_result$label[missing_idx] <- override[missing_idx]
177 }
178 }
179
180 missing_idx <- is.na(multi_result$label) | multi_result$label == ""
181 if (any(missing_idx)) {
182 multi_result$label[missing_idx] <- queryStringToLabel(multi_result$vc[missing_idx])
183 }
184
Marc Kupietze31322e2025-10-17 18:55:36 +0200185 multi_result |>
Marc Kupietze31322e2025-10-17 18:55:36 +0200186 add_multi_vc_comparisons(thresholdScore = thresholdScore, missingScoreFactor = multiVcMissingScoreFactor)
187 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200188 } else {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200189 if ((is.na(vcLabel) || vcLabel == "") && length(vcNames) >= 1) {
190 vcLabel <- vcNames[1]
191 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200192
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200193 set.seed(seed)
194 candidates <- collocatesQuery(
195 kco,
196 node,
197 vc = vc,
198 minOccur = minOccur,
199 leftContextSize = leftContextSize,
200 rightContextSize = rightContextSize,
201 searchHitsSampleLimit = searchHitsSampleLimit,
202 ignoreCollocateCase = ignoreCollocateCase,
203 stopwords = append(stopwords, localStopwords),
Marc Kupietzb2862d42025-10-18 10:17:49 +0200204 collocateFilterRegex = collocateFilterRegex,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200205 ...
206 )
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200207
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200208 if (nrow(candidates) > 0) {
209 candidates <- candidates |>
210 filter(frequency >= minOccur) |>
211 slice_head(n = topCollocatesLimit)
212 collocationScoreQuery(
213 kco,
214 node = node,
215 collocate = candidates$word,
216 vc = vc,
217 leftContextSize = leftContextSize,
218 rightContextSize = rightContextSize,
219 observed = if (exactFrequencies) NA else candidates$frequency,
220 ignoreCollocateCase = ignoreCollocateCase,
221 withinSpan = withinSpan,
222 ...
223 ) |>
224 filter(O >= minOccur) |>
225 dplyr::arrange(dplyr::desc(logDice))
226 } else {
227 tibble()
228 }
229 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200230
231 if (!is.na(vcLabel) && vcLabel != "" && "label" %in% names(result)) {
232 result$label <- rep(vcLabel, nrow(result))
233 }
234
235 threshold_col <- thresholdScore
236 if (maxRecurse > 0 && nrow(result) > 0 && threshold_col %in% names(result)) {
237 threshold_values <- result[[threshold_col]]
238 eligible_idx <- which(!is.na(threshold_values) & threshold_values >= threshold)
239 if (length(eligible_idx) > 0) {
240 recurseWith <- result[eligible_idx, , drop = FALSE]
241 result <- collocationAnalysis(
242 kco,
243 node = paste0("(", buildCollocationQuery(
244 removeWithinSpan(recurseWith$node, withinSpan),
245 recurseWith$collocate,
246 leftContextSize = leftContextSize,
247 rightContextSize = rightContextSize,
248 withinSpan = ""
249 ), ")"),
250 vc = vc,
251 minOccur = minOccur,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200252 leftContextSize = leftContextSize,
253 rightContextSize = rightContextSize,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200254 withinSpan = withinSpan,
255 maxRecurse = maxRecurse - 1,
256 stopwords = stopwords,
257 localStopwords = recurseWith$collocate,
258 exactFrequencies = exactFrequencies,
259 searchHitsSampleLimit = searchHitsSampleLimit,
260 topCollocatesLimit = topCollocatesLimit,
261 addExamples = FALSE,
262 multiVcMissingScoreFactor = multiVcMissingScoreFactor,
263 collocateFilterRegex = collocateFilterRegex,
264 vcLabel = vcLabel
265 ) |>
266 bind_rows(result) |>
267 filter(logDice >= 2) |>
268 filter(O >= minOccur) |>
269 dplyr::arrange(dplyr::desc(logDice))
270 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200271 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200272
273 if (addExamples && nrow(result) > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200274 result$query <- buildCollocationQuery(
275 result$node,
276 result$collocate,
277 leftContextSize = leftContextSize,
278 rightContextSize = rightContextSize,
279 withinSpan = withinSpan
280 )
281 result$example <- findExample(
282 kco,
283 query = result$query,
284 vc = result$vc
285 )
286 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200287
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200288 result
289 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200290)
291
Marc Kupietz76b05592021-12-19 16:26:15 +0100292# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100293removeWithinSpan <- function(query, withinSpan) {
294 if (withinSpan == "") {
295 return(query)
296 }
297 needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200298 res <- gsub(needle, "\\1", query)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100299 needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200300 res <- gsub(needle, "\\1", res)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100301 return(res)
302}
303
Marc Kupietzc4540a22025-10-14 17:39:53 +0200304add_multi_vc_comparisons <- function(result, thresholdScore, missingScoreFactor) {
305 label <- node <- collocate <- rankWithinLabel <- NULL
306
307 if (!"label" %in% names(result) || dplyr::n_distinct(result$label) < 2) {
308 return(result)
309 }
310
311 numeric_cols <- names(result)[vapply(result, is.numeric, logical(1))]
312 non_score_cols <- c("N", "O", "O1", "O2", "E", "w", "leftContextSize", "rightContextSize", "frequency")
313 score_cols <- setdiff(numeric_cols, non_score_cols)
314
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200315 score_cols <- setdiff(score_cols, "rankWithinLabel")
316
Marc Kupietzc4540a22025-10-14 17:39:53 +0200317 if (length(score_cols) == 0) {
318 return(result)
319 }
320
321 ranking_col <- thresholdScore
322 if (is.null(ranking_col) || is.na(ranking_col) || !ranking_col %in% score_cols) {
323 ranking_col <- if ("logDice" %in% score_cols) "logDice" else score_cols[1]
324 }
325
326 ranking_sym <- rlang::sym(ranking_col)
327
328 result <- result |>
329 dplyr::group_by(label) |>
330 dplyr::mutate(rankWithinLabel = dplyr::row_number(dplyr::desc(!!ranking_sym))) |>
331 dplyr::ungroup()
332
333 comparison <- result |>
334 dplyr::select(node, collocate, label, rankWithinLabel, dplyr::all_of(score_cols)) |>
335 pivot_wider(
336 names_from = label,
337 values_from = c(rankWithinLabel, dplyr::all_of(score_cols)),
338 names_glue = "{.value}_{make.names(label)}",
339 values_fn = dplyr::first
340 )
341
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200342 raw_labels <- unique(result$label)
343 labels <- make.names(raw_labels)
344 label_map <- stats::setNames(raw_labels, labels)
Marc Kupietzc4540a22025-10-14 17:39:53 +0200345
346 if (length(labels) == 2) {
347 fill_scores <- function(x, y) {
348 min_val <- suppressWarnings(min(c(x, y), na.rm = TRUE))
349 if (!is.finite(min_val)) {
350 min_val <- 0
351 }
352 x[is.na(x)] <- missingScoreFactor * min_val
353 y[is.na(y)] <- missingScoreFactor * min_val
354 list(x = x, y = y)
355 }
356
357 fill_ranks <- function(x, y) {
358 max_val <- suppressWarnings(max(c(x, y), na.rm = TRUE))
359 if (!is.finite(max_val)) {
360 max_val <- 0
361 }
362 x[is.na(x)] <- max_val + 1
363 y[is.na(y)] <- max_val + 1
364 list(x = x, y = y)
365 }
366
367 left_label <- labels[1]
368 right_label <- labels[2]
369
370 for (col in score_cols) {
371 left_col <- paste0(col, "_", left_label)
372 right_col <- paste0(col, "_", right_label)
373 if (!all(c(left_col, right_col) %in% names(comparison))) {
374 next
375 }
376 filled <- fill_scores(comparison[[left_col]], comparison[[right_col]])
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200377 comparison[[left_col]] <- filled$x
378 comparison[[right_col]] <- filled$y
Marc Kupietzc4540a22025-10-14 17:39:53 +0200379 comparison[[paste0("delta_", col)]] <- filled$x - filled$y
380 }
381
382 left_rank <- paste0("rankWithinLabel_", left_label)
383 right_rank <- paste0("rankWithinLabel_", right_label)
384 if (all(c(left_rank, right_rank) %in% names(comparison))) {
385 filled_rank <- fill_ranks(comparison[[left_rank]], comparison[[right_rank]])
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200386 comparison[[left_rank]] <- filled_rank$x
387 comparison[[right_rank]] <- filled_rank$y
Marc Kupietzc4540a22025-10-14 17:39:53 +0200388 comparison[["delta_rankWithinLabel"]] <- filled_rank$x - filled_rank$y
389 }
390 }
391
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200392 for (col in score_cols) {
393 value_cols <- paste0(col, "_", labels)
394 existing <- value_cols %in% names(comparison)
395 if (!any(existing)) {
396 next
397 }
398 value_cols <- value_cols[existing]
399 safe_labels <- labels[existing]
400
401 score_values <- comparison[, value_cols, drop = FALSE]
402
403 winner_label_col <- paste0("winner_", col)
404 winner_value_col <- paste0("winner_", col, "_value")
405 runner_label_col <- paste0("runner_up_", col)
406 runner_value_col <- paste0("runner_up_", col, "_value")
Marc Kupietzb2862d42025-10-18 10:17:49 +0200407 loser_label_col <- paste0("loser_", col)
408 loser_value_col <- paste0("loser_", col, "_value")
409 max_delta_col <- paste0("max_delta_", col)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200410
411 if (nrow(score_values) == 0) {
412 comparison[[winner_label_col]] <- character(0)
413 comparison[[winner_value_col]] <- numeric(0)
414 comparison[[runner_label_col]] <- character(0)
415 comparison[[runner_value_col]] <- numeric(0)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200416 comparison[[loser_label_col]] <- character(0)
417 comparison[[loser_value_col]] <- numeric(0)
418 comparison[[max_delta_col]] <- numeric(0)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200419 next
420 }
421
422 score_matrix <- as.matrix(score_values)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200423 storage.mode(score_matrix) <- "numeric"
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200424
Marc Kupietzb2862d42025-10-18 10:17:49 +0200425 n_rows <- nrow(score_matrix)
426 winner_labels <- rep(NA_character_, n_rows)
427 winner_values <- rep(NA_real_, n_rows)
428 runner_labels <- rep(NA_character_, n_rows)
429 runner_values <- rep(NA_real_, n_rows)
430 loser_labels <- rep(NA_character_, n_rows)
431 loser_values <- rep(NA_real_, n_rows)
432 max_deltas <- rep(NA_real_, n_rows)
433
434 collapse_labels <- function(indices) {
435 if (length(indices) == 0) {
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200436 return(NA_character_)
437 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200438 labs <- label_map[safe_labels[indices]]
439 fallback <- safe_labels[indices]
440 labs[is.na(labs) | labs == ""] <- fallback[is.na(labs) | labs == ""]
441 labs <- labs[!is.na(labs) & labs != ""]
442 if (length(labs) == 0) {
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200443 return(NA_character_)
444 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200445 paste(unique(labs), collapse = ", ")
446 }
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200447
Marc Kupietzb2862d42025-10-18 10:17:49 +0200448 if (n_rows > 0) {
449 for (i in seq_len(n_rows)) {
450 numeric_row <- as.numeric(score_matrix[i, ])
451 if (all(is.na(numeric_row))) {
452 next
453 }
454
455 min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
456 if (!is.finite(min_val)) {
457 min_val <- 0
458 }
459 numeric_row[is.na(numeric_row)] <- missingScoreFactor * min_val
460 score_matrix[i, ] <- numeric_row
461
462 max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
463 max_idx <- which(numeric_row == max_val)
464 winner_labels[i] <- collapse_labels(max_idx)
465 winner_values[i] <- max_val
466
467 unique_vals <- sort(unique(numeric_row), decreasing = TRUE)
468 if (length(unique_vals) >= 2) {
469 runner_val <- unique_vals[2]
470 runner_idx <- which(numeric_row == runner_val)
471 runner_labels[i] <- collapse_labels(runner_idx)
472 runner_values[i] <- runner_val
473 }
474
475 min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
476 min_idx <- which(numeric_row == min_val)
477 loser_labels[i] <- collapse_labels(min_idx)
478 loser_values[i] <- min_val
479
480 if (is.finite(max_val) && is.finite(min_val)) {
481 max_deltas[i] <- max_val - min_val
482 }
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200483 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200484 }
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200485
Marc Kupietzb2862d42025-10-18 10:17:49 +0200486 comparison[, value_cols] <- score_matrix
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200487 comparison[[winner_label_col]] <- winner_labels
488 comparison[[winner_value_col]] <- winner_values
489 comparison[[runner_label_col]] <- runner_labels
490 comparison[[runner_value_col]] <- runner_values
Marc Kupietzb2862d42025-10-18 10:17:49 +0200491 comparison[[loser_label_col]] <- loser_labels
492 comparison[[loser_value_col]] <- loser_values
493 comparison[[max_delta_col]] <- max_deltas
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200494 }
495
Marc Kupietzc4540a22025-10-14 17:39:53 +0200496 dplyr::left_join(result, comparison, by = c("node", "collocate"))
497}
498
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200499#' @importFrom magrittr debug_pipe
Marc Kupietz2b17b212023-08-27 17:47:26 +0200500#' @importFrom stringr str_detect
501#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
502#'
503matches2FreqTable <- function(matches,
504 index = 0,
505 minOccur = 5,
506 leftContextSize = 5,
507 rightContextSize = 5,
508 ignoreCollocateCase = FALSE,
509 stopwords = c(),
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200510 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietz2b17b212023-08-27 17:47:26 +0200511 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
512 verbose = TRUE) {
513 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
514 frequency <- NULL
515
516 if (nrow(matches) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200517 dplyr::tibble(word = c(), frequency = c())
Marc Kupietz2b17b212023-08-27 17:47:26 +0200518 } else if (index == 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200519 if (!"tokens" %in% colnames(matches) || !is.list(matches$tokens)) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200520 log_info(verbose, "Outdated KorAP server: Falling back to client side tokenization.\n")
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200521 return(snippet2FreqTable(matches$snippet, minOccur, leftContextSize, rightContextSize,
522 ignoreCollocateCase = ignoreCollocateCase,
523 stopwords = stopwords, oldTable = oldTable, verbose = verbose
524 ))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200525 }
526 log_info(verbose, paste("Joining", nrow(matches), "kwics\n"))
Marc Kupietza25fbd92025-10-14 17:38:09 +0200527 for (i in seq_len(nrow(matches))) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200528 oldTable <- matches2FreqTable(
529 matches,
530 i,
531 leftContextSize = leftContextSize,
532 rightContextSize = rightContextSize,
533 collocateFilterRegex = collocateFilterRegex,
534 oldTable = oldTable,
535 stopwords = stopwords
536 )
537 }
538 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200539 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100540 group_by(word) |>
541 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200542 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +0200543 arrange(desc(frequency))
544 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200545 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietz2b17b212023-08-27 17:47:26 +0200546
547 left <- tail(unlist(matches$tokens$left[index]), leftContextSize)
548
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200549 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200550
551 right <- head(unlist(matches$tokens$right[index]), rightContextSize)
552
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200553 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200554
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200555 if (length(left) + length(right) == 0) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200556 oldTable
557 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100558 table(c(left, right)) |>
559 dplyr::as_tibble(.name_repair = "minimal") |>
560 dplyr::rename(word = 1, frequency = 2) |>
561 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200562 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +0200563 dplyr::bind_rows(oldTable)
564 }
565 }
566}
567
568#' @importFrom magrittr debug_pipe
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200569#' @importFrom stringr str_match str_split str_detect
570#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
571#'
572snippet2FreqTable <- function(snippet,
573 minOccur = 5,
574 leftContextSize = 5,
575 rightContextSize = 5,
576 ignoreCollocateCase = FALSE,
577 stopwords = c(),
578 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200579 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200580 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
581 verbose = TRUE) {
582 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
583 frequency <- NULL
584
585 if (length(snippet) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200586 dplyr::tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200587 } else if (length(snippet) > 1) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200588 log_info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200589 for (s in snippet) {
590 oldTable <- snippet2FreqTable(
591 s,
592 leftContextSize = leftContextSize,
593 rightContextSize = rightContextSize,
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100594 collocateFilterRegex = collocateFilterRegex,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200595 oldTable = oldTable,
596 stopwords = stopwords
597 )
598 }
Marc Kupietza47d1502023-04-18 15:26:47 +0200599 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200600 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100601 group_by(word) |>
602 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200603 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200604 arrange(desc(frequency))
605 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200606 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200607 match <-
608 str_match(
609 snippet,
610 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
611 )
612
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200613 left <- if (leftContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200614 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200615 } else {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200616 ""
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200617 }
618 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200619
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200620 right <- if (rightContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200621 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200622 } else {
623 ""
624 }
625 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200626
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200627 if (is.na(left[1]) || is.na(right[1]) || length(left) + length(right) == 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200628 oldTable
629 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100630 table(c(left, right)) |>
631 dplyr::as_tibble(.name_repair = "minimal") |>
632 dplyr::rename(word = 1, frequency = 2) |>
633 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200634 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200635 dplyr::bind_rows(oldTable)
636 }
637 }
638}
639
640#' Preliminary synsemantic stopwords function
641#'
642#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +0200643#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200644#'
645#' Preliminary synsemantic stopwords function to be used in collocation analysis.
646#'
647#' @details
648#' Currently only suitable for German. See stopwords package for other languages.
649#'
650#' @param ... future arguments for language detection
651#'
652#' @family collocation analysis functions
653#' @return Vector of synsemantic stopwords.
654#' @export
655synsemanticStopwords <- function(...) {
656 res <- c(
657 "der",
658 "die",
659 "und",
660 "in",
661 "den",
662 "von",
663 "mit",
664 "das",
665 "zu",
666 "im",
667 "ist",
668 "auf",
669 "sich",
670 "Die",
671 "des",
672 "dem",
673 "nicht",
674 "ein",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100675 "Ein",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200676 "eine",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100677 "Eine",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200678 "es",
679 "auch",
680 "an",
681 "als",
682 "am",
683 "aus",
684 "Der",
685 "bei",
686 "er",
687 "dass",
688 "sie",
689 "nach",
690 "um",
691 "Das",
692 "zum",
693 "noch",
694 "war",
695 "einen",
696 "einer",
697 "wie",
698 "einem",
699 "vor",
700 "bis",
701 "\u00fcber",
702 "so",
703 "aber",
704 "Eine",
705 "diese",
706 "Diese",
707 "oder"
708 )
709 return(res)
710}
711
Marc Kupietz5a336b62021-11-27 17:51:35 +0100712
Marc Kupietz76b05592021-12-19 16:26:15 +0100713# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100714findExample <-
715 function(kco,
716 query,
717 vc = "",
718 matchOnly = TRUE) {
719 out <- character(length = length(query))
720
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200721 if (length(vc) < length(query)) {
Marc Kupietz5a336b62021-11-27 17:51:35 +0100722 vc <- rep(vc, length(query))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200723 }
Marc Kupietz5a336b62021-11-27 17:51:35 +0100724
725 for (i in seq_along(query)) {
726 q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100727 if (q@totalResults > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200728 q <- fetchNext(q, maxFetch = 50, randomizePageOrder = F)
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100729 example <- as.character((q@collectedMatches)$snippet[1])
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200730 out[i] <- if (matchOnly) {
731 gsub(".*<mark>(.+)</mark>.*", "\\1", example)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100732 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200733 stringr::str_replace(example, "<[^>]*>", "")
Marc Kupietz5a336b62021-11-27 17:51:35 +0100734 }
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100735 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200736 out[i] <- ""
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100737 }
Marc Kupietz5a336b62021-11-27 17:51:35 +0100738 }
739 out
740 }
741
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200742collocatesQuery <-
743 function(kco,
744 query,
745 vc = "",
746 minOccur = 5,
747 leftContextSize = 5,
748 rightContextSize = 5,
749 searchHitsSampleLimit = 20000,
750 ignoreCollocateCase = FALSE,
751 stopwords = c(),
Marc Kupietzb2862d42025-10-18 10:17:49 +0200752 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200753 ...) {
754 frequency <- NULL
755 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200756 if (q@totalResults == 0) {
757 tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200758 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200759 q <- fetchNext(q, maxFetch = searchHitsSampleLimit, randomizePageOrder = TRUE)
760 matches2FreqTable(q@collectedMatches,
761 0,
762 minOccur = minOccur,
763 leftContextSize = leftContextSize,
764 rightContextSize = rightContextSize,
765 ignoreCollocateCase = ignoreCollocateCase,
766 stopwords = stopwords,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200767 collocateFilterRegex = collocateFilterRegex,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200768 ...,
769 verbose = kco@verbose
770 ) |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100771 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200772 filter(frequency >= minOccur)
773 }
774 }