blob: 56d7923e6667310fdbcd5daeca379ec15dc9c8b5 [file] [log] [blame]
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001#' @include logging.R
2setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02003
4#' Collocation analysis
5#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02006#' @family collocation analysis functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +02007#' @aliases collocationAnalysis
8#'
9#' @description
Marc Kupietzdbd431a2021-08-29 12:17:45 +020010#'
11#' Performs a collocation analysis for the given node (or query)
12#' in the given virtual corpus.
13#'
14#' @details
15#' The collocation analysis is currently implemented on the client side, as some of the
16#' functionality is not yet provided by the KorAP backend. Mainly for this reason
17#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
18#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
19#' and look for expression-internal collocates using the focus function (see examples and demo).
20#'
21#' To increase speed at the cost of accuracy and possible false negatives,
22#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
23#'
Marc Kupietze7f0d682025-02-19 10:50:59 +010024#' Note that some outdated non-DeReKo back-ends might not yet support returning tokenized matches (warning issued).
25#' In this case, the client library will fall back to client-side tokenization which might be slightly less accurate.
26#' This might lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
Marc Kupietzdbd431a2021-08-29 12:17:45 +020027#' user interface.
28#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020029#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020030#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietzdbd431a2021-08-29 12:17:45 +020031#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
32#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
33#' @param searchHitsSampleLimit limit the size of the search hits sample
34#' @param stopwords vector of stopwords not to be considered as collocates
Marc Kupietz6bd9cad2024-12-18 15:57:26 +010035#' @param withinSpan KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`.
Marc Kupietzdbd431a2021-08-29 12:17:45 +020036#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
37#' @param seed seed for random page collecting order
Marc Kupietz67edcb52021-09-20 21:54:24 +020038#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
Marc Kupietz7d400e02021-12-19 16:39:36 +010039#' @param maxRecurse apply collocation analysis recursively `maxRecurse` times
40#' @param addExamples If TRUE, examples for instances of collocations will be added in a column `example`. This makes a difference in particular if `node` is given as a lemma query.
41#' @param thresholdScore association score function (see \code{\link{association-score-functions}}) to use for computing the threshold that is applied for recursive collocation analysis calls
42#' @param threshold minimum value of `thresholdScore` function call to apply collocation analysis recursively
43#' @param localStopwords vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010044#' @param collocateFilterRegex allow only collocates matching the regular expression
Marc Kupietzc4540a22025-10-14 17:39:53 +020045#' @param multiVcMissingScoreFactor factor that is multiplied with the minimum observed score when imputing missing scores for delta computations between virtual corpora
Marc Kupietze34a8be2025-10-17 20:13:42 +020046#' @param vcLabel optional label override for the current virtual corpus (used internally when named VC collections are expanded)
Marc Kupietz67edcb52021-09-20 21:54:24 +020047#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietzdbd431a2021-08-29 12:17:45 +020048#' @inheritParams collocationScoreQuery,KorAPConnection-method
49#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
50#'
Marc Kupietzc4540a22025-10-14 17:39:53 +020051#' @importFrom dplyr arrange desc slice_head bind_rows group_by mutate ungroup left_join select row_number all_of first
Marc Kupietzdbd431a2021-08-29 12:17:45 +020052#' @importFrom purrr pmap
Marc Kupietzc4540a22025-10-14 17:39:53 +020053#' @importFrom tidyr expand_grid pivot_wider
54#' @importFrom rlang sym
Marc Kupietzdbd431a2021-08-29 12:17:45 +020055#'
56#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020057#' \dontrun{
58#'
Marc Kupietz6dfeed92025-06-03 11:58:06 +020059#' # Find top collocates of "Packung" inside and outside the sports domain.
60#' KorAPConnection(verbose = TRUE) |>
61#' collocationAnalysis("Packung",
62#' vc = c("textClass=sport", "textClass!=sport"),
63#' leftContextSize = 1, rightContextSize = 1, topCollocatesLimit = 20
64#' ) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020065#' dplyr::filter(logDice >= 5)
66#' }
67#'
Marc Kupietz6ae76052021-09-21 10:34:00 +020068#' \dontrun{
69#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020070#' # Identify the most prominent light verb construction with "in ... setzen".
71#' # Note that, currently, the use of focus function disallows exactFrequencies.
Marc Kupietz4cd066d2025-02-28 15:48:23 +010072#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020073#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
Marc Kupietz6dfeed92025-06-03 11:58:06 +020074#' leftContextSize = 1, rightContextSize = 0, exactFrequencies = FALSE, topCollocatesLimit = 20
75#' )
Marc Kupietzdbd431a2021-08-29 12:17:45 +020076#' }
77#'
78#' @export
Marc Kupietz6dfeed92025-06-03 11:58:06 +020079setMethod(
80 "collocationAnalysis", "KorAPConnection",
81 function(kco,
82 node,
83 vc = "",
84 lemmatizeNodeQuery = FALSE,
85 minOccur = 5,
86 leftContextSize = 5,
87 rightContextSize = 5,
88 topCollocatesLimit = 200,
89 searchHitsSampleLimit = 20000,
90 ignoreCollocateCase = FALSE,
91 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
92 exactFrequencies = TRUE,
93 stopwords = append(RKorAPClient::synsemanticStopwords(), node),
94 seed = 7,
95 expand = length(vc) != length(node),
96 maxRecurse = 0,
97 addExamples = FALSE,
98 thresholdScore = "logDice",
99 threshold = 2.0,
100 localStopwords = c(),
101 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzc4540a22025-10-14 17:39:53 +0200102 multiVcMissingScoreFactor = 0.9,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200103 vcLabel = NA_character_,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200104 ...) {
105 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietze34a8be2025-10-17 20:13:42 +0200106 word <- frequency <- O <- NULL
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200107
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200108 if (!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan) > 0)) {
109 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
110 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200111
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200112 warnIfNotAuthorized(kco)
Marc Kupietz581a29b2021-09-04 20:51:04 +0200113
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200114 if (lemmatizeNodeQuery) {
115 node <- lemmatizeWordQuery(node)
116 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200117
Marc Kupietze34a8be2025-10-17 20:13:42 +0200118 vcNames <- names(vc)
119 vc <- unname(unlist(vc, use.names = FALSE))
120 if (is.null(vcNames)) {
121 vcNames <- rep(NA_character_, length(vc))
122 } else {
123 vcNames[vcNames == ""] <- NA_character_
124 if (length(vcNames) < length(vc)) {
125 vcNames <- rep(vcNames, length.out = length(vc))
126 }
127 }
128
129 label_lookup <- NULL
130 if (length(vc) > 0 && any(!is.na(vcNames))) {
131 valid_lookup <- !is.na(vcNames)
132 label_lookup <- vcNames[valid_lookup]
133 names(label_lookup) <- vc[valid_lookup]
134 }
135
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200136 result <- if (length(node) > 1 || length(vc) > 1) {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200137 grid <- if (expand) {
138 tmp_grid <- expand_grid(node = node, idx = seq_along(vc))
139 tmp_grid$vc <- vc[tmp_grid$idx]
140 tmp_grid$vcLabel <- vcNames[tmp_grid$idx]
141 tmp_grid[, setdiff(names(tmp_grid), "idx"), drop = FALSE]
142 } else {
143 tibble(node = node, vc = vc, vcLabel = vcNames)
144 }
145
146 multi_result <- purrr::pmap(grid, function(node, vc, vcLabel, ...) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200147 collocationAnalysis(kco,
148 node = node,
149 vc = vc,
150 minOccur = minOccur,
151 leftContextSize = leftContextSize,
152 rightContextSize = rightContextSize,
153 topCollocatesLimit = topCollocatesLimit,
154 searchHitsSampleLimit = searchHitsSampleLimit,
155 ignoreCollocateCase = ignoreCollocateCase,
156 withinSpan = withinSpan,
157 exactFrequencies = exactFrequencies,
158 stopwords = stopwords,
159 addExamples = TRUE,
160 localStopwords = localStopwords,
161 seed = seed,
162 expand = expand,
Marc Kupietzc4540a22025-10-14 17:39:53 +0200163 multiVcMissingScoreFactor = multiVcMissingScoreFactor,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200164 vcLabel = vcLabel,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200165 ...
166 )
167 }) |>
Marc Kupietze31322e2025-10-17 18:55:36 +0200168 bind_rows()
169
170 if (!"vc" %in% names(multi_result) || nrow(multi_result) == 0) {
171 multi_result
172 } else {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200173 if (!"label" %in% names(multi_result)) {
174 multi_result$label <- NA_character_
175 }
176
177 if (!is.null(label_lookup)) {
178 override <- unname(label_lookup[multi_result$vc])
179 missing_idx <- is.na(multi_result$label) | multi_result$label == ""
180 if (any(missing_idx)) {
181 multi_result$label[missing_idx] <- override[missing_idx]
182 }
183 }
184
185 missing_idx <- is.na(multi_result$label) | multi_result$label == ""
186 if (any(missing_idx)) {
187 multi_result$label[missing_idx] <- queryStringToLabel(multi_result$vc[missing_idx])
188 }
189
Marc Kupietze31322e2025-10-17 18:55:36 +0200190 multi_result |>
Marc Kupietze31322e2025-10-17 18:55:36 +0200191 add_multi_vc_comparisons(thresholdScore = thresholdScore, missingScoreFactor = multiVcMissingScoreFactor)
192 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200193 } else {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200194 if ((is.na(vcLabel) || vcLabel == "") && length(vcNames) >= 1) {
195 vcLabel <- vcNames[1]
196 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200197 set.seed(seed)
198 candidates <- collocatesQuery(
199 kco,
200 node,
201 vc = vc,
202 minOccur = minOccur,
203 leftContextSize = leftContextSize,
204 rightContextSize = rightContextSize,
205 searchHitsSampleLimit = searchHitsSampleLimit,
206 ignoreCollocateCase = ignoreCollocateCase,
207 stopwords = append(stopwords, localStopwords),
208 ...
209 )
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200210
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200211 if (nrow(candidates) > 0) {
212 candidates <- candidates |>
213 filter(frequency >= minOccur) |>
214 slice_head(n = topCollocatesLimit)
215 collocationScoreQuery(
216 kco,
217 node = node,
218 collocate = candidates$word,
219 vc = vc,
220 leftContextSize = leftContextSize,
221 rightContextSize = rightContextSize,
222 observed = if (exactFrequencies) NA else candidates$frequency,
223 ignoreCollocateCase = ignoreCollocateCase,
224 withinSpan = withinSpan,
225 ...
226 ) |>
227 filter(O >= minOccur) |>
228 dplyr::arrange(dplyr::desc(logDice))
229 } else {
230 tibble()
231 }
232 }
Marc Kupietze34a8be2025-10-17 20:13:42 +0200233 if (!is.na(vcLabel) && vcLabel != "" && "label" %in% names(result)) {
234 result$label <- rep(vcLabel, nrow(result))
235 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200236 if (maxRecurse > 0 & length(result) > 0 && any(!!thresholdScore >= threshold)) {
237 recurseWith <- result |>
238 filter(!!as.name(thresholdScore) >= threshold)
239 result <- collocationAnalysis(
240 kco,
241 node = paste0("(", buildCollocationQuery(
242 removeWithinSpan(recurseWith$node, withinSpan),
243 recurseWith$collocate,
244 leftContextSize = leftContextSize,
245 rightContextSize = rightContextSize,
246 withinSpan = ""
247 ), ")"),
248 vc = vc,
249 minOccur = minOccur,
250 leftContextSize = leftContextSize,
251 rightContextSize = rightContextSize,
252 withinSpan = withinSpan,
253 maxRecurse = maxRecurse - 1,
254 stopwords = stopwords,
255 localStopwords = recurseWith$collocate,
256 exactFrequencies = exactFrequencies,
257 searchHitsSampleLimit = searchHitsSampleLimit,
258 topCollocatesLimit = topCollocatesLimit,
Marc Kupietzc4540a22025-10-14 17:39:53 +0200259 addExamples = FALSE,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200260 multiVcMissingScoreFactor = multiVcMissingScoreFactor,
261 vcLabel = vcLabel
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200262 ) |>
263 bind_rows(result) |>
264 filter(logDice >= 2) |>
Marc Kupietze9e18bd2025-06-04 17:15:02 +0200265 filter(O >= minOccur) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200266 dplyr::arrange(dplyr::desc(logDice))
267 }
268 if (addExamples && length(result) > 0) {
269 result$query <- buildCollocationQuery(
270 result$node,
271 result$collocate,
272 leftContextSize = leftContextSize,
273 rightContextSize = rightContextSize,
274 withinSpan = withinSpan
275 )
276 result$example <- findExample(
277 kco,
278 query = result$query,
279 vc = result$vc
280 )
281 }
282 result
283 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200284)
285
Marc Kupietz76b05592021-12-19 16:26:15 +0100286# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100287removeWithinSpan <- function(query, withinSpan) {
288 if (withinSpan == "") {
289 return(query)
290 }
291 needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200292 res <- gsub(needle, "\\1", query)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100293 needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200294 res <- gsub(needle, "\\1", res)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100295 return(res)
296}
297
Marc Kupietzc4540a22025-10-14 17:39:53 +0200298add_multi_vc_comparisons <- function(result, thresholdScore, missingScoreFactor) {
299 label <- node <- collocate <- rankWithinLabel <- NULL
300
301 if (!"label" %in% names(result) || dplyr::n_distinct(result$label) < 2) {
302 return(result)
303 }
304
305 numeric_cols <- names(result)[vapply(result, is.numeric, logical(1))]
306 non_score_cols <- c("N", "O", "O1", "O2", "E", "w", "leftContextSize", "rightContextSize", "frequency")
307 score_cols <- setdiff(numeric_cols, non_score_cols)
308
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200309 score_cols <- setdiff(score_cols, "rankWithinLabel")
310
Marc Kupietzc4540a22025-10-14 17:39:53 +0200311 if (length(score_cols) == 0) {
312 return(result)
313 }
314
315 ranking_col <- thresholdScore
316 if (is.null(ranking_col) || is.na(ranking_col) || !ranking_col %in% score_cols) {
317 ranking_col <- if ("logDice" %in% score_cols) "logDice" else score_cols[1]
318 }
319
320 ranking_sym <- rlang::sym(ranking_col)
321
322 result <- result |>
323 dplyr::group_by(label) |>
324 dplyr::mutate(rankWithinLabel = dplyr::row_number(dplyr::desc(!!ranking_sym))) |>
325 dplyr::ungroup()
326
327 comparison <- result |>
328 dplyr::select(node, collocate, label, rankWithinLabel, dplyr::all_of(score_cols)) |>
329 pivot_wider(
330 names_from = label,
331 values_from = c(rankWithinLabel, dplyr::all_of(score_cols)),
332 names_glue = "{.value}_{make.names(label)}",
333 values_fn = dplyr::first
334 )
335
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200336 raw_labels <- unique(result$label)
337 labels <- make.names(raw_labels)
338 label_map <- stats::setNames(raw_labels, labels)
Marc Kupietzc4540a22025-10-14 17:39:53 +0200339
340 if (length(labels) == 2) {
341 fill_scores <- function(x, y) {
342 min_val <- suppressWarnings(min(c(x, y), na.rm = TRUE))
343 if (!is.finite(min_val)) {
344 min_val <- 0
345 }
346 x[is.na(x)] <- missingScoreFactor * min_val
347 y[is.na(y)] <- missingScoreFactor * min_val
348 list(x = x, y = y)
349 }
350
351 fill_ranks <- function(x, y) {
352 max_val <- suppressWarnings(max(c(x, y), na.rm = TRUE))
353 if (!is.finite(max_val)) {
354 max_val <- 0
355 }
356 x[is.na(x)] <- max_val + 1
357 y[is.na(y)] <- max_val + 1
358 list(x = x, y = y)
359 }
360
361 left_label <- labels[1]
362 right_label <- labels[2]
363
364 for (col in score_cols) {
365 left_col <- paste0(col, "_", left_label)
366 right_col <- paste0(col, "_", right_label)
367 if (!all(c(left_col, right_col) %in% names(comparison))) {
368 next
369 }
370 filled <- fill_scores(comparison[[left_col]], comparison[[right_col]])
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200371 comparison[[left_col]] <- filled$x
372 comparison[[right_col]] <- filled$y
Marc Kupietzc4540a22025-10-14 17:39:53 +0200373 comparison[[paste0("delta_", col)]] <- filled$x - filled$y
374 }
375
376 left_rank <- paste0("rankWithinLabel_", left_label)
377 right_rank <- paste0("rankWithinLabel_", right_label)
378 if (all(c(left_rank, right_rank) %in% names(comparison))) {
379 filled_rank <- fill_ranks(comparison[[left_rank]], comparison[[right_rank]])
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200380 comparison[[left_rank]] <- filled_rank$x
381 comparison[[right_rank]] <- filled_rank$y
Marc Kupietzc4540a22025-10-14 17:39:53 +0200382 comparison[["delta_rankWithinLabel"]] <- filled_rank$x - filled_rank$y
383 }
384 }
385
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200386 for (col in score_cols) {
387 value_cols <- paste0(col, "_", labels)
388 existing <- value_cols %in% names(comparison)
389 if (!any(existing)) {
390 next
391 }
392 value_cols <- value_cols[existing]
393 safe_labels <- labels[existing]
394
395 score_values <- comparison[, value_cols, drop = FALSE]
396
397 winner_label_col <- paste0("winner_", col)
398 winner_value_col <- paste0("winner_", col, "_value")
399 runner_label_col <- paste0("runner_up_", col)
400 runner_value_col <- paste0("runner_up_", col, "_value")
401
402 if (nrow(score_values) == 0) {
403 comparison[[winner_label_col]] <- character(0)
404 comparison[[winner_value_col]] <- numeric(0)
405 comparison[[runner_label_col]] <- character(0)
406 comparison[[runner_value_col]] <- numeric(0)
407 next
408 }
409
410 score_matrix <- as.matrix(score_values)
411
412 winner_labels <- apply(score_matrix, 1, function(row) {
413 row <- as.numeric(row)
414 valid <- which(!is.na(row))
415 if (length(valid) == 0) {
416 return(NA_character_)
417 }
418 ord <- valid[order(row[valid], decreasing = TRUE)]
419 unname(label_map[safe_labels[ord[1]]])
420 })
421 winner_labels <- unname(as.character(winner_labels))
422
423 winner_values <- apply(score_matrix, 1, function(row) {
424 row <- as.numeric(row)
425 if (all(is.na(row))) {
426 return(NA_real_)
427 }
428 max(row, na.rm = TRUE)
429 })
430 winner_values <- unname(as.numeric(winner_values))
431
432 runner_labels <- apply(score_matrix, 1, function(row) {
433 row <- as.numeric(row)
434 valid <- which(!is.na(row))
435 if (length(valid) < 2) {
436 return(NA_character_)
437 }
438 ord <- valid[order(row[valid], decreasing = TRUE)]
439 unname(label_map[safe_labels[ord[2]]])
440 })
441 runner_labels <- unname(as.character(runner_labels))
442
443 runner_values <- apply(score_matrix, 1, function(row) {
444 row <- as.numeric(row)
445 valid <- which(!is.na(row))
446 if (length(valid) < 2) {
447 return(NA_real_)
448 }
449 ord <- valid[order(row[valid], decreasing = TRUE)]
450 row[ord[2]]
451 })
452 runner_values <- unname(as.numeric(runner_values))
453
454 comparison[[winner_label_col]] <- winner_labels
455 comparison[[winner_value_col]] <- winner_values
456 comparison[[runner_label_col]] <- runner_labels
457 comparison[[runner_value_col]] <- runner_values
458 }
459
Marc Kupietzc4540a22025-10-14 17:39:53 +0200460 dplyr::left_join(result, comparison, by = c("node", "collocate"))
461}
462
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200463#' @importFrom magrittr debug_pipe
Marc Kupietz2b17b212023-08-27 17:47:26 +0200464#' @importFrom stringr str_detect
465#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
466#'
467matches2FreqTable <- function(matches,
468 index = 0,
469 minOccur = 5,
470 leftContextSize = 5,
471 rightContextSize = 5,
472 ignoreCollocateCase = FALSE,
473 stopwords = c(),
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200474 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietz2b17b212023-08-27 17:47:26 +0200475 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
476 verbose = TRUE) {
477 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
478 frequency <- NULL
479
480 if (nrow(matches) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200481 dplyr::tibble(word = c(), frequency = c())
Marc Kupietz2b17b212023-08-27 17:47:26 +0200482 } else if (index == 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200483 if (!"tokens" %in% colnames(matches) || !is.list(matches$tokens)) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200484 log_info(verbose, "Outdated KorAP server: Falling back to client side tokenization.\n")
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200485 return(snippet2FreqTable(matches$snippet, minOccur, leftContextSize, rightContextSize,
486 ignoreCollocateCase = ignoreCollocateCase,
487 stopwords = stopwords, oldTable = oldTable, verbose = verbose
488 ))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200489 }
490 log_info(verbose, paste("Joining", nrow(matches), "kwics\n"))
Marc Kupietza25fbd92025-10-14 17:38:09 +0200491 for (i in seq_len(nrow(matches))) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200492 oldTable <- matches2FreqTable(
493 matches,
494 i,
495 leftContextSize = leftContextSize,
496 rightContextSize = rightContextSize,
497 collocateFilterRegex = collocateFilterRegex,
498 oldTable = oldTable,
499 stopwords = stopwords
500 )
501 }
502 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200503 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100504 group_by(word) |>
505 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200506 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +0200507 arrange(desc(frequency))
508 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200509 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietz2b17b212023-08-27 17:47:26 +0200510
511 left <- tail(unlist(matches$tokens$left[index]), leftContextSize)
512
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200513 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200514
515 right <- head(unlist(matches$tokens$right[index]), rightContextSize)
516
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200517 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200518
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200519 if (length(left) + length(right) == 0) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200520 oldTable
521 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100522 table(c(left, right)) |>
523 dplyr::as_tibble(.name_repair = "minimal") |>
524 dplyr::rename(word = 1, frequency = 2) |>
525 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200526 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +0200527 dplyr::bind_rows(oldTable)
528 }
529 }
530}
531
532#' @importFrom magrittr debug_pipe
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200533#' @importFrom stringr str_match str_split str_detect
534#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
535#'
536snippet2FreqTable <- function(snippet,
537 minOccur = 5,
538 leftContextSize = 5,
539 rightContextSize = 5,
540 ignoreCollocateCase = FALSE,
541 stopwords = c(),
542 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200543 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200544 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
545 verbose = TRUE) {
546 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
547 frequency <- NULL
548
549 if (length(snippet) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200550 dplyr::tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200551 } else if (length(snippet) > 1) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200552 log_info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200553 for (s in snippet) {
554 oldTable <- snippet2FreqTable(
555 s,
556 leftContextSize = leftContextSize,
557 rightContextSize = rightContextSize,
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100558 collocateFilterRegex = collocateFilterRegex,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200559 oldTable = oldTable,
560 stopwords = stopwords
561 )
562 }
Marc Kupietza47d1502023-04-18 15:26:47 +0200563 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200564 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100565 group_by(word) |>
566 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200567 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200568 arrange(desc(frequency))
569 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200570 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200571 match <-
572 str_match(
573 snippet,
574 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
575 )
576
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200577 left <- if (leftContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200578 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200579 } else {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200580 ""
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200581 }
582 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200583
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200584 right <- if (rightContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200585 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200586 } else {
587 ""
588 }
589 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200590
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200591 if (is.na(left[1]) || is.na(right[1]) || length(left) + length(right) == 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200592 oldTable
593 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100594 table(c(left, right)) |>
595 dplyr::as_tibble(.name_repair = "minimal") |>
596 dplyr::rename(word = 1, frequency = 2) |>
597 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200598 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200599 dplyr::bind_rows(oldTable)
600 }
601 }
602}
603
604#' Preliminary synsemantic stopwords function
605#'
606#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +0200607#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200608#'
609#' Preliminary synsemantic stopwords function to be used in collocation analysis.
610#'
611#' @details
612#' Currently only suitable for German. See stopwords package for other languages.
613#'
614#' @param ... future arguments for language detection
615#'
616#' @family collocation analysis functions
617#' @return Vector of synsemantic stopwords.
618#' @export
619synsemanticStopwords <- function(...) {
620 res <- c(
621 "der",
622 "die",
623 "und",
624 "in",
625 "den",
626 "von",
627 "mit",
628 "das",
629 "zu",
630 "im",
631 "ist",
632 "auf",
633 "sich",
634 "Die",
635 "des",
636 "dem",
637 "nicht",
638 "ein",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100639 "Ein",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200640 "eine",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100641 "Eine",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200642 "es",
643 "auch",
644 "an",
645 "als",
646 "am",
647 "aus",
648 "Der",
649 "bei",
650 "er",
651 "dass",
652 "sie",
653 "nach",
654 "um",
655 "Das",
656 "zum",
657 "noch",
658 "war",
659 "einen",
660 "einer",
661 "wie",
662 "einem",
663 "vor",
664 "bis",
665 "\u00fcber",
666 "so",
667 "aber",
668 "Eine",
669 "diese",
670 "Diese",
671 "oder"
672 )
673 return(res)
674}
675
Marc Kupietz5a336b62021-11-27 17:51:35 +0100676
Marc Kupietz76b05592021-12-19 16:26:15 +0100677# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100678findExample <-
679 function(kco,
680 query,
681 vc = "",
682 matchOnly = TRUE) {
683 out <- character(length = length(query))
684
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200685 if (length(vc) < length(query)) {
Marc Kupietz5a336b62021-11-27 17:51:35 +0100686 vc <- rep(vc, length(query))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200687 }
Marc Kupietz5a336b62021-11-27 17:51:35 +0100688
689 for (i in seq_along(query)) {
690 q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100691 if (q@totalResults > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200692 q <- fetchNext(q, maxFetch = 50, randomizePageOrder = F)
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100693 example <- as.character((q@collectedMatches)$snippet[1])
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200694 out[i] <- if (matchOnly) {
695 gsub(".*<mark>(.+)</mark>.*", "\\1", example)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100696 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200697 stringr::str_replace(example, "<[^>]*>", "")
Marc Kupietz5a336b62021-11-27 17:51:35 +0100698 }
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100699 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200700 out[i] <- ""
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100701 }
Marc Kupietz5a336b62021-11-27 17:51:35 +0100702 }
703 out
704 }
705
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200706collocatesQuery <-
707 function(kco,
708 query,
709 vc = "",
710 minOccur = 5,
711 leftContextSize = 5,
712 rightContextSize = 5,
713 searchHitsSampleLimit = 20000,
714 ignoreCollocateCase = FALSE,
715 stopwords = c(),
716 ...) {
717 frequency <- NULL
718 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200719 if (q@totalResults == 0) {
720 tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200721 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200722 q <- fetchNext(q, maxFetch = searchHitsSampleLimit, randomizePageOrder = TRUE)
723 matches2FreqTable(q@collectedMatches,
724 0,
725 minOccur = minOccur,
726 leftContextSize = leftContextSize,
727 rightContextSize = rightContextSize,
728 ignoreCollocateCase = ignoreCollocateCase,
729 stopwords = stopwords,
730 ...,
731 verbose = kco@verbose
732 ) |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100733 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200734 filter(frequency >= minOccur)
735 }
736 }