blob: b7c5be6a43fe416b54d8a078e4c2ffb5e80b9f25 [file] [log] [blame]
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001#' @include logging.R
2setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02003
4#' Collocation analysis
5#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02006#' @family collocation analysis functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +02007#' @aliases collocationAnalysis
8#'
9#' @description
Marc Kupietzdbd431a2021-08-29 12:17:45 +020010#'
11#' Performs a collocation analysis for the given node (or query)
12#' in the given virtual corpus.
13#'
14#' @details
15#' The collocation analysis is currently implemented on the client side, as some of the
16#' functionality is not yet provided by the KorAP backend. Mainly for this reason
17#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
18#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
19#' and look for expression-internal collocates using the focus function (see examples and demo).
20#'
21#' To increase speed at the cost of accuracy and possible false negatives,
22#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
23#'
Marc Kupietze7f0d682025-02-19 10:50:59 +010024#' Note that some outdated non-DeReKo back-ends might not yet support returning tokenized matches (warning issued).
25#' In this case, the client library will fall back to client-side tokenization which might be slightly less accurate.
26#' This might lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
Marc Kupietzdbd431a2021-08-29 12:17:45 +020027#' user interface.
28#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020029#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020030#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietzdbd431a2021-08-29 12:17:45 +020031#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
32#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
33#' @param searchHitsSampleLimit limit the size of the search hits sample
34#' @param stopwords vector of stopwords not to be considered as collocates
Marc Kupietz6bd9cad2024-12-18 15:57:26 +010035#' @param withinSpan KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`.
Marc Kupietzdbd431a2021-08-29 12:17:45 +020036#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
37#' @param seed seed for random page collecting order
Marc Kupietz67edcb52021-09-20 21:54:24 +020038#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
Marc Kupietz7d400e02021-12-19 16:39:36 +010039#' @param maxRecurse apply collocation analysis recursively `maxRecurse` times
40#' @param addExamples If TRUE, examples for instances of collocations will be added in a column `example`. This makes a difference in particular if `node` is given as a lemma query.
41#' @param thresholdScore association score function (see \code{\link{association-score-functions}}) to use for computing the threshold that is applied for recursive collocation analysis calls
42#' @param threshold minimum value of `thresholdScore` function call to apply collocation analysis recursively
43#' @param localStopwords vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010044#' @param collocateFilterRegex allow only collocates matching the regular expression
Marc Kupietz9894a372025-10-18 14:51:29 +020045#' @param missingScoreQuantile lower quantile (evaluated per association measure) that anchors the adaptive floor used for imputing missing scores between virtual corpora
Marc Kupietze34a8be2025-10-17 20:13:42 +020046#' @param vcLabel optional label override for the current virtual corpus (used internally when named VC collections are expanded)
Marc Kupietz67edcb52021-09-20 21:54:24 +020047#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietzdbd431a2021-08-29 12:17:45 +020048#' @inheritParams collocationScoreQuery,KorAPConnection-method
Marc Kupietz130a2a22025-10-18 16:09:23 +020049#' @return
50#' A tibble where each row represents a candidate collocate for the requested node.
51#' Columns include (depending on the selected association measures):
52#'
53#' \itemize{
54#' \item \code{node}, \code{collocate}, \code{vc}, \code{label}: identifiers for the query node, collocate, virtual corpus, and optional label.
55#' \item Frequency and contingency information such as \code{frequency}, \code{O}, \code{O1}, \code{O2}, \code{E}, \code{leftContextSize}, \code{rightContextSize}, and \code{w}.
56#' \item Association measures (e.g. \code{logDice}, \code{ll}, \code{mi}, ...), one column per requested scorer.
57#' \item Per-labelled association scores produced by multi-VC comparisons using the pattern \code{<measure>_<label>}.
58#' \item Ranks per label/measure with the pattern \code{rank_<label>_<measure>} (1 is best) and the corresponding percentile ranks \code{percentile_rank_<label>_<measure>}.
59#' \item Pairwise contrasts for two-label comparisons, e.g. \code{delta_<measure>}, \code{delta_rank_<measure>}, and \code{delta_percentile_rank_<measure>}.
60#' \item Summary columns describing the strongest labels per measure (\code{winner_*}, \code{runner_up_*}, \code{loser_*}, and \code{max_delta_*}).
61#' \item Optional helper columns such as \code{query}, \code{example}, or \code{url} when example retrieval is requested.
62#' }
Marc Kupietzc4540a22025-10-14 17:39:53 +020063#' @importFrom dplyr arrange desc slice_head bind_rows group_by mutate ungroup left_join select row_number all_of first
Marc Kupietzdbd431a2021-08-29 12:17:45 +020064#' @importFrom purrr pmap
Marc Kupietzc4540a22025-10-14 17:39:53 +020065#' @importFrom tidyr expand_grid pivot_wider
66#' @importFrom rlang sym
Marc Kupietzdbd431a2021-08-29 12:17:45 +020067#'
68#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020069#' \dontrun{
70#'
Marc Kupietz6dfeed92025-06-03 11:58:06 +020071#' # Find top collocates of "Packung" inside and outside the sports domain.
72#' KorAPConnection(verbose = TRUE) |>
73#' collocationAnalysis("Packung",
74#' vc = c("textClass=sport", "textClass!=sport"),
75#' leftContextSize = 1, rightContextSize = 1, topCollocatesLimit = 20
76#' ) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020077#' dplyr::filter(logDice >= 5)
78#' }
79#'
Marc Kupietz6ae76052021-09-21 10:34:00 +020080#' \dontrun{
81#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020082#' # Identify the most prominent light verb construction with "in ... setzen".
83#' # Note that, currently, the use of focus function disallows exactFrequencies.
Marc Kupietz4cd066d2025-02-28 15:48:23 +010084#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020085#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
Marc Kupietz6dfeed92025-06-03 11:58:06 +020086#' leftContextSize = 1, rightContextSize = 0, exactFrequencies = FALSE, topCollocatesLimit = 20
87#' )
Marc Kupietzdbd431a2021-08-29 12:17:45 +020088#' }
89#'
90#' @export
Marc Kupietz6dfeed92025-06-03 11:58:06 +020091setMethod(
92 "collocationAnalysis", "KorAPConnection",
93 function(kco,
94 node,
95 vc = "",
96 lemmatizeNodeQuery = FALSE,
97 minOccur = 5,
98 leftContextSize = 5,
99 rightContextSize = 5,
100 topCollocatesLimit = 200,
101 searchHitsSampleLimit = 20000,
102 ignoreCollocateCase = FALSE,
103 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
104 exactFrequencies = TRUE,
105 stopwords = append(RKorAPClient::synsemanticStopwords(), node),
106 seed = 7,
107 expand = length(vc) != length(node),
108 maxRecurse = 0,
109 addExamples = FALSE,
110 thresholdScore = "logDice",
111 threshold = 2.0,
112 localStopwords = c(),
113 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietz9894a372025-10-18 14:51:29 +0200114 missingScoreQuantile = 0.05,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200115 vcLabel = NA_character_,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200116 ...) {
Marc Kupietzb2862d42025-10-18 10:17:49 +0200117 word <- frequency <- O <- NULL
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200118
Marc Kupietzb2862d42025-10-18 10:17:49 +0200119 if (!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nzchar(withinSpan))) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200120 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
121 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200122
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200123 warnIfNotAuthorized(kco)
Marc Kupietz581a29b2021-09-04 20:51:04 +0200124
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200125 if (lemmatizeNodeQuery) {
126 node <- lemmatizeWordQuery(node)
127 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200128
Marc Kupietze34a8be2025-10-17 20:13:42 +0200129 vcNames <- names(vc)
Marc Kupietze34a8be2025-10-17 20:13:42 +0200130 if (is.null(vcNames)) {
131 vcNames <- rep(NA_character_, length(vc))
Marc Kupietze34a8be2025-10-17 20:13:42 +0200132 }
133
134 label_lookup <- NULL
Marc Kupietzb2862d42025-10-18 10:17:49 +0200135 if (!is.null(names(vc)) && length(vc) > 0) {
136 raw_names <- names(vc)
137 if (any(!is.na(raw_names) & raw_names != "")) {
138 label_lookup <- stats::setNames(raw_names, vc)
139 }
Marc Kupietze34a8be2025-10-17 20:13:42 +0200140 }
141
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200142 result <- if (length(node) > 1 || length(vc) > 1) {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200143 grid <- if (expand) {
Marc Kupietzb2862d42025-10-18 10:17:49 +0200144 tmp_grid <- tidyr::expand_grid(node = node, idx = seq_along(vc))
145 tmp_grid$vc <- vc[tmp_grid$idx]
146 tmp_grid$vcLabel <- vcNames[tmp_grid$idx]
147 tmp_grid[, c("node", "vc", "vcLabel"), drop = FALSE]
Marc Kupietze34a8be2025-10-17 20:13:42 +0200148 } else {
149 tibble(node = node, vc = vc, vcLabel = vcNames)
150 }
151
152 multi_result <- purrr::pmap(grid, function(node, vc, vcLabel, ...) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200153 collocationAnalysis(kco,
154 node = node,
155 vc = vc,
156 minOccur = minOccur,
157 leftContextSize = leftContextSize,
158 rightContextSize = rightContextSize,
159 topCollocatesLimit = topCollocatesLimit,
160 searchHitsSampleLimit = searchHitsSampleLimit,
161 ignoreCollocateCase = ignoreCollocateCase,
162 withinSpan = withinSpan,
163 exactFrequencies = exactFrequencies,
164 stopwords = stopwords,
165 addExamples = TRUE,
166 localStopwords = localStopwords,
167 seed = seed,
168 expand = expand,
Marc Kupietz9894a372025-10-18 14:51:29 +0200169 missingScoreQuantile = missingScoreQuantile,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200170 collocateFilterRegex = collocateFilterRegex,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200171 vcLabel = vcLabel,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200172 ...
173 )
174 }) |>
Marc Kupietze31322e2025-10-17 18:55:36 +0200175 bind_rows()
176
177 if (!"vc" %in% names(multi_result) || nrow(multi_result) == 0) {
178 multi_result
179 } else {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200180 if (!"label" %in% names(multi_result)) {
181 multi_result$label <- NA_character_
182 }
183
184 if (!is.null(label_lookup)) {
185 override <- unname(label_lookup[multi_result$vc])
186 missing_idx <- is.na(multi_result$label) | multi_result$label == ""
187 if (any(missing_idx)) {
188 multi_result$label[missing_idx] <- override[missing_idx]
189 }
190 }
191
192 missing_idx <- is.na(multi_result$label) | multi_result$label == ""
193 if (any(missing_idx)) {
194 multi_result$label[missing_idx] <- queryStringToLabel(multi_result$vc[missing_idx])
195 }
196
Marc Kupietze31322e2025-10-17 18:55:36 +0200197 multi_result |>
Marc Kupietz9894a372025-10-18 14:51:29 +0200198 add_multi_vc_comparisons(
199 thresholdScore = thresholdScore,
200 missingScoreQuantile = missingScoreQuantile
201 )
Marc Kupietze31322e2025-10-17 18:55:36 +0200202 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200203 } else {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200204 if ((is.na(vcLabel) || vcLabel == "") && length(vcNames) >= 1) {
205 vcLabel <- vcNames[1]
206 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200207
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200208 set.seed(seed)
209 candidates <- collocatesQuery(
210 kco,
211 node,
212 vc = vc,
213 minOccur = minOccur,
214 leftContextSize = leftContextSize,
215 rightContextSize = rightContextSize,
216 searchHitsSampleLimit = searchHitsSampleLimit,
217 ignoreCollocateCase = ignoreCollocateCase,
218 stopwords = append(stopwords, localStopwords),
Marc Kupietzb2862d42025-10-18 10:17:49 +0200219 collocateFilterRegex = collocateFilterRegex,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200220 ...
221 )
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200222
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200223 if (nrow(candidates) > 0) {
224 candidates <- candidates |>
225 filter(frequency >= minOccur) |>
226 slice_head(n = topCollocatesLimit)
227 collocationScoreQuery(
228 kco,
229 node = node,
230 collocate = candidates$word,
231 vc = vc,
232 leftContextSize = leftContextSize,
233 rightContextSize = rightContextSize,
234 observed = if (exactFrequencies) NA else candidates$frequency,
235 ignoreCollocateCase = ignoreCollocateCase,
236 withinSpan = withinSpan,
237 ...
238 ) |>
239 filter(O >= minOccur) |>
240 dplyr::arrange(dplyr::desc(logDice))
241 } else {
242 tibble()
243 }
244 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200245
246 if (!is.na(vcLabel) && vcLabel != "" && "label" %in% names(result)) {
247 result$label <- rep(vcLabel, nrow(result))
248 }
249
250 threshold_col <- thresholdScore
251 if (maxRecurse > 0 && nrow(result) > 0 && threshold_col %in% names(result)) {
252 threshold_values <- result[[threshold_col]]
253 eligible_idx <- which(!is.na(threshold_values) & threshold_values >= threshold)
254 if (length(eligible_idx) > 0) {
255 recurseWith <- result[eligible_idx, , drop = FALSE]
256 result <- collocationAnalysis(
257 kco,
258 node = paste0("(", buildCollocationQuery(
259 removeWithinSpan(recurseWith$node, withinSpan),
260 recurseWith$collocate,
261 leftContextSize = leftContextSize,
262 rightContextSize = rightContextSize,
263 withinSpan = ""
264 ), ")"),
265 vc = vc,
266 minOccur = minOccur,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200267 leftContextSize = leftContextSize,
268 rightContextSize = rightContextSize,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200269 withinSpan = withinSpan,
270 maxRecurse = maxRecurse - 1,
271 stopwords = stopwords,
272 localStopwords = recurseWith$collocate,
273 exactFrequencies = exactFrequencies,
274 searchHitsSampleLimit = searchHitsSampleLimit,
275 topCollocatesLimit = topCollocatesLimit,
276 addExamples = FALSE,
Marc Kupietz9894a372025-10-18 14:51:29 +0200277 missingScoreQuantile = missingScoreQuantile,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200278 collocateFilterRegex = collocateFilterRegex,
279 vcLabel = vcLabel
280 ) |>
281 bind_rows(result) |>
282 filter(logDice >= 2) |>
283 filter(O >= minOccur) |>
284 dplyr::arrange(dplyr::desc(logDice))
285 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200286 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200287
288 if (addExamples && nrow(result) > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200289 result$query <- buildCollocationQuery(
290 result$node,
291 result$collocate,
292 leftContextSize = leftContextSize,
293 rightContextSize = rightContextSize,
294 withinSpan = withinSpan
295 )
296 result$example <- findExample(
297 kco,
298 query = result$query,
299 vc = result$vc
300 )
301 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200302
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200303 result
304 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200305)
306
Marc Kupietz76b05592021-12-19 16:26:15 +0100307# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100308removeWithinSpan <- function(query, withinSpan) {
309 if (withinSpan == "") {
310 return(query)
311 }
312 needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200313 res <- gsub(needle, "\\1", query)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100314 needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200315 res <- gsub(needle, "\\1", res)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100316 return(res)
317}
318
Marc Kupietz9894a372025-10-18 14:51:29 +0200319add_multi_vc_comparisons <- function(result, thresholdScore, missingScoreQuantile = 0.05) {
Marc Kupietz28a29842025-10-18 12:25:09 +0200320 label <- node <- collocate <- NULL
Marc Kupietzc4540a22025-10-14 17:39:53 +0200321
322 if (!"label" %in% names(result) || dplyr::n_distinct(result$label) < 2) {
323 return(result)
324 }
325
326 numeric_cols <- names(result)[vapply(result, is.numeric, logical(1))]
327 non_score_cols <- c("N", "O", "O1", "O2", "E", "w", "leftContextSize", "rightContextSize", "frequency")
328 score_cols <- setdiff(numeric_cols, non_score_cols)
329
330 if (length(score_cols) == 0) {
331 return(result)
332 }
333
Marc Kupietz9894a372025-10-18 14:51:29 +0200334 compute_score_floor <- function(values) {
335 finite_values <- values[is.finite(values)]
336 if (length(finite_values) == 0) {
337 return(0)
338 }
339
340 prob <- min(max(missingScoreQuantile, 0), 0.5)
341 q_val <- suppressWarnings(stats::quantile(finite_values,
342 probs = prob,
343 names = FALSE,
344 type = 7
345 ))
346
347 if (!is.finite(q_val)) {
348 q_val <- suppressWarnings(min(finite_values, na.rm = TRUE))
349 }
350
351 min_val <- suppressWarnings(min(finite_values, na.rm = TRUE))
352 if (!is.finite(min_val)) {
353 min_val <- 0
354 }
355
356 spread_candidates <- c(
357 suppressWarnings(stats::IQR(finite_values, na.rm = TRUE, type = 7)),
358 stats::sd(finite_values, na.rm = TRUE),
359 abs(q_val) * 0.1,
360 abs(min_val - q_val)
361 )
362 spread_candidates <- spread_candidates[is.finite(spread_candidates)]
363
364 spread <- 0
365 if (length(spread_candidates) > 0) {
366 spread <- max(spread_candidates)
367 }
368 if (!is.finite(spread) || spread == 0) {
369 spread <- max(abs(q_val), abs(min_val), 1e-06)
370 }
371
372 candidate <- q_val - spread
373 if (!is.finite(candidate)) {
374 candidate <- min_val
375 }
376
377 floor_value <- suppressWarnings(min(c(candidate, min_val), na.rm = TRUE))
378 if (!is.finite(floor_value)) {
379 floor_value <- min_val
380 }
381 if (!is.finite(floor_value)) {
382 floor_value <- 0
383 }
384
385 floor_value
386 }
387
388 score_replacements <- stats::setNames(
389 vapply(score_cols, function(col) {
390 compute_score_floor(result[[col]])
391 }, numeric(1)),
392 score_cols
393 )
394
Marc Kupietzc4540a22025-10-14 17:39:53 +0200395 comparison <- result |>
Marc Kupietz28a29842025-10-18 12:25:09 +0200396 dplyr::select(node, collocate, label, dplyr::all_of(score_cols)) |>
397 tidyr::pivot_wider(
Marc Kupietzc4540a22025-10-14 17:39:53 +0200398 names_from = label,
Marc Kupietz28a29842025-10-18 12:25:09 +0200399 values_from = dplyr::all_of(score_cols),
Marc Kupietzc4540a22025-10-14 17:39:53 +0200400 names_glue = "{.value}_{make.names(label)}",
401 values_fn = dplyr::first
402 )
403
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200404 raw_labels <- unique(result$label)
405 labels <- make.names(raw_labels)
406 label_map <- stats::setNames(raw_labels, labels)
Marc Kupietzc4540a22025-10-14 17:39:53 +0200407
Marc Kupietz28a29842025-10-18 12:25:09 +0200408 rank_data <- result |>
409 dplyr::distinct(node, collocate)
410
411 for (i in seq_along(raw_labels)) {
412 raw_lab <- raw_labels[i]
413 safe_lab <- labels[i]
414 label_df <- result[result$label == raw_lab, c("node", "collocate", score_cols), drop = FALSE]
415 if (nrow(label_df) == 0) {
416 next
417 }
418 label_df <- dplyr::distinct(label_df)
419 rank_tbl <- label_df[, c("node", "collocate"), drop = FALSE]
420 for (col in score_cols) {
421 rank_col_name <- paste0("rank_", safe_lab, "_", col)
Marc Kupietz130a2a22025-10-18 16:09:23 +0200422 percentile_col_name <- paste0("percentile_rank_", safe_lab, "_", col)
Marc Kupietz28a29842025-10-18 12:25:09 +0200423 values <- label_df[[col]]
424 ranks <- rep(NA_real_, length(values))
Marc Kupietz130a2a22025-10-18 16:09:23 +0200425 percentiles <- rep(NA_real_, length(values))
Marc Kupietz28a29842025-10-18 12:25:09 +0200426 valid_idx <- which(!is.na(values))
427 if (length(valid_idx) > 0) {
428 ranks[valid_idx] <- rank(-values[valid_idx], ties.method = "first")
Marc Kupietz130a2a22025-10-18 16:09:23 +0200429 total <- length(valid_idx)
430 percentiles[valid_idx] <- 1 - (ranks[valid_idx] - 1) / total
Marc Kupietz28a29842025-10-18 12:25:09 +0200431 }
432 rank_tbl[[rank_col_name]] <- ranks
Marc Kupietz130a2a22025-10-18 16:09:23 +0200433 rank_tbl[[percentile_col_name]] <- percentiles
Marc Kupietz28a29842025-10-18 12:25:09 +0200434 }
435 rank_data <- dplyr::left_join(rank_data, rank_tbl, by = c("node", "collocate"))
436 }
437
438 comparison <- dplyr::left_join(comparison, rank_data, by = c("node", "collocate"))
439
440 rank_replacements <- numeric(0)
441 rank_column_names <- grep("^rank_", names(comparison), value = TRUE)
442 if (length(rank_column_names) > 0) {
443 rank_replacements <- stats::setNames(
444 vapply(rank_column_names, function(col) {
445 col_values <- comparison[[col]]
446 valid_values <- col_values[!is.na(col_values)]
447 if (length(valid_values) == 0) {
448 nrow(comparison) + 1
449 } else {
450 suppressWarnings(max(valid_values, na.rm = TRUE)) + 1
451 }
452 }, numeric(1)),
453 rank_column_names
454 )
455 }
456
Marc Kupietz130a2a22025-10-18 16:09:23 +0200457 percentile_replacements <- numeric(0)
458 percentile_column_names <- grep("^percentile_rank_", names(comparison), value = TRUE)
459 if (length(percentile_column_names) > 0) {
460 percentile_replacements <- stats::setNames(
461 rep(0, length(percentile_column_names)),
462 percentile_column_names
463 )
464 }
465
Marc Kupietz28a29842025-10-18 12:25:09 +0200466 collapse_label_values <- function(indices, safe_labels_vec) {
467 if (length(indices) == 0) {
468 return(NA_character_)
469 }
470 labs <- label_map[safe_labels_vec[indices]]
471 fallback <- safe_labels_vec[indices]
472 labs[is.na(labs) | labs == ""] <- fallback[is.na(labs) | labs == ""]
473 labs <- labs[!is.na(labs) & labs != ""]
474 if (length(labs) == 0) {
475 return(NA_character_)
476 }
477 paste(unique(labs), collapse = ", ")
478 }
479
Marc Kupietzc4540a22025-10-14 17:39:53 +0200480 if (length(labels) == 2) {
Marc Kupietz9894a372025-10-18 14:51:29 +0200481 fill_scores <- function(x, y, measure_col) {
482 replacement <- score_replacements[[measure_col]]
483 fallback_min <- suppressWarnings(min(c(x, y), na.rm = TRUE))
484 if (!is.finite(fallback_min)) {
485 fallback_min <- 0
Marc Kupietzc4540a22025-10-14 17:39:53 +0200486 }
Marc Kupietz9894a372025-10-18 14:51:29 +0200487 if (!is.null(replacement) && is.finite(replacement)) {
488 replacement <- min(replacement, fallback_min)
489 } else {
490 replacement <- fallback_min
491 }
492 if (!is.finite(replacement)) {
493 replacement <- 0
494 }
495 if (any(is.na(x))) {
496 x[is.na(x)] <- replacement
497 }
498 if (any(is.na(y))) {
499 y[is.na(y)] <- replacement
500 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200501 list(x = x, y = y)
502 }
503
Marc Kupietz130a2a22025-10-18 16:09:23 +0200504 fill_percentiles <- function(x, y, left_pct_col, right_pct_col) {
505 replacement_left <- percentile_replacements[[left_pct_col]]
506 if (is.null(replacement_left) || !is.finite(replacement_left)) {
507 replacement_left <- 0
508 }
509 replacement_right <- percentile_replacements[[right_pct_col]]
510 if (is.null(replacement_right) || !is.finite(replacement_right)) {
511 replacement_right <- 0
512 }
513 if (any(is.na(x))) {
514 x[is.na(x)] <- replacement_left
515 }
516 if (any(is.na(y))) {
517 y[is.na(y)] <- replacement_right
518 }
519 list(x = x, y = y)
520 }
521
Marc Kupietz28a29842025-10-18 12:25:09 +0200522 fill_ranks <- function(x, y, left_rank_col, right_rank_col) {
523 fallback <- nrow(comparison) + 1
524 replacement_left <- rank_replacements[[left_rank_col]]
525 if (is.null(replacement_left) || !is.finite(replacement_left)) {
526 replacement_left <- fallback
Marc Kupietzc4540a22025-10-14 17:39:53 +0200527 }
Marc Kupietz28a29842025-10-18 12:25:09 +0200528 replacement_right <- rank_replacements[[right_rank_col]]
529 if (is.null(replacement_right) || !is.finite(replacement_right)) {
530 replacement_right <- fallback
531 }
532 if (any(is.na(x))) {
533 x[is.na(x)] <- replacement_left
534 }
535 if (any(is.na(y))) {
536 y[is.na(y)] <- replacement_right
537 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200538 list(x = x, y = y)
539 }
540
541 left_label <- labels[1]
542 right_label <- labels[2]
543
544 for (col in score_cols) {
545 left_col <- paste0(col, "_", left_label)
546 right_col <- paste0(col, "_", right_label)
547 if (!all(c(left_col, right_col) %in% names(comparison))) {
548 next
549 }
Marc Kupietz9894a372025-10-18 14:51:29 +0200550 filled <- fill_scores(comparison[[left_col]], comparison[[right_col]], col)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200551 comparison[[left_col]] <- filled$x
552 comparison[[right_col]] <- filled$y
Marc Kupietzc4540a22025-10-14 17:39:53 +0200553 comparison[[paste0("delta_", col)]] <- filled$x - filled$y
Marc Kupietz28a29842025-10-18 12:25:09 +0200554 rank_left <- paste0("rank_", left_label, "_", col)
555 rank_right <- paste0("rank_", right_label, "_", col)
556 if (all(c(rank_left, rank_right) %in% names(comparison))) {
557 filled_rank <- fill_ranks(
558 comparison[[rank_left]],
559 comparison[[rank_right]],
560 rank_left,
561 rank_right
562 )
563 comparison[[paste0("delta_rank_", col)]] <- filled_rank$x - filled_rank$y
564 }
Marc Kupietz130a2a22025-10-18 16:09:23 +0200565 pct_left <- paste0("percentile_rank_", left_label, "_", col)
566 pct_right <- paste0("percentile_rank_", right_label, "_", col)
567 if (all(c(pct_left, pct_right) %in% names(comparison))) {
568 filled_pct <- fill_percentiles(
569 comparison[[pct_left]],
570 comparison[[pct_right]],
571 pct_left,
572 pct_right
573 )
574 comparison[[paste0("delta_percentile_rank_", col)]] <- filled_pct$x - filled_pct$y
575 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200576 }
577 }
578
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200579 for (col in score_cols) {
580 value_cols <- paste0(col, "_", labels)
581 existing <- value_cols %in% names(comparison)
582 if (!any(existing)) {
583 next
584 }
585 value_cols <- value_cols[existing]
586 safe_labels <- labels[existing]
587
588 score_values <- comparison[, value_cols, drop = FALSE]
589
590 winner_label_col <- paste0("winner_", col)
591 winner_value_col <- paste0("winner_", col, "_value")
592 runner_label_col <- paste0("runner_up_", col)
593 runner_value_col <- paste0("runner_up_", col, "_value")
Marc Kupietzb2862d42025-10-18 10:17:49 +0200594 loser_label_col <- paste0("loser_", col)
595 loser_value_col <- paste0("loser_", col, "_value")
596 max_delta_col <- paste0("max_delta_", col)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200597
598 if (nrow(score_values) == 0) {
599 comparison[[winner_label_col]] <- character(0)
600 comparison[[winner_value_col]] <- numeric(0)
601 comparison[[runner_label_col]] <- character(0)
602 comparison[[runner_value_col]] <- numeric(0)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200603 comparison[[loser_label_col]] <- character(0)
604 comparison[[loser_value_col]] <- numeric(0)
605 comparison[[max_delta_col]] <- numeric(0)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200606 next
607 }
608
609 score_matrix <- as.matrix(score_values)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200610 storage.mode(score_matrix) <- "numeric"
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200611
Marc Kupietzb2862d42025-10-18 10:17:49 +0200612 n_rows <- nrow(score_matrix)
613 winner_labels <- rep(NA_character_, n_rows)
614 winner_values <- rep(NA_real_, n_rows)
615 runner_labels <- rep(NA_character_, n_rows)
616 runner_values <- rep(NA_real_, n_rows)
617 loser_labels <- rep(NA_character_, n_rows)
618 loser_values <- rep(NA_real_, n_rows)
619 max_deltas <- rep(NA_real_, n_rows)
620
Marc Kupietzb2862d42025-10-18 10:17:49 +0200621 if (n_rows > 0) {
622 for (i in seq_len(n_rows)) {
623 numeric_row <- as.numeric(score_matrix[i, ])
624 if (all(is.na(numeric_row))) {
625 next
626 }
627
Marc Kupietz9894a372025-10-18 14:51:29 +0200628 replacement <- score_replacements[[col]]
629 fallback_min <- suppressWarnings(min(numeric_row, na.rm = TRUE))
630 if (!is.finite(fallback_min)) {
631 fallback_min <- 0
Marc Kupietzb2862d42025-10-18 10:17:49 +0200632 }
Marc Kupietz9894a372025-10-18 14:51:29 +0200633 if (!is.null(replacement) && is.finite(replacement)) {
634 replacement <- min(replacement, fallback_min)
635 } else {
636 replacement <- fallback_min
637 }
638 if (!is.finite(replacement)) {
639 replacement <- 0
640 }
641 if (any(is.na(numeric_row))) {
642 numeric_row[is.na(numeric_row)] <- replacement
643 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200644 score_matrix[i, ] <- numeric_row
645
646 max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
647 max_idx <- which(numeric_row == max_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200648 winner_labels[i] <- collapse_label_values(max_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200649 winner_values[i] <- max_val
650
651 unique_vals <- sort(unique(numeric_row), decreasing = TRUE)
652 if (length(unique_vals) >= 2) {
653 runner_val <- unique_vals[2]
654 runner_idx <- which(numeric_row == runner_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200655 runner_labels[i] <- collapse_label_values(runner_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200656 runner_values[i] <- runner_val
657 }
658
659 min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
660 min_idx <- which(numeric_row == min_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200661 loser_labels[i] <- collapse_label_values(min_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200662 loser_values[i] <- min_val
663
664 if (is.finite(max_val) && is.finite(min_val)) {
665 max_deltas[i] <- max_val - min_val
666 }
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200667 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200668 }
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200669
Marc Kupietzb2862d42025-10-18 10:17:49 +0200670 comparison[, value_cols] <- score_matrix
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200671 comparison[[winner_label_col]] <- winner_labels
672 comparison[[winner_value_col]] <- winner_values
673 comparison[[runner_label_col]] <- runner_labels
674 comparison[[runner_value_col]] <- runner_values
Marc Kupietzb2862d42025-10-18 10:17:49 +0200675 comparison[[loser_label_col]] <- loser_labels
676 comparison[[loser_value_col]] <- loser_values
677 comparison[[max_delta_col]] <- max_deltas
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200678 }
679
Marc Kupietz28a29842025-10-18 12:25:09 +0200680 for (col in score_cols) {
681 rank_cols <- paste0("rank_", labels, "_", col)
682 existing <- rank_cols %in% names(comparison)
683 if (!any(existing)) {
684 next
685 }
686 rank_cols <- rank_cols[existing]
687 safe_labels <- labels[existing]
688 rank_values <- comparison[, rank_cols, drop = FALSE]
689
690 winner_rank_label_col <- paste0("winner_rank_", col)
691 winner_rank_value_col <- paste0("winner_rank_", col, "_value")
692 runner_rank_label_col <- paste0("runner_up_rank_", col)
693 runner_rank_value_col <- paste0("runner_up_rank_", col, "_value")
694 loser_rank_label_col <- paste0("loser_rank_", col)
695 loser_rank_value_col <- paste0("loser_rank_", col, "_value")
696 max_delta_rank_col <- paste0("max_delta_rank_", col)
697
698 if (nrow(rank_values) == 0) {
699 comparison[[winner_rank_label_col]] <- character(0)
700 comparison[[winner_rank_value_col]] <- numeric(0)
701 comparison[[runner_rank_label_col]] <- character(0)
702 comparison[[runner_rank_value_col]] <- numeric(0)
703 comparison[[loser_rank_label_col]] <- character(0)
704 comparison[[loser_rank_value_col]] <- numeric(0)
705 comparison[[max_delta_rank_col]] <- numeric(0)
706 next
707 }
708
709 rank_matrix <- as.matrix(rank_values)
710 storage.mode(rank_matrix) <- "numeric"
711
712 n_rows <- nrow(rank_matrix)
713 winner_labels <- rep(NA_character_, n_rows)
714 winner_values <- rep(NA_real_, n_rows)
715 runner_labels <- rep(NA_character_, n_rows)
716 runner_values <- rep(NA_real_, n_rows)
717 loser_labels <- rep(NA_character_, n_rows)
718 loser_values <- rep(NA_real_, n_rows)
719 max_deltas <- rep(NA_real_, n_rows)
720
721 for (i in seq_len(n_rows)) {
722 numeric_row <- as.numeric(rank_matrix[i, ])
723 if (all(is.na(numeric_row))) {
724 next
725 }
726
727 if (length(rank_cols) > 0) {
728 replacement_vec <- rank_replacements[rank_cols]
729 replacement_vec[is.na(replacement_vec)] <- nrow(comparison) + 1
730 missing_idx <- which(is.na(numeric_row))
731 if (length(missing_idx) > 0) {
732 numeric_row[missing_idx] <- replacement_vec[missing_idx]
733 }
734 }
735
736 valid_idx <- seq_along(numeric_row)
737 valid_values <- numeric_row[valid_idx]
738 min_val <- suppressWarnings(min(valid_values, na.rm = TRUE))
739 min_positions <- valid_idx[which(valid_values == min_val)]
740 winner_labels[i] <- collapse_label_values(min_positions, safe_labels)
741 winner_values[i] <- min_val
742
743 ordered_vals <- sort(unique(valid_values), decreasing = FALSE)
744 if (length(ordered_vals) >= 2) {
745 runner_val <- ordered_vals[2]
746 runner_positions <- valid_idx[which(valid_values == runner_val)]
747 runner_labels[i] <- collapse_label_values(runner_positions, safe_labels)
748 runner_values[i] <- runner_val
749 }
750
751 max_val <- suppressWarnings(max(valid_values, na.rm = TRUE))
752 max_positions <- valid_idx[which(valid_values == max_val)]
753 loser_labels[i] <- collapse_label_values(max_positions, safe_labels)
754 loser_values[i] <- max_val
755
756 if (is.finite(max_val) && is.finite(min_val)) {
757 max_deltas[i] <- max_val - min_val
758 }
759 }
760
761 comparison[[winner_rank_label_col]] <- winner_labels
762 comparison[[winner_rank_value_col]] <- winner_values
763 comparison[[runner_rank_label_col]] <- runner_labels
764 comparison[[runner_rank_value_col]] <- runner_values
765 comparison[[loser_rank_label_col]] <- loser_labels
766 comparison[[loser_rank_value_col]] <- loser_values
767 comparison[[max_delta_rank_col]] <- max_deltas
768 }
769
Marc Kupietz130a2a22025-10-18 16:09:23 +0200770 for (col in score_cols) {
771 pct_cols <- paste0("percentile_rank_", labels, "_", col)
772 existing <- pct_cols %in% names(comparison)
773 if (!any(existing)) {
774 next
775 }
776 pct_cols <- pct_cols[existing]
777 safe_labels <- labels[existing]
778 pct_values <- comparison[, pct_cols, drop = FALSE]
779
780 winner_pct_label_col <- paste0("winner_percentile_rank_", col)
781 winner_pct_value_col <- paste0("winner_percentile_rank_", col, "_value")
782 runner_pct_label_col <- paste0("runner_up_percentile_rank_", col)
783 runner_pct_value_col <- paste0("runner_up_percentile_rank_", col, "_value")
784 loser_pct_label_col <- paste0("loser_percentile_rank_", col)
785 loser_pct_value_col <- paste0("loser_percentile_rank_", col, "_value")
786 max_delta_pct_col <- paste0("max_delta_percentile_rank_", col)
787
788 if (nrow(pct_values) == 0) {
789 comparison[[winner_pct_label_col]] <- character(0)
790 comparison[[winner_pct_value_col]] <- numeric(0)
791 comparison[[runner_pct_label_col]] <- character(0)
792 comparison[[runner_pct_value_col]] <- numeric(0)
793 comparison[[loser_pct_label_col]] <- character(0)
794 comparison[[loser_pct_value_col]] <- numeric(0)
795 comparison[[max_delta_pct_col]] <- numeric(0)
796 next
797 }
798
799 pct_matrix <- as.matrix(pct_values)
800 storage.mode(pct_matrix) <- "numeric"
801
802 n_rows <- nrow(pct_matrix)
803 winner_labels <- rep(NA_character_, n_rows)
804 winner_values <- rep(NA_real_, n_rows)
805 runner_labels <- rep(NA_character_, n_rows)
806 runner_values <- rep(NA_real_, n_rows)
807 loser_labels <- rep(NA_character_, n_rows)
808 loser_values <- rep(NA_real_, n_rows)
809 max_deltas <- rep(NA_real_, n_rows)
810
811 if (n_rows > 0) {
812 for (i in seq_len(n_rows)) {
813 numeric_row <- as.numeric(pct_matrix[i, ])
814 if (all(is.na(numeric_row))) {
815 next
816 }
817
818 if (any(is.na(numeric_row))) {
819 numeric_row[is.na(numeric_row)] <- 0
820 }
821 pct_matrix[i, ] <- numeric_row
822
823 max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
824 max_idx <- which(numeric_row == max_val)
825 winner_labels[i] <- collapse_label_values(max_idx, safe_labels)
826 winner_values[i] <- max_val
827
828 unique_vals <- sort(unique(numeric_row), decreasing = TRUE)
829 if (length(unique_vals) >= 2) {
830 runner_val <- unique_vals[2]
831 runner_idx <- which(numeric_row == runner_val)
832 runner_labels[i] <- collapse_label_values(runner_idx, safe_labels)
833 runner_values[i] <- runner_val
834 }
835
836 min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
837 min_idx <- which(numeric_row == min_val)
838 loser_labels[i] <- collapse_label_values(min_idx, safe_labels)
839 loser_values[i] <- min_val
840
841 if (is.finite(max_val) && is.finite(min_val)) {
842 max_deltas[i] <- max_val - min_val
843 }
844 }
845 }
846
847 comparison[, pct_cols] <- pct_matrix
848 comparison[[winner_pct_label_col]] <- winner_labels
849 comparison[[winner_pct_value_col]] <- winner_values
850 comparison[[runner_pct_label_col]] <- runner_labels
851 comparison[[runner_pct_value_col]] <- runner_values
852 comparison[[loser_pct_label_col]] <- loser_labels
853 comparison[[loser_pct_value_col]] <- loser_values
854 comparison[[max_delta_pct_col]] <- max_deltas
855 }
856
Marc Kupietzc4540a22025-10-14 17:39:53 +0200857 dplyr::left_join(result, comparison, by = c("node", "collocate"))
858}
859
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200860#' @importFrom magrittr debug_pipe
Marc Kupietz2b17b212023-08-27 17:47:26 +0200861#' @importFrom stringr str_detect
862#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
863#'
864matches2FreqTable <- function(matches,
865 index = 0,
866 minOccur = 5,
867 leftContextSize = 5,
868 rightContextSize = 5,
869 ignoreCollocateCase = FALSE,
870 stopwords = c(),
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200871 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietz2b17b212023-08-27 17:47:26 +0200872 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
873 verbose = TRUE) {
874 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
875 frequency <- NULL
876
877 if (nrow(matches) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200878 dplyr::tibble(word = c(), frequency = c())
Marc Kupietz2b17b212023-08-27 17:47:26 +0200879 } else if (index == 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200880 if (!"tokens" %in% colnames(matches) || !is.list(matches$tokens)) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200881 log_info(verbose, "Outdated KorAP server: Falling back to client side tokenization.\n")
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200882 return(snippet2FreqTable(matches$snippet, minOccur, leftContextSize, rightContextSize,
883 ignoreCollocateCase = ignoreCollocateCase,
884 stopwords = stopwords, oldTable = oldTable, verbose = verbose
885 ))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200886 }
887 log_info(verbose, paste("Joining", nrow(matches), "kwics\n"))
Marc Kupietza25fbd92025-10-14 17:38:09 +0200888 for (i in seq_len(nrow(matches))) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200889 oldTable <- matches2FreqTable(
890 matches,
891 i,
892 leftContextSize = leftContextSize,
893 rightContextSize = rightContextSize,
894 collocateFilterRegex = collocateFilterRegex,
895 oldTable = oldTable,
896 stopwords = stopwords
897 )
898 }
899 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200900 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100901 group_by(word) |>
902 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200903 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +0200904 arrange(desc(frequency))
905 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200906 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietz2b17b212023-08-27 17:47:26 +0200907
908 left <- tail(unlist(matches$tokens$left[index]), leftContextSize)
909
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200910 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200911
912 right <- head(unlist(matches$tokens$right[index]), rightContextSize)
913
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200914 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200915
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200916 if (length(left) + length(right) == 0) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200917 oldTable
918 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100919 table(c(left, right)) |>
920 dplyr::as_tibble(.name_repair = "minimal") |>
921 dplyr::rename(word = 1, frequency = 2) |>
922 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200923 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +0200924 dplyr::bind_rows(oldTable)
925 }
926 }
927}
928
929#' @importFrom magrittr debug_pipe
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200930#' @importFrom stringr str_match str_split str_detect
931#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
932#'
933snippet2FreqTable <- function(snippet,
934 minOccur = 5,
935 leftContextSize = 5,
936 rightContextSize = 5,
937 ignoreCollocateCase = FALSE,
938 stopwords = c(),
939 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200940 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200941 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
942 verbose = TRUE) {
943 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
944 frequency <- NULL
945
946 if (length(snippet) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200947 dplyr::tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200948 } else if (length(snippet) > 1) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200949 log_info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200950 for (s in snippet) {
951 oldTable <- snippet2FreqTable(
952 s,
953 leftContextSize = leftContextSize,
954 rightContextSize = rightContextSize,
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100955 collocateFilterRegex = collocateFilterRegex,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200956 oldTable = oldTable,
957 stopwords = stopwords
958 )
959 }
Marc Kupietza47d1502023-04-18 15:26:47 +0200960 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200961 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100962 group_by(word) |>
963 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200964 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200965 arrange(desc(frequency))
966 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200967 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200968 match <-
969 str_match(
970 snippet,
971 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
972 )
973
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200974 left <- if (leftContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200975 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200976 } else {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200977 ""
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200978 }
979 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200980
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200981 right <- if (rightContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200982 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200983 } else {
984 ""
985 }
986 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200987
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200988 if (is.na(left[1]) || is.na(right[1]) || length(left) + length(right) == 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200989 oldTable
990 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100991 table(c(left, right)) |>
992 dplyr::as_tibble(.name_repair = "minimal") |>
993 dplyr::rename(word = 1, frequency = 2) |>
994 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200995 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200996 dplyr::bind_rows(oldTable)
997 }
998 }
999}
1000
1001#' Preliminary synsemantic stopwords function
1002#'
1003#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +02001004#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001005#'
1006#' Preliminary synsemantic stopwords function to be used in collocation analysis.
1007#'
1008#' @details
1009#' Currently only suitable for German. See stopwords package for other languages.
1010#'
1011#' @param ... future arguments for language detection
1012#'
1013#' @family collocation analysis functions
1014#' @return Vector of synsemantic stopwords.
1015#' @export
1016synsemanticStopwords <- function(...) {
1017 res <- c(
1018 "der",
1019 "die",
1020 "und",
1021 "in",
1022 "den",
1023 "von",
1024 "mit",
1025 "das",
1026 "zu",
1027 "im",
1028 "ist",
1029 "auf",
1030 "sich",
1031 "Die",
1032 "des",
1033 "dem",
1034 "nicht",
1035 "ein",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +01001036 "Ein",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001037 "eine",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +01001038 "Eine",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001039 "es",
1040 "auch",
1041 "an",
1042 "als",
1043 "am",
1044 "aus",
1045 "Der",
1046 "bei",
1047 "er",
1048 "dass",
1049 "sie",
1050 "nach",
1051 "um",
1052 "Das",
1053 "zum",
1054 "noch",
1055 "war",
1056 "einen",
1057 "einer",
1058 "wie",
1059 "einem",
1060 "vor",
1061 "bis",
1062 "\u00fcber",
1063 "so",
1064 "aber",
1065 "Eine",
1066 "diese",
1067 "Diese",
Marc Kupietz130a2a22025-10-18 16:09:23 +02001068 "oder",
1069 "Es",
1070 "Und"
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001071 )
1072 return(res)
1073}
1074
Marc Kupietz5a336b62021-11-27 17:51:35 +01001075
Marc Kupietz76b05592021-12-19 16:26:15 +01001076# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +01001077findExample <-
1078 function(kco,
1079 query,
1080 vc = "",
1081 matchOnly = TRUE) {
1082 out <- character(length = length(query))
1083
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001084 if (length(vc) < length(query)) {
Marc Kupietz5a336b62021-11-27 17:51:35 +01001085 vc <- rep(vc, length(query))
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001086 }
Marc Kupietz5a336b62021-11-27 17:51:35 +01001087
1088 for (i in seq_along(query)) {
1089 q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001090 if (q@totalResults > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001091 q <- fetchNext(q, maxFetch = 50, randomizePageOrder = F)
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001092 example <- as.character((q@collectedMatches)$snippet[1])
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001093 out[i] <- if (matchOnly) {
1094 gsub(".*<mark>(.+)</mark>.*", "\\1", example)
Marc Kupietz5a336b62021-11-27 17:51:35 +01001095 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001096 stringr::str_replace(example, "<[^>]*>", "")
Marc Kupietz5a336b62021-11-27 17:51:35 +01001097 }
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001098 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001099 out[i] <- ""
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001100 }
Marc Kupietz5a336b62021-11-27 17:51:35 +01001101 }
1102 out
1103 }
1104
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001105collocatesQuery <-
1106 function(kco,
1107 query,
1108 vc = "",
1109 minOccur = 5,
1110 leftContextSize = 5,
1111 rightContextSize = 5,
1112 searchHitsSampleLimit = 20000,
1113 ignoreCollocateCase = FALSE,
1114 stopwords = c(),
Marc Kupietzb2862d42025-10-18 10:17:49 +02001115 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001116 ...) {
1117 frequency <- NULL
1118 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001119 if (q@totalResults == 0) {
1120 tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001121 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001122 q <- fetchNext(q, maxFetch = searchHitsSampleLimit, randomizePageOrder = TRUE)
1123 matches2FreqTable(q@collectedMatches,
1124 0,
1125 minOccur = minOccur,
1126 leftContextSize = leftContextSize,
1127 rightContextSize = rightContextSize,
1128 ignoreCollocateCase = ignoreCollocateCase,
1129 stopwords = stopwords,
Marc Kupietzb2862d42025-10-18 10:17:49 +02001130 collocateFilterRegex = collocateFilterRegex,
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001131 ...,
1132 verbose = kco@verbose
1133 ) |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +01001134 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001135 filter(frequency >= minOccur)
1136 }
1137 }