blob: 69d220816c3fc981446a4a93f41062919a92f243 [file] [log] [blame]
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001#' @include logging.R
2setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02003
4#' Collocation analysis
5#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02006#' @family collocation analysis functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +02007#' @aliases collocationAnalysis
8#'
9#' @description
Marc Kupietzdbd431a2021-08-29 12:17:45 +020010#'
11#' Performs a collocation analysis for the given node (or query)
12#' in the given virtual corpus.
13#'
14#' @details
15#' The collocation analysis is currently implemented on the client side, as some of the
16#' functionality is not yet provided by the KorAP backend. Mainly for this reason
17#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
18#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
19#' and look for expression-internal collocates using the focus function (see examples and demo).
20#'
21#' To increase speed at the cost of accuracy and possible false negatives,
22#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
23#'
Marc Kupietze7f0d682025-02-19 10:50:59 +010024#' Note that some outdated non-DeReKo back-ends might not yet support returning tokenized matches (warning issued).
25#' In this case, the client library will fall back to client-side tokenization which might be slightly less accurate.
26#' This might lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
Marc Kupietzdbd431a2021-08-29 12:17:45 +020027#' user interface.
28#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020029#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020030#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietzdbd431a2021-08-29 12:17:45 +020031#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
32#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
33#' @param searchHitsSampleLimit limit the size of the search hits sample
34#' @param stopwords vector of stopwords not to be considered as collocates
Marc Kupietz6bd9cad2024-12-18 15:57:26 +010035#' @param withinSpan KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`.
Marc Kupietzdbd431a2021-08-29 12:17:45 +020036#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
37#' @param seed seed for random page collecting order
Marc Kupietz67edcb52021-09-20 21:54:24 +020038#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
Marc Kupietz7d400e02021-12-19 16:39:36 +010039#' @param maxRecurse apply collocation analysis recursively `maxRecurse` times
40#' @param addExamples If TRUE, examples for instances of collocations will be added in a column `example`. This makes a difference in particular if `node` is given as a lemma query.
41#' @param thresholdScore association score function (see \code{\link{association-score-functions}}) to use for computing the threshold that is applied for recursive collocation analysis calls
42#' @param threshold minimum value of `thresholdScore` function call to apply collocation analysis recursively
43#' @param localStopwords vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010044#' @param collocateFilterRegex allow only collocates matching the regular expression
Marc Kupietz9894a372025-10-18 14:51:29 +020045#' @param missingScoreQuantile lower quantile (evaluated per association measure) that anchors the adaptive floor used for imputing missing scores between virtual corpora
Marc Kupietze34a8be2025-10-17 20:13:42 +020046#' @param vcLabel optional label override for the current virtual corpus (used internally when named VC collections are expanded)
Marc Kupietz67edcb52021-09-20 21:54:24 +020047#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietzdbd431a2021-08-29 12:17:45 +020048#' @inheritParams collocationScoreQuery,KorAPConnection-method
Marc Kupietz130a2a22025-10-18 16:09:23 +020049#' @return
50#' A tibble where each row represents a candidate collocate for the requested node.
51#' Columns include (depending on the selected association measures):
52#'
53#' \itemize{
54#' \item \code{node}, \code{collocate}, \code{vc}, \code{label}: identifiers for the query node, collocate, virtual corpus, and optional label.
55#' \item Frequency and contingency information such as \code{frequency}, \code{O}, \code{O1}, \code{O2}, \code{E}, \code{leftContextSize}, \code{rightContextSize}, and \code{w}.
56#' \item Association measures (e.g. \code{logDice}, \code{ll}, \code{mi}, ...), one column per requested scorer.
57#' \item Per-labelled association scores produced by multi-VC comparisons using the pattern \code{<measure>_<label>}.
58#' \item Ranks per label/measure with the pattern \code{rank_<label>_<measure>} (1 is best) and the corresponding percentile ranks \code{percentile_rank_<label>_<measure>}.
59#' \item Pairwise contrasts for two-label comparisons, e.g. \code{delta_<measure>}, \code{delta_rank_<measure>}, and \code{delta_percentile_rank_<measure>}.
60#' \item Summary columns describing the strongest labels per measure (\code{winner_*}, \code{runner_up_*}, \code{loser_*}, and \code{max_delta_*}).
61#' \item Optional helper columns such as \code{query}, \code{example}, or \code{url} when example retrieval is requested.
62#' }
Marc Kupietzc4540a22025-10-14 17:39:53 +020063#' @importFrom dplyr arrange desc slice_head bind_rows group_by mutate ungroup left_join select row_number all_of first
Marc Kupietzdbd431a2021-08-29 12:17:45 +020064#' @importFrom purrr pmap
Marc Kupietzc4540a22025-10-14 17:39:53 +020065#' @importFrom tidyr expand_grid pivot_wider
66#' @importFrom rlang sym
Marc Kupietzdbd431a2021-08-29 12:17:45 +020067#'
68#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020069#' \dontrun{
70#'
Marc Kupietz6dfeed92025-06-03 11:58:06 +020071#' # Find top collocates of "Packung" inside and outside the sports domain.
72#' KorAPConnection(verbose = TRUE) |>
73#' collocationAnalysis("Packung",
74#' vc = c("textClass=sport", "textClass!=sport"),
75#' leftContextSize = 1, rightContextSize = 1, topCollocatesLimit = 20
76#' ) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020077#' dplyr::filter(logDice >= 5)
78#' }
79#'
Marc Kupietz6ae76052021-09-21 10:34:00 +020080#' \dontrun{
81#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020082#' # Identify the most prominent light verb construction with "in ... setzen".
83#' # Note that, currently, the use of focus function disallows exactFrequencies.
Marc Kupietz4cd066d2025-02-28 15:48:23 +010084#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020085#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
Marc Kupietz6dfeed92025-06-03 11:58:06 +020086#' leftContextSize = 1, rightContextSize = 0, exactFrequencies = FALSE, topCollocatesLimit = 20
87#' )
Marc Kupietzdbd431a2021-08-29 12:17:45 +020088#' }
89#'
90#' @export
Marc Kupietz6dfeed92025-06-03 11:58:06 +020091setMethod(
92 "collocationAnalysis", "KorAPConnection",
93 function(kco,
94 node,
95 vc = "",
96 lemmatizeNodeQuery = FALSE,
97 minOccur = 5,
98 leftContextSize = 5,
99 rightContextSize = 5,
100 topCollocatesLimit = 200,
101 searchHitsSampleLimit = 20000,
102 ignoreCollocateCase = FALSE,
103 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
104 exactFrequencies = TRUE,
105 stopwords = append(RKorAPClient::synsemanticStopwords(), node),
106 seed = 7,
107 expand = length(vc) != length(node),
108 maxRecurse = 0,
109 addExamples = FALSE,
110 thresholdScore = "logDice",
111 threshold = 2.0,
112 localStopwords = c(),
113 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietz9894a372025-10-18 14:51:29 +0200114 missingScoreQuantile = 0.05,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200115 vcLabel = NA_character_,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200116 ...) {
Marc Kupietzb2862d42025-10-18 10:17:49 +0200117 word <- frequency <- O <- NULL
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200118
Marc Kupietzb2862d42025-10-18 10:17:49 +0200119 if (!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nzchar(withinSpan))) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200120 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
121 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200122
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200123 warnIfNotAuthorized(kco)
Marc Kupietz581a29b2021-09-04 20:51:04 +0200124
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200125 if (lemmatizeNodeQuery) {
126 node <- lemmatizeWordQuery(node)
127 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200128
Marc Kupietze34a8be2025-10-17 20:13:42 +0200129 vcNames <- names(vc)
Marc Kupietze34a8be2025-10-17 20:13:42 +0200130 if (is.null(vcNames)) {
131 vcNames <- rep(NA_character_, length(vc))
Marc Kupietze34a8be2025-10-17 20:13:42 +0200132 }
133
134 label_lookup <- NULL
Marc Kupietzb2862d42025-10-18 10:17:49 +0200135 if (!is.null(names(vc)) && length(vc) > 0) {
136 raw_names <- names(vc)
137 if (any(!is.na(raw_names) & raw_names != "")) {
138 label_lookup <- stats::setNames(raw_names, vc)
139 }
Marc Kupietze34a8be2025-10-17 20:13:42 +0200140 }
141
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200142 result <- if (length(node) > 1 || length(vc) > 1) {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200143 grid <- if (expand) {
Marc Kupietzb2862d42025-10-18 10:17:49 +0200144 tmp_grid <- tidyr::expand_grid(node = node, idx = seq_along(vc))
145 tmp_grid$vc <- vc[tmp_grid$idx]
146 tmp_grid$vcLabel <- vcNames[tmp_grid$idx]
147 tmp_grid[, c("node", "vc", "vcLabel"), drop = FALSE]
Marc Kupietze34a8be2025-10-17 20:13:42 +0200148 } else {
149 tibble(node = node, vc = vc, vcLabel = vcNames)
150 }
151
152 multi_result <- purrr::pmap(grid, function(node, vc, vcLabel, ...) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200153 collocationAnalysis(kco,
154 node = node,
155 vc = vc,
156 minOccur = minOccur,
157 leftContextSize = leftContextSize,
158 rightContextSize = rightContextSize,
159 topCollocatesLimit = topCollocatesLimit,
160 searchHitsSampleLimit = searchHitsSampleLimit,
161 ignoreCollocateCase = ignoreCollocateCase,
162 withinSpan = withinSpan,
163 exactFrequencies = exactFrequencies,
164 stopwords = stopwords,
165 addExamples = TRUE,
166 localStopwords = localStopwords,
167 seed = seed,
168 expand = expand,
Marc Kupietz9894a372025-10-18 14:51:29 +0200169 missingScoreQuantile = missingScoreQuantile,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200170 collocateFilterRegex = collocateFilterRegex,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200171 vcLabel = vcLabel,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200172 ...
173 )
174 }) |>
Marc Kupietze31322e2025-10-17 18:55:36 +0200175 bind_rows()
176
177 if (!"vc" %in% names(multi_result) || nrow(multi_result) == 0) {
178 multi_result
179 } else {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200180 if (!"label" %in% names(multi_result)) {
181 multi_result$label <- NA_character_
182 }
183
184 if (!is.null(label_lookup)) {
185 override <- unname(label_lookup[multi_result$vc])
186 missing_idx <- is.na(multi_result$label) | multi_result$label == ""
187 if (any(missing_idx)) {
188 multi_result$label[missing_idx] <- override[missing_idx]
189 }
190 }
191
192 missing_idx <- is.na(multi_result$label) | multi_result$label == ""
193 if (any(missing_idx)) {
194 multi_result$label[missing_idx] <- queryStringToLabel(multi_result$vc[missing_idx])
195 }
196
Marc Kupietze31322e2025-10-17 18:55:36 +0200197 multi_result |>
Marc Kupietz9894a372025-10-18 14:51:29 +0200198 add_multi_vc_comparisons(
Marc Kupietz9894a372025-10-18 14:51:29 +0200199 missingScoreQuantile = missingScoreQuantile
200 )
Marc Kupietze31322e2025-10-17 18:55:36 +0200201 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200202 } else {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200203 if ((is.na(vcLabel) || vcLabel == "") && length(vcNames) >= 1) {
204 vcLabel <- vcNames[1]
205 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200206
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200207 set.seed(seed)
208 candidates <- collocatesQuery(
209 kco,
210 node,
211 vc = vc,
212 minOccur = minOccur,
213 leftContextSize = leftContextSize,
214 rightContextSize = rightContextSize,
215 searchHitsSampleLimit = searchHitsSampleLimit,
216 ignoreCollocateCase = ignoreCollocateCase,
217 stopwords = append(stopwords, localStopwords),
Marc Kupietzb2862d42025-10-18 10:17:49 +0200218 collocateFilterRegex = collocateFilterRegex,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200219 ...
220 )
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200221
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200222 if (nrow(candidates) > 0) {
223 candidates <- candidates |>
224 filter(frequency >= minOccur) |>
225 slice_head(n = topCollocatesLimit)
226 collocationScoreQuery(
227 kco,
228 node = node,
229 collocate = candidates$word,
230 vc = vc,
231 leftContextSize = leftContextSize,
232 rightContextSize = rightContextSize,
233 observed = if (exactFrequencies) NA else candidates$frequency,
234 ignoreCollocateCase = ignoreCollocateCase,
235 withinSpan = withinSpan,
236 ...
237 ) |>
238 filter(O >= minOccur) |>
239 dplyr::arrange(dplyr::desc(logDice))
240 } else {
241 tibble()
242 }
243 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200244
245 if (!is.na(vcLabel) && vcLabel != "" && "label" %in% names(result)) {
246 result$label <- rep(vcLabel, nrow(result))
247 }
248
249 threshold_col <- thresholdScore
250 if (maxRecurse > 0 && nrow(result) > 0 && threshold_col %in% names(result)) {
251 threshold_values <- result[[threshold_col]]
252 eligible_idx <- which(!is.na(threshold_values) & threshold_values >= threshold)
253 if (length(eligible_idx) > 0) {
254 recurseWith <- result[eligible_idx, , drop = FALSE]
255 result <- collocationAnalysis(
256 kco,
257 node = paste0("(", buildCollocationQuery(
258 removeWithinSpan(recurseWith$node, withinSpan),
259 recurseWith$collocate,
260 leftContextSize = leftContextSize,
261 rightContextSize = rightContextSize,
262 withinSpan = ""
263 ), ")"),
264 vc = vc,
265 minOccur = minOccur,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200266 leftContextSize = leftContextSize,
267 rightContextSize = rightContextSize,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200268 withinSpan = withinSpan,
269 maxRecurse = maxRecurse - 1,
270 stopwords = stopwords,
271 localStopwords = recurseWith$collocate,
272 exactFrequencies = exactFrequencies,
273 searchHitsSampleLimit = searchHitsSampleLimit,
274 topCollocatesLimit = topCollocatesLimit,
275 addExamples = FALSE,
Marc Kupietz9894a372025-10-18 14:51:29 +0200276 missingScoreQuantile = missingScoreQuantile,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200277 collocateFilterRegex = collocateFilterRegex,
278 vcLabel = vcLabel
279 ) |>
280 bind_rows(result) |>
281 filter(logDice >= 2) |>
282 filter(O >= minOccur) |>
283 dplyr::arrange(dplyr::desc(logDice))
284 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200285 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200286
287 if (addExamples && nrow(result) > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200288 result$query <- buildCollocationQuery(
289 result$node,
290 result$collocate,
291 leftContextSize = leftContextSize,
292 rightContextSize = rightContextSize,
293 withinSpan = withinSpan
294 )
295 result$example <- findExample(
296 kco,
297 query = result$query,
298 vc = result$vc
299 )
300 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200301
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200302 result
303 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200304)
305
Marc Kupietz76b05592021-12-19 16:26:15 +0100306# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100307removeWithinSpan <- function(query, withinSpan) {
308 if (withinSpan == "") {
309 return(query)
310 }
311 needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200312 res <- gsub(needle, "\\1", query)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100313 needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200314 res <- gsub(needle, "\\1", res)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100315 return(res)
316}
317
Marc Kupietz77852b22025-10-19 11:35:34 +0200318add_multi_vc_comparisons <- function(result, missingScoreQuantile = 0.05) {
Marc Kupietz28a29842025-10-18 12:25:09 +0200319 label <- node <- collocate <- NULL
Marc Kupietzc4540a22025-10-14 17:39:53 +0200320
321 if (!"label" %in% names(result) || dplyr::n_distinct(result$label) < 2) {
322 return(result)
323 }
324
325 numeric_cols <- names(result)[vapply(result, is.numeric, logical(1))]
326 non_score_cols <- c("N", "O", "O1", "O2", "E", "w", "leftContextSize", "rightContextSize", "frequency")
327 score_cols <- setdiff(numeric_cols, non_score_cols)
328
329 if (length(score_cols) == 0) {
330 return(result)
331 }
332
Marc Kupietz9894a372025-10-18 14:51:29 +0200333 compute_score_floor <- function(values) {
334 finite_values <- values[is.finite(values)]
335 if (length(finite_values) == 0) {
336 return(0)
337 }
338
339 prob <- min(max(missingScoreQuantile, 0), 0.5)
340 q_val <- suppressWarnings(stats::quantile(finite_values,
341 probs = prob,
342 names = FALSE,
343 type = 7
344 ))
345
346 if (!is.finite(q_val)) {
347 q_val <- suppressWarnings(min(finite_values, na.rm = TRUE))
348 }
349
350 min_val <- suppressWarnings(min(finite_values, na.rm = TRUE))
351 if (!is.finite(min_val)) {
352 min_val <- 0
353 }
354
355 spread_candidates <- c(
356 suppressWarnings(stats::IQR(finite_values, na.rm = TRUE, type = 7)),
357 stats::sd(finite_values, na.rm = TRUE),
358 abs(q_val) * 0.1,
359 abs(min_val - q_val)
360 )
361 spread_candidates <- spread_candidates[is.finite(spread_candidates)]
362
363 spread <- 0
364 if (length(spread_candidates) > 0) {
365 spread <- max(spread_candidates)
366 }
367 if (!is.finite(spread) || spread == 0) {
368 spread <- max(abs(q_val), abs(min_val), 1e-06)
369 }
370
371 candidate <- q_val - spread
372 if (!is.finite(candidate)) {
373 candidate <- min_val
374 }
375
376 floor_value <- suppressWarnings(min(c(candidate, min_val), na.rm = TRUE))
377 if (!is.finite(floor_value)) {
378 floor_value <- min_val
379 }
380 if (!is.finite(floor_value)) {
381 floor_value <- 0
382 }
383
384 floor_value
385 }
386
387 score_replacements <- stats::setNames(
388 vapply(score_cols, function(col) {
389 compute_score_floor(result[[col]])
390 }, numeric(1)),
391 score_cols
392 )
393
Marc Kupietzc4540a22025-10-14 17:39:53 +0200394 comparison <- result |>
Marc Kupietz28a29842025-10-18 12:25:09 +0200395 dplyr::select(node, collocate, label, dplyr::all_of(score_cols)) |>
396 tidyr::pivot_wider(
Marc Kupietzc4540a22025-10-14 17:39:53 +0200397 names_from = label,
Marc Kupietz28a29842025-10-18 12:25:09 +0200398 values_from = dplyr::all_of(score_cols),
Marc Kupietzc4540a22025-10-14 17:39:53 +0200399 names_glue = "{.value}_{make.names(label)}",
400 values_fn = dplyr::first
401 )
402
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200403 raw_labels <- unique(result$label)
404 labels <- make.names(raw_labels)
405 label_map <- stats::setNames(raw_labels, labels)
Marc Kupietzc4540a22025-10-14 17:39:53 +0200406
Marc Kupietz28a29842025-10-18 12:25:09 +0200407 rank_data <- result |>
408 dplyr::distinct(node, collocate)
409
410 for (i in seq_along(raw_labels)) {
411 raw_lab <- raw_labels[i]
412 safe_lab <- labels[i]
413 label_df <- result[result$label == raw_lab, c("node", "collocate", score_cols), drop = FALSE]
414 if (nrow(label_df) == 0) {
415 next
416 }
417 label_df <- dplyr::distinct(label_df)
418 rank_tbl <- label_df[, c("node", "collocate"), drop = FALSE]
419 for (col in score_cols) {
420 rank_col_name <- paste0("rank_", safe_lab, "_", col)
Marc Kupietz130a2a22025-10-18 16:09:23 +0200421 percentile_col_name <- paste0("percentile_rank_", safe_lab, "_", col)
Marc Kupietz28a29842025-10-18 12:25:09 +0200422 values <- label_df[[col]]
423 ranks <- rep(NA_real_, length(values))
Marc Kupietz130a2a22025-10-18 16:09:23 +0200424 percentiles <- rep(NA_real_, length(values))
Marc Kupietz28a29842025-10-18 12:25:09 +0200425 valid_idx <- which(!is.na(values))
426 if (length(valid_idx) > 0) {
427 ranks[valid_idx] <- rank(-values[valid_idx], ties.method = "first")
Marc Kupietz130a2a22025-10-18 16:09:23 +0200428 total <- length(valid_idx)
429 percentiles[valid_idx] <- 1 - (ranks[valid_idx] - 1) / total
Marc Kupietz28a29842025-10-18 12:25:09 +0200430 }
431 rank_tbl[[rank_col_name]] <- ranks
Marc Kupietz130a2a22025-10-18 16:09:23 +0200432 rank_tbl[[percentile_col_name]] <- percentiles
Marc Kupietz28a29842025-10-18 12:25:09 +0200433 }
434 rank_data <- dplyr::left_join(rank_data, rank_tbl, by = c("node", "collocate"))
435 }
436
437 comparison <- dplyr::left_join(comparison, rank_data, by = c("node", "collocate"))
438
439 rank_replacements <- numeric(0)
440 rank_column_names <- grep("^rank_", names(comparison), value = TRUE)
441 if (length(rank_column_names) > 0) {
442 rank_replacements <- stats::setNames(
443 vapply(rank_column_names, function(col) {
444 col_values <- comparison[[col]]
445 valid_values <- col_values[!is.na(col_values)]
446 if (length(valid_values) == 0) {
447 nrow(comparison) + 1
448 } else {
449 suppressWarnings(max(valid_values, na.rm = TRUE)) + 1
450 }
451 }, numeric(1)),
452 rank_column_names
453 )
454 }
455
Marc Kupietz130a2a22025-10-18 16:09:23 +0200456 percentile_replacements <- numeric(0)
457 percentile_column_names <- grep("^percentile_rank_", names(comparison), value = TRUE)
458 if (length(percentile_column_names) > 0) {
459 percentile_replacements <- stats::setNames(
460 rep(0, length(percentile_column_names)),
461 percentile_column_names
462 )
463 }
464
Marc Kupietz28a29842025-10-18 12:25:09 +0200465 collapse_label_values <- function(indices, safe_labels_vec) {
466 if (length(indices) == 0) {
467 return(NA_character_)
468 }
469 labs <- label_map[safe_labels_vec[indices]]
470 fallback <- safe_labels_vec[indices]
471 labs[is.na(labs) | labs == ""] <- fallback[is.na(labs) | labs == ""]
472 labs <- labs[!is.na(labs) & labs != ""]
473 if (length(labs) == 0) {
474 return(NA_character_)
475 }
476 paste(unique(labs), collapse = ", ")
477 }
478
Marc Kupietzc4540a22025-10-14 17:39:53 +0200479 if (length(labels) == 2) {
Marc Kupietz9894a372025-10-18 14:51:29 +0200480 fill_scores <- function(x, y, measure_col) {
481 replacement <- score_replacements[[measure_col]]
482 fallback_min <- suppressWarnings(min(c(x, y), na.rm = TRUE))
483 if (!is.finite(fallback_min)) {
484 fallback_min <- 0
Marc Kupietzc4540a22025-10-14 17:39:53 +0200485 }
Marc Kupietz9894a372025-10-18 14:51:29 +0200486 if (!is.null(replacement) && is.finite(replacement)) {
487 replacement <- min(replacement, fallback_min)
488 } else {
489 replacement <- fallback_min
490 }
491 if (!is.finite(replacement)) {
492 replacement <- 0
493 }
494 if (any(is.na(x))) {
495 x[is.na(x)] <- replacement
496 }
497 if (any(is.na(y))) {
498 y[is.na(y)] <- replacement
499 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200500 list(x = x, y = y)
501 }
502
Marc Kupietz130a2a22025-10-18 16:09:23 +0200503 fill_percentiles <- function(x, y, left_pct_col, right_pct_col) {
504 replacement_left <- percentile_replacements[[left_pct_col]]
505 if (is.null(replacement_left) || !is.finite(replacement_left)) {
506 replacement_left <- 0
507 }
508 replacement_right <- percentile_replacements[[right_pct_col]]
509 if (is.null(replacement_right) || !is.finite(replacement_right)) {
510 replacement_right <- 0
511 }
512 if (any(is.na(x))) {
513 x[is.na(x)] <- replacement_left
514 }
515 if (any(is.na(y))) {
516 y[is.na(y)] <- replacement_right
517 }
518 list(x = x, y = y)
519 }
520
Marc Kupietz28a29842025-10-18 12:25:09 +0200521 fill_ranks <- function(x, y, left_rank_col, right_rank_col) {
522 fallback <- nrow(comparison) + 1
523 replacement_left <- rank_replacements[[left_rank_col]]
524 if (is.null(replacement_left) || !is.finite(replacement_left)) {
525 replacement_left <- fallback
Marc Kupietzc4540a22025-10-14 17:39:53 +0200526 }
Marc Kupietz28a29842025-10-18 12:25:09 +0200527 replacement_right <- rank_replacements[[right_rank_col]]
528 if (is.null(replacement_right) || !is.finite(replacement_right)) {
529 replacement_right <- fallback
530 }
531 if (any(is.na(x))) {
532 x[is.na(x)] <- replacement_left
533 }
534 if (any(is.na(y))) {
535 y[is.na(y)] <- replacement_right
536 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200537 list(x = x, y = y)
538 }
539
540 left_label <- labels[1]
541 right_label <- labels[2]
542
543 for (col in score_cols) {
544 left_col <- paste0(col, "_", left_label)
545 right_col <- paste0(col, "_", right_label)
546 if (!all(c(left_col, right_col) %in% names(comparison))) {
547 next
548 }
Marc Kupietz9894a372025-10-18 14:51:29 +0200549 filled <- fill_scores(comparison[[left_col]], comparison[[right_col]], col)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200550 comparison[[left_col]] <- filled$x
551 comparison[[right_col]] <- filled$y
Marc Kupietzc4540a22025-10-14 17:39:53 +0200552 comparison[[paste0("delta_", col)]] <- filled$x - filled$y
Marc Kupietz28a29842025-10-18 12:25:09 +0200553 rank_left <- paste0("rank_", left_label, "_", col)
554 rank_right <- paste0("rank_", right_label, "_", col)
555 if (all(c(rank_left, rank_right) %in% names(comparison))) {
556 filled_rank <- fill_ranks(
557 comparison[[rank_left]],
558 comparison[[rank_right]],
559 rank_left,
560 rank_right
561 )
562 comparison[[paste0("delta_rank_", col)]] <- filled_rank$x - filled_rank$y
563 }
Marc Kupietz130a2a22025-10-18 16:09:23 +0200564 pct_left <- paste0("percentile_rank_", left_label, "_", col)
565 pct_right <- paste0("percentile_rank_", right_label, "_", col)
566 if (all(c(pct_left, pct_right) %in% names(comparison))) {
567 filled_pct <- fill_percentiles(
568 comparison[[pct_left]],
569 comparison[[pct_right]],
570 pct_left,
571 pct_right
572 )
573 comparison[[paste0("delta_percentile_rank_", col)]] <- filled_pct$x - filled_pct$y
574 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200575 }
576 }
577
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200578 for (col in score_cols) {
579 value_cols <- paste0(col, "_", labels)
580 existing <- value_cols %in% names(comparison)
581 if (!any(existing)) {
582 next
583 }
584 value_cols <- value_cols[existing]
585 safe_labels <- labels[existing]
586
587 score_values <- comparison[, value_cols, drop = FALSE]
588
589 winner_label_col <- paste0("winner_", col)
590 winner_value_col <- paste0("winner_", col, "_value")
591 runner_label_col <- paste0("runner_up_", col)
592 runner_value_col <- paste0("runner_up_", col, "_value")
Marc Kupietzb2862d42025-10-18 10:17:49 +0200593 loser_label_col <- paste0("loser_", col)
594 loser_value_col <- paste0("loser_", col, "_value")
595 max_delta_col <- paste0("max_delta_", col)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200596
597 if (nrow(score_values) == 0) {
598 comparison[[winner_label_col]] <- character(0)
599 comparison[[winner_value_col]] <- numeric(0)
600 comparison[[runner_label_col]] <- character(0)
601 comparison[[runner_value_col]] <- numeric(0)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200602 comparison[[loser_label_col]] <- character(0)
603 comparison[[loser_value_col]] <- numeric(0)
604 comparison[[max_delta_col]] <- numeric(0)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200605 next
606 }
607
608 score_matrix <- as.matrix(score_values)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200609 storage.mode(score_matrix) <- "numeric"
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200610
Marc Kupietzb2862d42025-10-18 10:17:49 +0200611 n_rows <- nrow(score_matrix)
612 winner_labels <- rep(NA_character_, n_rows)
613 winner_values <- rep(NA_real_, n_rows)
614 runner_labels <- rep(NA_character_, n_rows)
615 runner_values <- rep(NA_real_, n_rows)
616 loser_labels <- rep(NA_character_, n_rows)
617 loser_values <- rep(NA_real_, n_rows)
618 max_deltas <- rep(NA_real_, n_rows)
619
Marc Kupietzb2862d42025-10-18 10:17:49 +0200620 if (n_rows > 0) {
621 for (i in seq_len(n_rows)) {
622 numeric_row <- as.numeric(score_matrix[i, ])
623 if (all(is.na(numeric_row))) {
624 next
625 }
626
Marc Kupietz9894a372025-10-18 14:51:29 +0200627 replacement <- score_replacements[[col]]
628 fallback_min <- suppressWarnings(min(numeric_row, na.rm = TRUE))
629 if (!is.finite(fallback_min)) {
630 fallback_min <- 0
Marc Kupietzb2862d42025-10-18 10:17:49 +0200631 }
Marc Kupietz9894a372025-10-18 14:51:29 +0200632 if (!is.null(replacement) && is.finite(replacement)) {
633 replacement <- min(replacement, fallback_min)
634 } else {
635 replacement <- fallback_min
636 }
637 if (!is.finite(replacement)) {
638 replacement <- 0
639 }
640 if (any(is.na(numeric_row))) {
641 numeric_row[is.na(numeric_row)] <- replacement
642 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200643 score_matrix[i, ] <- numeric_row
644
645 max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
646 max_idx <- which(numeric_row == max_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200647 winner_labels[i] <- collapse_label_values(max_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200648 winner_values[i] <- max_val
649
650 unique_vals <- sort(unique(numeric_row), decreasing = TRUE)
651 if (length(unique_vals) >= 2) {
652 runner_val <- unique_vals[2]
653 runner_idx <- which(numeric_row == runner_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200654 runner_labels[i] <- collapse_label_values(runner_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200655 runner_values[i] <- runner_val
656 }
657
658 min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
659 min_idx <- which(numeric_row == min_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200660 loser_labels[i] <- collapse_label_values(min_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200661 loser_values[i] <- min_val
662
663 if (is.finite(max_val) && is.finite(min_val)) {
664 max_deltas[i] <- max_val - min_val
665 }
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200666 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200667 }
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200668
Marc Kupietzb2862d42025-10-18 10:17:49 +0200669 comparison[, value_cols] <- score_matrix
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200670 comparison[[winner_label_col]] <- winner_labels
671 comparison[[winner_value_col]] <- winner_values
672 comparison[[runner_label_col]] <- runner_labels
673 comparison[[runner_value_col]] <- runner_values
Marc Kupietzb2862d42025-10-18 10:17:49 +0200674 comparison[[loser_label_col]] <- loser_labels
675 comparison[[loser_value_col]] <- loser_values
676 comparison[[max_delta_col]] <- max_deltas
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200677 }
678
Marc Kupietz28a29842025-10-18 12:25:09 +0200679 for (col in score_cols) {
680 rank_cols <- paste0("rank_", labels, "_", col)
681 existing <- rank_cols %in% names(comparison)
682 if (!any(existing)) {
683 next
684 }
685 rank_cols <- rank_cols[existing]
686 safe_labels <- labels[existing]
687 rank_values <- comparison[, rank_cols, drop = FALSE]
688
689 winner_rank_label_col <- paste0("winner_rank_", col)
690 winner_rank_value_col <- paste0("winner_rank_", col, "_value")
691 runner_rank_label_col <- paste0("runner_up_rank_", col)
692 runner_rank_value_col <- paste0("runner_up_rank_", col, "_value")
693 loser_rank_label_col <- paste0("loser_rank_", col)
694 loser_rank_value_col <- paste0("loser_rank_", col, "_value")
695 max_delta_rank_col <- paste0("max_delta_rank_", col)
696
697 if (nrow(rank_values) == 0) {
698 comparison[[winner_rank_label_col]] <- character(0)
699 comparison[[winner_rank_value_col]] <- numeric(0)
700 comparison[[runner_rank_label_col]] <- character(0)
701 comparison[[runner_rank_value_col]] <- numeric(0)
702 comparison[[loser_rank_label_col]] <- character(0)
703 comparison[[loser_rank_value_col]] <- numeric(0)
704 comparison[[max_delta_rank_col]] <- numeric(0)
705 next
706 }
707
708 rank_matrix <- as.matrix(rank_values)
709 storage.mode(rank_matrix) <- "numeric"
710
711 n_rows <- nrow(rank_matrix)
712 winner_labels <- rep(NA_character_, n_rows)
713 winner_values <- rep(NA_real_, n_rows)
714 runner_labels <- rep(NA_character_, n_rows)
715 runner_values <- rep(NA_real_, n_rows)
716 loser_labels <- rep(NA_character_, n_rows)
717 loser_values <- rep(NA_real_, n_rows)
718 max_deltas <- rep(NA_real_, n_rows)
719
720 for (i in seq_len(n_rows)) {
721 numeric_row <- as.numeric(rank_matrix[i, ])
722 if (all(is.na(numeric_row))) {
723 next
724 }
725
726 if (length(rank_cols) > 0) {
727 replacement_vec <- rank_replacements[rank_cols]
728 replacement_vec[is.na(replacement_vec)] <- nrow(comparison) + 1
729 missing_idx <- which(is.na(numeric_row))
730 if (length(missing_idx) > 0) {
731 numeric_row[missing_idx] <- replacement_vec[missing_idx]
732 }
733 }
734
735 valid_idx <- seq_along(numeric_row)
736 valid_values <- numeric_row[valid_idx]
737 min_val <- suppressWarnings(min(valid_values, na.rm = TRUE))
738 min_positions <- valid_idx[which(valid_values == min_val)]
739 winner_labels[i] <- collapse_label_values(min_positions, safe_labels)
740 winner_values[i] <- min_val
741
742 ordered_vals <- sort(unique(valid_values), decreasing = FALSE)
743 if (length(ordered_vals) >= 2) {
744 runner_val <- ordered_vals[2]
745 runner_positions <- valid_idx[which(valid_values == runner_val)]
746 runner_labels[i] <- collapse_label_values(runner_positions, safe_labels)
747 runner_values[i] <- runner_val
748 }
749
750 max_val <- suppressWarnings(max(valid_values, na.rm = TRUE))
751 max_positions <- valid_idx[which(valid_values == max_val)]
752 loser_labels[i] <- collapse_label_values(max_positions, safe_labels)
753 loser_values[i] <- max_val
754
755 if (is.finite(max_val) && is.finite(min_val)) {
756 max_deltas[i] <- max_val - min_val
757 }
758 }
759
760 comparison[[winner_rank_label_col]] <- winner_labels
761 comparison[[winner_rank_value_col]] <- winner_values
762 comparison[[runner_rank_label_col]] <- runner_labels
763 comparison[[runner_rank_value_col]] <- runner_values
764 comparison[[loser_rank_label_col]] <- loser_labels
765 comparison[[loser_rank_value_col]] <- loser_values
766 comparison[[max_delta_rank_col]] <- max_deltas
767 }
768
Marc Kupietz130a2a22025-10-18 16:09:23 +0200769 for (col in score_cols) {
770 pct_cols <- paste0("percentile_rank_", labels, "_", col)
771 existing <- pct_cols %in% names(comparison)
772 if (!any(existing)) {
773 next
774 }
775 pct_cols <- pct_cols[existing]
776 safe_labels <- labels[existing]
777 pct_values <- comparison[, pct_cols, drop = FALSE]
778
779 winner_pct_label_col <- paste0("winner_percentile_rank_", col)
780 winner_pct_value_col <- paste0("winner_percentile_rank_", col, "_value")
781 runner_pct_label_col <- paste0("runner_up_percentile_rank_", col)
782 runner_pct_value_col <- paste0("runner_up_percentile_rank_", col, "_value")
783 loser_pct_label_col <- paste0("loser_percentile_rank_", col)
784 loser_pct_value_col <- paste0("loser_percentile_rank_", col, "_value")
785 max_delta_pct_col <- paste0("max_delta_percentile_rank_", col)
786
787 if (nrow(pct_values) == 0) {
788 comparison[[winner_pct_label_col]] <- character(0)
789 comparison[[winner_pct_value_col]] <- numeric(0)
790 comparison[[runner_pct_label_col]] <- character(0)
791 comparison[[runner_pct_value_col]] <- numeric(0)
792 comparison[[loser_pct_label_col]] <- character(0)
793 comparison[[loser_pct_value_col]] <- numeric(0)
794 comparison[[max_delta_pct_col]] <- numeric(0)
795 next
796 }
797
798 pct_matrix <- as.matrix(pct_values)
799 storage.mode(pct_matrix) <- "numeric"
800
801 n_rows <- nrow(pct_matrix)
802 winner_labels <- rep(NA_character_, n_rows)
803 winner_values <- rep(NA_real_, n_rows)
804 runner_labels <- rep(NA_character_, n_rows)
805 runner_values <- rep(NA_real_, n_rows)
806 loser_labels <- rep(NA_character_, n_rows)
807 loser_values <- rep(NA_real_, n_rows)
808 max_deltas <- rep(NA_real_, n_rows)
809
810 if (n_rows > 0) {
811 for (i in seq_len(n_rows)) {
812 numeric_row <- as.numeric(pct_matrix[i, ])
813 if (all(is.na(numeric_row))) {
814 next
815 }
816
817 if (any(is.na(numeric_row))) {
818 numeric_row[is.na(numeric_row)] <- 0
819 }
820 pct_matrix[i, ] <- numeric_row
821
822 max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
823 max_idx <- which(numeric_row == max_val)
824 winner_labels[i] <- collapse_label_values(max_idx, safe_labels)
825 winner_values[i] <- max_val
826
827 unique_vals <- sort(unique(numeric_row), decreasing = TRUE)
828 if (length(unique_vals) >= 2) {
829 runner_val <- unique_vals[2]
830 runner_idx <- which(numeric_row == runner_val)
831 runner_labels[i] <- collapse_label_values(runner_idx, safe_labels)
832 runner_values[i] <- runner_val
833 }
834
835 min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
836 min_idx <- which(numeric_row == min_val)
837 loser_labels[i] <- collapse_label_values(min_idx, safe_labels)
838 loser_values[i] <- min_val
839
840 if (is.finite(max_val) && is.finite(min_val)) {
841 max_deltas[i] <- max_val - min_val
842 }
843 }
844 }
845
846 comparison[, pct_cols] <- pct_matrix
847 comparison[[winner_pct_label_col]] <- winner_labels
848 comparison[[winner_pct_value_col]] <- winner_values
849 comparison[[runner_pct_label_col]] <- runner_labels
850 comparison[[runner_pct_value_col]] <- runner_values
851 comparison[[loser_pct_label_col]] <- loser_labels
852 comparison[[loser_pct_value_col]] <- loser_values
853 comparison[[max_delta_pct_col]] <- max_deltas
854 }
855
Marc Kupietzc4540a22025-10-14 17:39:53 +0200856 dplyr::left_join(result, comparison, by = c("node", "collocate"))
857}
858
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200859#' @importFrom magrittr debug_pipe
Marc Kupietz2b17b212023-08-27 17:47:26 +0200860#' @importFrom stringr str_detect
861#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
862#'
863matches2FreqTable <- function(matches,
864 index = 0,
865 minOccur = 5,
866 leftContextSize = 5,
867 rightContextSize = 5,
868 ignoreCollocateCase = FALSE,
869 stopwords = c(),
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200870 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietz2b17b212023-08-27 17:47:26 +0200871 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
872 verbose = TRUE) {
873 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
874 frequency <- NULL
875
876 if (nrow(matches) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200877 dplyr::tibble(word = c(), frequency = c())
Marc Kupietz2b17b212023-08-27 17:47:26 +0200878 } else if (index == 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200879 if (!"tokens" %in% colnames(matches) || !is.list(matches$tokens)) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200880 log_info(verbose, "Outdated KorAP server: Falling back to client side tokenization.\n")
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200881 return(snippet2FreqTable(matches$snippet, minOccur, leftContextSize, rightContextSize,
882 ignoreCollocateCase = ignoreCollocateCase,
883 stopwords = stopwords, oldTable = oldTable, verbose = verbose
884 ))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200885 }
886 log_info(verbose, paste("Joining", nrow(matches), "kwics\n"))
Marc Kupietza25fbd92025-10-14 17:38:09 +0200887 for (i in seq_len(nrow(matches))) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200888 oldTable <- matches2FreqTable(
889 matches,
890 i,
891 leftContextSize = leftContextSize,
892 rightContextSize = rightContextSize,
893 collocateFilterRegex = collocateFilterRegex,
894 oldTable = oldTable,
895 stopwords = stopwords
896 )
897 }
898 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200899 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100900 group_by(word) |>
901 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200902 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +0200903 arrange(desc(frequency))
904 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200905 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietz2b17b212023-08-27 17:47:26 +0200906
907 left <- tail(unlist(matches$tokens$left[index]), leftContextSize)
908
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200909 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200910
911 right <- head(unlist(matches$tokens$right[index]), rightContextSize)
912
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200913 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200914
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200915 if (length(left) + length(right) == 0) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200916 oldTable
917 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100918 table(c(left, right)) |>
919 dplyr::as_tibble(.name_repair = "minimal") |>
920 dplyr::rename(word = 1, frequency = 2) |>
921 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200922 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +0200923 dplyr::bind_rows(oldTable)
924 }
925 }
926}
927
928#' @importFrom magrittr debug_pipe
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200929#' @importFrom stringr str_match str_split str_detect
930#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
931#'
932snippet2FreqTable <- function(snippet,
933 minOccur = 5,
934 leftContextSize = 5,
935 rightContextSize = 5,
936 ignoreCollocateCase = FALSE,
937 stopwords = c(),
938 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200939 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200940 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
941 verbose = TRUE) {
942 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
943 frequency <- NULL
944
945 if (length(snippet) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200946 dplyr::tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200947 } else if (length(snippet) > 1) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200948 log_info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200949 for (s in snippet) {
950 oldTable <- snippet2FreqTable(
951 s,
952 leftContextSize = leftContextSize,
953 rightContextSize = rightContextSize,
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100954 collocateFilterRegex = collocateFilterRegex,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200955 oldTable = oldTable,
956 stopwords = stopwords
957 )
958 }
Marc Kupietza47d1502023-04-18 15:26:47 +0200959 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200960 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100961 group_by(word) |>
962 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200963 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200964 arrange(desc(frequency))
965 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200966 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200967 match <-
968 str_match(
969 snippet,
970 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
971 )
972
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200973 left <- if (leftContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200974 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200975 } else {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200976 ""
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200977 }
978 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200979
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200980 right <- if (rightContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200981 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200982 } else {
983 ""
984 }
985 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200986
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200987 if (is.na(left[1]) || is.na(right[1]) || length(left) + length(right) == 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200988 oldTable
989 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100990 table(c(left, right)) |>
991 dplyr::as_tibble(.name_repair = "minimal") |>
992 dplyr::rename(word = 1, frequency = 2) |>
993 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200994 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200995 dplyr::bind_rows(oldTable)
996 }
997 }
998}
999
1000#' Preliminary synsemantic stopwords function
1001#'
1002#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +02001003#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001004#'
1005#' Preliminary synsemantic stopwords function to be used in collocation analysis.
1006#'
1007#' @details
1008#' Currently only suitable for German. See stopwords package for other languages.
1009#'
1010#' @param ... future arguments for language detection
1011#'
1012#' @family collocation analysis functions
1013#' @return Vector of synsemantic stopwords.
1014#' @export
1015synsemanticStopwords <- function(...) {
1016 res <- c(
1017 "der",
1018 "die",
1019 "und",
1020 "in",
1021 "den",
1022 "von",
1023 "mit",
1024 "das",
1025 "zu",
1026 "im",
1027 "ist",
1028 "auf",
1029 "sich",
1030 "Die",
1031 "des",
1032 "dem",
1033 "nicht",
1034 "ein",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +01001035 "Ein",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001036 "eine",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +01001037 "Eine",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001038 "es",
1039 "auch",
1040 "an",
1041 "als",
1042 "am",
1043 "aus",
1044 "Der",
1045 "bei",
1046 "er",
1047 "dass",
1048 "sie",
1049 "nach",
1050 "um",
1051 "Das",
1052 "zum",
1053 "noch",
1054 "war",
1055 "einen",
1056 "einer",
1057 "wie",
1058 "einem",
1059 "vor",
1060 "bis",
1061 "\u00fcber",
1062 "so",
1063 "aber",
1064 "Eine",
1065 "diese",
1066 "Diese",
Marc Kupietz130a2a22025-10-18 16:09:23 +02001067 "oder",
1068 "Es",
1069 "Und"
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001070 )
1071 return(res)
1072}
1073
Marc Kupietz5a336b62021-11-27 17:51:35 +01001074
Marc Kupietz76b05592021-12-19 16:26:15 +01001075# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +01001076findExample <-
1077 function(kco,
1078 query,
1079 vc = "",
1080 matchOnly = TRUE) {
1081 out <- character(length = length(query))
1082
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001083 if (length(vc) < length(query)) {
Marc Kupietz5a336b62021-11-27 17:51:35 +01001084 vc <- rep(vc, length(query))
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001085 }
Marc Kupietz5a336b62021-11-27 17:51:35 +01001086
1087 for (i in seq_along(query)) {
1088 q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001089 if (q@totalResults > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001090 q <- fetchNext(q, maxFetch = 50, randomizePageOrder = F)
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001091 example <- as.character((q@collectedMatches)$snippet[1])
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001092 out[i] <- if (matchOnly) {
1093 gsub(".*<mark>(.+)</mark>.*", "\\1", example)
Marc Kupietz5a336b62021-11-27 17:51:35 +01001094 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001095 stringr::str_replace(example, "<[^>]*>", "")
Marc Kupietz5a336b62021-11-27 17:51:35 +01001096 }
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001097 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001098 out[i] <- ""
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001099 }
Marc Kupietz5a336b62021-11-27 17:51:35 +01001100 }
1101 out
1102 }
1103
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001104collocatesQuery <-
1105 function(kco,
1106 query,
1107 vc = "",
1108 minOccur = 5,
1109 leftContextSize = 5,
1110 rightContextSize = 5,
1111 searchHitsSampleLimit = 20000,
1112 ignoreCollocateCase = FALSE,
1113 stopwords = c(),
Marc Kupietzb2862d42025-10-18 10:17:49 +02001114 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001115 ...) {
1116 frequency <- NULL
1117 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001118 if (q@totalResults == 0) {
1119 tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001120 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001121 q <- fetchNext(q, maxFetch = searchHitsSampleLimit, randomizePageOrder = TRUE)
1122 matches2FreqTable(q@collectedMatches,
1123 0,
1124 minOccur = minOccur,
1125 leftContextSize = leftContextSize,
1126 rightContextSize = rightContextSize,
1127 ignoreCollocateCase = ignoreCollocateCase,
1128 stopwords = stopwords,
Marc Kupietzb2862d42025-10-18 10:17:49 +02001129 collocateFilterRegex = collocateFilterRegex,
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001130 ...,
1131 verbose = kco@verbose
1132 ) |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +01001133 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001134 filter(frequency >= minOccur)
1135 }
1136 }