blob: f3731040b180a0e49fe180201bebb2fc85088930 [file] [log] [blame]
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001#' @include logging.R
2setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02003
4#' Collocation analysis
5#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02006#' @family collocation analysis functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +02007#' @aliases collocationAnalysis
8#'
9#' @description
Marc Kupietzdbd431a2021-08-29 12:17:45 +020010#'
11#' Performs a collocation analysis for the given node (or query)
12#' in the given virtual corpus.
13#'
14#' @details
15#' The collocation analysis is currently implemented on the client side, as some of the
16#' functionality is not yet provided by the KorAP backend. Mainly for this reason
17#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
18#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
19#' and look for expression-internal collocates using the focus function (see examples and demo).
20#'
21#' To increase speed at the cost of accuracy and possible false negatives,
22#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
23#'
Marc Kupietze7f0d682025-02-19 10:50:59 +010024#' Note that some outdated non-DeReKo back-ends might not yet support returning tokenized matches (warning issued).
25#' In this case, the client library will fall back to client-side tokenization which might be slightly less accurate.
26#' This might lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
Marc Kupietzdbd431a2021-08-29 12:17:45 +020027#' user interface.
28#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020029#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020030#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietzdbd431a2021-08-29 12:17:45 +020031#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
32#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
33#' @param searchHitsSampleLimit limit the size of the search hits sample
34#' @param stopwords vector of stopwords not to be considered as collocates
Marc Kupietz6bd9cad2024-12-18 15:57:26 +010035#' @param withinSpan KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`.
Marc Kupietzdbd431a2021-08-29 12:17:45 +020036#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
37#' @param seed seed for random page collecting order
Marc Kupietz67edcb52021-09-20 21:54:24 +020038#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
Marc Kupietz7d400e02021-12-19 16:39:36 +010039#' @param maxRecurse apply collocation analysis recursively `maxRecurse` times
40#' @param addExamples If TRUE, examples for instances of collocations will be added in a column `example`. This makes a difference in particular if `node` is given as a lemma query.
41#' @param thresholdScore association score function (see \code{\link{association-score-functions}}) to use for computing the threshold that is applied for recursive collocation analysis calls
42#' @param threshold minimum value of `thresholdScore` function call to apply collocation analysis recursively
43#' @param localStopwords vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010044#' @param collocateFilterRegex allow only collocates matching the regular expression
Marc Kupietz4cbb5472025-10-19 12:15:25 +020045#' @param missingScoreQuantile lower quantile (evaluated per association measure) that anchors the adaptive floor used for imputing missing scores between virtual corpora; a robust spread is subtracted from this anchor so the imputed values stay below the weakest observed scores
Marc Kupietze34a8be2025-10-17 20:13:42 +020046#' @param vcLabel optional label override for the current virtual corpus (used internally when named VC collections are expanded)
Marc Kupietz67edcb52021-09-20 21:54:24 +020047#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietzdbd431a2021-08-29 12:17:45 +020048#' @inheritParams collocationScoreQuery,KorAPConnection-method
Marc Kupietz130a2a22025-10-18 16:09:23 +020049#' @return
50#' A tibble where each row represents a candidate collocate for the requested node.
51#' Columns include (depending on the selected association measures):
52#'
53#' \itemize{
54#' \item \code{node}, \code{collocate}, \code{vc}, \code{label}: identifiers for the query node, collocate, virtual corpus, and optional label.
55#' \item Frequency and contingency information such as \code{frequency}, \code{O}, \code{O1}, \code{O2}, \code{E}, \code{leftContextSize}, \code{rightContextSize}, and \code{w}.
56#' \item Association measures (e.g. \code{logDice}, \code{ll}, \code{mi}, ...), one column per requested scorer.
57#' \item Per-labelled association scores produced by multi-VC comparisons using the pattern \code{<measure>_<label>}.
58#' \item Ranks per label/measure with the pattern \code{rank_<label>_<measure>} (1 is best) and the corresponding percentile ranks \code{percentile_rank_<label>_<measure>}.
59#' \item Pairwise contrasts for two-label comparisons, e.g. \code{delta_<measure>}, \code{delta_rank_<measure>}, and \code{delta_percentile_rank_<measure>}.
60#' \item Summary columns describing the strongest labels per measure (\code{winner_*}, \code{runner_up_*}, \code{loser_*}, and \code{max_delta_*}).
61#' \item Optional helper columns such as \code{query}, \code{example}, or \code{url} when example retrieval is requested.
62#' }
Marc Kupietzc4540a22025-10-14 17:39:53 +020063#' @importFrom dplyr arrange desc slice_head bind_rows group_by mutate ungroup left_join select row_number all_of first
Marc Kupietzdbd431a2021-08-29 12:17:45 +020064#' @importFrom purrr pmap
Marc Kupietzc4540a22025-10-14 17:39:53 +020065#' @importFrom tidyr expand_grid pivot_wider
66#' @importFrom rlang sym
Marc Kupietzdbd431a2021-08-29 12:17:45 +020067#'
68#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020069#' \dontrun{
70#'
Marc Kupietz6dfeed92025-06-03 11:58:06 +020071#' # Find top collocates of "Packung" inside and outside the sports domain.
72#' KorAPConnection(verbose = TRUE) |>
73#' collocationAnalysis("Packung",
74#' vc = c("textClass=sport", "textClass!=sport"),
75#' leftContextSize = 1, rightContextSize = 1, topCollocatesLimit = 20
76#' ) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020077#' dplyr::filter(logDice >= 5)
78#' }
79#'
Marc Kupietz6ae76052021-09-21 10:34:00 +020080#' \dontrun{
81#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020082#' # Identify the most prominent light verb construction with "in ... setzen".
83#' # Note that, currently, the use of focus function disallows exactFrequencies.
Marc Kupietz4cd066d2025-02-28 15:48:23 +010084#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020085#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
Marc Kupietz6dfeed92025-06-03 11:58:06 +020086#' leftContextSize = 1, rightContextSize = 0, exactFrequencies = FALSE, topCollocatesLimit = 20
87#' )
Marc Kupietzdbd431a2021-08-29 12:17:45 +020088#' }
89#'
90#' @export
Marc Kupietz6dfeed92025-06-03 11:58:06 +020091setMethod(
92 "collocationAnalysis", "KorAPConnection",
93 function(kco,
94 node,
95 vc = "",
96 lemmatizeNodeQuery = FALSE,
97 minOccur = 5,
98 leftContextSize = 5,
99 rightContextSize = 5,
100 topCollocatesLimit = 200,
101 searchHitsSampleLimit = 20000,
102 ignoreCollocateCase = FALSE,
103 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
104 exactFrequencies = TRUE,
105 stopwords = append(RKorAPClient::synsemanticStopwords(), node),
106 seed = 7,
107 expand = length(vc) != length(node),
108 maxRecurse = 0,
109 addExamples = FALSE,
110 thresholdScore = "logDice",
111 threshold = 2.0,
112 localStopwords = c(),
113 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietz9894a372025-10-18 14:51:29 +0200114 missingScoreQuantile = 0.05,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200115 vcLabel = NA_character_,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200116 ...) {
Marc Kupietzb2862d42025-10-18 10:17:49 +0200117 word <- frequency <- O <- NULL
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200118
Marc Kupietzb2862d42025-10-18 10:17:49 +0200119 if (!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nzchar(withinSpan))) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200120 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
121 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200122
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200123 warnIfNotAuthorized(kco)
Marc Kupietz581a29b2021-09-04 20:51:04 +0200124
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200125 if (lemmatizeNodeQuery) {
126 node <- lemmatizeWordQuery(node)
127 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200128
Marc Kupietze34a8be2025-10-17 20:13:42 +0200129 vcNames <- names(vc)
Marc Kupietze34a8be2025-10-17 20:13:42 +0200130 if (is.null(vcNames)) {
131 vcNames <- rep(NA_character_, length(vc))
Marc Kupietze34a8be2025-10-17 20:13:42 +0200132 }
133
134 label_lookup <- NULL
Marc Kupietzb2862d42025-10-18 10:17:49 +0200135 if (!is.null(names(vc)) && length(vc) > 0) {
136 raw_names <- names(vc)
137 if (any(!is.na(raw_names) & raw_names != "")) {
138 label_lookup <- stats::setNames(raw_names, vc)
139 }
Marc Kupietze34a8be2025-10-17 20:13:42 +0200140 }
141
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200142 result <- if (length(node) > 1 || length(vc) > 1) {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200143 grid <- if (expand) {
Marc Kupietzb2862d42025-10-18 10:17:49 +0200144 tmp_grid <- tidyr::expand_grid(node = node, idx = seq_along(vc))
145 tmp_grid$vc <- vc[tmp_grid$idx]
146 tmp_grid$vcLabel <- vcNames[tmp_grid$idx]
147 tmp_grid[, c("node", "vc", "vcLabel"), drop = FALSE]
Marc Kupietze34a8be2025-10-17 20:13:42 +0200148 } else {
149 tibble(node = node, vc = vc, vcLabel = vcNames)
150 }
151
152 multi_result <- purrr::pmap(grid, function(node, vc, vcLabel, ...) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200153 collocationAnalysis(kco,
154 node = node,
155 vc = vc,
156 minOccur = minOccur,
157 leftContextSize = leftContextSize,
158 rightContextSize = rightContextSize,
159 topCollocatesLimit = topCollocatesLimit,
160 searchHitsSampleLimit = searchHitsSampleLimit,
161 ignoreCollocateCase = ignoreCollocateCase,
162 withinSpan = withinSpan,
163 exactFrequencies = exactFrequencies,
164 stopwords = stopwords,
165 addExamples = TRUE,
166 localStopwords = localStopwords,
167 seed = seed,
168 expand = expand,
Marc Kupietz9894a372025-10-18 14:51:29 +0200169 missingScoreQuantile = missingScoreQuantile,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200170 collocateFilterRegex = collocateFilterRegex,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200171 vcLabel = vcLabel,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200172 ...
173 )
174 }) |>
Marc Kupietze31322e2025-10-17 18:55:36 +0200175 bind_rows()
176
177 if (!"vc" %in% names(multi_result) || nrow(multi_result) == 0) {
178 multi_result
179 } else {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200180 if (!"label" %in% names(multi_result)) {
181 multi_result$label <- NA_character_
182 }
183
184 if (!is.null(label_lookup)) {
185 override <- unname(label_lookup[multi_result$vc])
186 missing_idx <- is.na(multi_result$label) | multi_result$label == ""
187 if (any(missing_idx)) {
188 multi_result$label[missing_idx] <- override[missing_idx]
189 }
190 }
191
192 missing_idx <- is.na(multi_result$label) | multi_result$label == ""
193 if (any(missing_idx)) {
194 multi_result$label[missing_idx] <- queryStringToLabel(multi_result$vc[missing_idx])
195 }
196
Marc Kupietze31322e2025-10-17 18:55:36 +0200197 multi_result |>
Marc Kupietz9894a372025-10-18 14:51:29 +0200198 add_multi_vc_comparisons(
Marc Kupietz9894a372025-10-18 14:51:29 +0200199 missingScoreQuantile = missingScoreQuantile
200 )
Marc Kupietze31322e2025-10-17 18:55:36 +0200201 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200202 } else {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200203 if ((is.na(vcLabel) || vcLabel == "") && length(vcNames) >= 1) {
204 vcLabel <- vcNames[1]
205 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200206
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200207 set.seed(seed)
208 candidates <- collocatesQuery(
209 kco,
210 node,
211 vc = vc,
212 minOccur = minOccur,
213 leftContextSize = leftContextSize,
214 rightContextSize = rightContextSize,
215 searchHitsSampleLimit = searchHitsSampleLimit,
216 ignoreCollocateCase = ignoreCollocateCase,
217 stopwords = append(stopwords, localStopwords),
Marc Kupietzb2862d42025-10-18 10:17:49 +0200218 collocateFilterRegex = collocateFilterRegex,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200219 ...
220 )
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200221
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200222 if (nrow(candidates) > 0) {
223 candidates <- candidates |>
224 filter(frequency >= minOccur) |>
225 slice_head(n = topCollocatesLimit)
226 collocationScoreQuery(
227 kco,
228 node = node,
229 collocate = candidates$word,
230 vc = vc,
231 leftContextSize = leftContextSize,
232 rightContextSize = rightContextSize,
233 observed = if (exactFrequencies) NA else candidates$frequency,
234 ignoreCollocateCase = ignoreCollocateCase,
235 withinSpan = withinSpan,
236 ...
237 ) |>
238 filter(O >= minOccur) |>
239 dplyr::arrange(dplyr::desc(logDice))
240 } else {
241 tibble()
242 }
243 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200244
245 if (!is.na(vcLabel) && vcLabel != "" && "label" %in% names(result)) {
246 result$label <- rep(vcLabel, nrow(result))
247 }
248
249 threshold_col <- thresholdScore
250 if (maxRecurse > 0 && nrow(result) > 0 && threshold_col %in% names(result)) {
251 threshold_values <- result[[threshold_col]]
252 eligible_idx <- which(!is.na(threshold_values) & threshold_values >= threshold)
253 if (length(eligible_idx) > 0) {
254 recurseWith <- result[eligible_idx, , drop = FALSE]
255 result <- collocationAnalysis(
256 kco,
257 node = paste0("(", buildCollocationQuery(
258 removeWithinSpan(recurseWith$node, withinSpan),
259 recurseWith$collocate,
260 leftContextSize = leftContextSize,
261 rightContextSize = rightContextSize,
262 withinSpan = ""
263 ), ")"),
264 vc = vc,
265 minOccur = minOccur,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200266 leftContextSize = leftContextSize,
267 rightContextSize = rightContextSize,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200268 withinSpan = withinSpan,
269 maxRecurse = maxRecurse - 1,
270 stopwords = stopwords,
271 localStopwords = recurseWith$collocate,
272 exactFrequencies = exactFrequencies,
273 searchHitsSampleLimit = searchHitsSampleLimit,
274 topCollocatesLimit = topCollocatesLimit,
275 addExamples = FALSE,
Marc Kupietz9894a372025-10-18 14:51:29 +0200276 missingScoreQuantile = missingScoreQuantile,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200277 collocateFilterRegex = collocateFilterRegex,
278 vcLabel = vcLabel
279 ) |>
280 bind_rows(result) |>
281 filter(logDice >= 2) |>
282 filter(O >= minOccur) |>
283 dplyr::arrange(dplyr::desc(logDice))
284 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200285 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200286
287 if (addExamples && nrow(result) > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200288 result$query <- buildCollocationQuery(
289 result$node,
290 result$collocate,
291 leftContextSize = leftContextSize,
292 rightContextSize = rightContextSize,
293 withinSpan = withinSpan
294 )
295 result$example <- findExample(
296 kco,
297 query = result$query,
298 vc = result$vc
299 )
300 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200301
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200302 result
303 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200304)
305
Marc Kupietz76b05592021-12-19 16:26:15 +0100306# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100307removeWithinSpan <- function(query, withinSpan) {
308 if (withinSpan == "") {
309 return(query)
310 }
311 needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200312 res <- gsub(needle, "\\1", query)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100313 needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200314 res <- gsub(needle, "\\1", res)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100315 return(res)
316}
317
Marc Kupietz77852b22025-10-19 11:35:34 +0200318add_multi_vc_comparisons <- function(result, missingScoreQuantile = 0.05) {
Marc Kupietz28a29842025-10-18 12:25:09 +0200319 label <- node <- collocate <- NULL
Marc Kupietzc4540a22025-10-14 17:39:53 +0200320
321 if (!"label" %in% names(result) || dplyr::n_distinct(result$label) < 2) {
322 return(result)
323 }
324
325 numeric_cols <- names(result)[vapply(result, is.numeric, logical(1))]
326 non_score_cols <- c("N", "O", "O1", "O2", "E", "w", "leftContextSize", "rightContextSize", "frequency")
327 score_cols <- setdiff(numeric_cols, non_score_cols)
328
329 if (length(score_cols) == 0) {
330 return(result)
331 }
332
Marc Kupietz9894a372025-10-18 14:51:29 +0200333 compute_score_floor <- function(values) {
Marc Kupietz4cbb5472025-10-19 12:15:25 +0200334 # Estimate a conservative floor so missing scores can be imputed without favoring any label
Marc Kupietz9894a372025-10-18 14:51:29 +0200335 finite_values <- values[is.finite(values)]
336 if (length(finite_values) == 0) {
337 return(0)
338 }
339
340 prob <- min(max(missingScoreQuantile, 0), 0.5)
Marc Kupietz4cbb5472025-10-19 12:15:25 +0200341 # Use a lower quantile as the anchor to stay near the weakest attested scores
Marc Kupietz9894a372025-10-18 14:51:29 +0200342 q_val <- suppressWarnings(stats::quantile(finite_values,
343 probs = prob,
344 names = FALSE,
345 type = 7
346 ))
347
348 if (!is.finite(q_val)) {
349 q_val <- suppressWarnings(min(finite_values, na.rm = TRUE))
350 }
351
352 min_val <- suppressWarnings(min(finite_values, na.rm = TRUE))
353 if (!is.finite(min_val)) {
354 min_val <- 0
355 }
356
357 spread_candidates <- c(
358 suppressWarnings(stats::IQR(finite_values, na.rm = TRUE, type = 7)),
359 stats::sd(finite_values, na.rm = TRUE),
360 abs(q_val) * 0.1,
361 abs(min_val - q_val)
362 )
363 spread_candidates <- spread_candidates[is.finite(spread_candidates)]
364
365 spread <- 0
366 if (length(spread_candidates) > 0) {
367 spread <- max(spread_candidates)
368 }
369 if (!is.finite(spread) || spread == 0) {
370 spread <- max(abs(q_val), abs(min_val), 1e-06)
371 }
372
Marc Kupietz4cbb5472025-10-19 12:15:25 +0200373 # Step away from the anchor by a robust spread estimate to avoid ties with real scores
Marc Kupietz9894a372025-10-18 14:51:29 +0200374 candidate <- q_val - spread
375 if (!is.finite(candidate)) {
376 candidate <- min_val
377 }
378
379 floor_value <- suppressWarnings(min(c(candidate, min_val), na.rm = TRUE))
380 if (!is.finite(floor_value)) {
381 floor_value <- min_val
382 }
383 if (!is.finite(floor_value)) {
384 floor_value <- 0
385 }
386
387 floor_value
388 }
389
390 score_replacements <- stats::setNames(
391 vapply(score_cols, function(col) {
392 compute_score_floor(result[[col]])
393 }, numeric(1)),
394 score_cols
395 )
396
Marc Kupietzc4540a22025-10-14 17:39:53 +0200397 comparison <- result |>
Marc Kupietz28a29842025-10-18 12:25:09 +0200398 dplyr::select(node, collocate, label, dplyr::all_of(score_cols)) |>
399 tidyr::pivot_wider(
Marc Kupietzc4540a22025-10-14 17:39:53 +0200400 names_from = label,
Marc Kupietz28a29842025-10-18 12:25:09 +0200401 values_from = dplyr::all_of(score_cols),
Marc Kupietzc4540a22025-10-14 17:39:53 +0200402 names_glue = "{.value}_{make.names(label)}",
403 values_fn = dplyr::first
404 )
405
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200406 raw_labels <- unique(result$label)
407 labels <- make.names(raw_labels)
408 label_map <- stats::setNames(raw_labels, labels)
Marc Kupietzc4540a22025-10-14 17:39:53 +0200409
Marc Kupietz28a29842025-10-18 12:25:09 +0200410 rank_data <- result |>
411 dplyr::distinct(node, collocate)
412
413 for (i in seq_along(raw_labels)) {
414 raw_lab <- raw_labels[i]
415 safe_lab <- labels[i]
416 label_df <- result[result$label == raw_lab, c("node", "collocate", score_cols), drop = FALSE]
417 if (nrow(label_df) == 0) {
418 next
419 }
420 label_df <- dplyr::distinct(label_df)
421 rank_tbl <- label_df[, c("node", "collocate"), drop = FALSE]
422 for (col in score_cols) {
423 rank_col_name <- paste0("rank_", safe_lab, "_", col)
Marc Kupietz130a2a22025-10-18 16:09:23 +0200424 percentile_col_name <- paste0("percentile_rank_", safe_lab, "_", col)
Marc Kupietz28a29842025-10-18 12:25:09 +0200425 values <- label_df[[col]]
426 ranks <- rep(NA_real_, length(values))
Marc Kupietz130a2a22025-10-18 16:09:23 +0200427 percentiles <- rep(NA_real_, length(values))
Marc Kupietz28a29842025-10-18 12:25:09 +0200428 valid_idx <- which(!is.na(values))
429 if (length(valid_idx) > 0) {
430 ranks[valid_idx] <- rank(-values[valid_idx], ties.method = "first")
Marc Kupietz130a2a22025-10-18 16:09:23 +0200431 total <- length(valid_idx)
432 percentiles[valid_idx] <- 1 - (ranks[valid_idx] - 1) / total
Marc Kupietz28a29842025-10-18 12:25:09 +0200433 }
434 rank_tbl[[rank_col_name]] <- ranks
Marc Kupietz130a2a22025-10-18 16:09:23 +0200435 rank_tbl[[percentile_col_name]] <- percentiles
Marc Kupietz28a29842025-10-18 12:25:09 +0200436 }
437 rank_data <- dplyr::left_join(rank_data, rank_tbl, by = c("node", "collocate"))
438 }
439
440 comparison <- dplyr::left_join(comparison, rank_data, by = c("node", "collocate"))
441
442 rank_replacements <- numeric(0)
443 rank_column_names <- grep("^rank_", names(comparison), value = TRUE)
444 if (length(rank_column_names) > 0) {
445 rank_replacements <- stats::setNames(
446 vapply(rank_column_names, function(col) {
447 col_values <- comparison[[col]]
448 valid_values <- col_values[!is.na(col_values)]
449 if (length(valid_values) == 0) {
450 nrow(comparison) + 1
451 } else {
452 suppressWarnings(max(valid_values, na.rm = TRUE)) + 1
453 }
454 }, numeric(1)),
455 rank_column_names
456 )
457 }
458
Marc Kupietz130a2a22025-10-18 16:09:23 +0200459 percentile_replacements <- numeric(0)
460 percentile_column_names <- grep("^percentile_rank_", names(comparison), value = TRUE)
461 if (length(percentile_column_names) > 0) {
462 percentile_replacements <- stats::setNames(
463 rep(0, length(percentile_column_names)),
464 percentile_column_names
465 )
466 }
467
Marc Kupietz28a29842025-10-18 12:25:09 +0200468 collapse_label_values <- function(indices, safe_labels_vec) {
469 if (length(indices) == 0) {
470 return(NA_character_)
471 }
472 labs <- label_map[safe_labels_vec[indices]]
473 fallback <- safe_labels_vec[indices]
474 labs[is.na(labs) | labs == ""] <- fallback[is.na(labs) | labs == ""]
475 labs <- labs[!is.na(labs) & labs != ""]
476 if (length(labs) == 0) {
477 return(NA_character_)
478 }
479 paste(unique(labs), collapse = ", ")
480 }
481
Marc Kupietzc4540a22025-10-14 17:39:53 +0200482 if (length(labels) == 2) {
Marc Kupietz9894a372025-10-18 14:51:29 +0200483 fill_scores <- function(x, y, measure_col) {
484 replacement <- score_replacements[[measure_col]]
485 fallback_min <- suppressWarnings(min(c(x, y), na.rm = TRUE))
486 if (!is.finite(fallback_min)) {
487 fallback_min <- 0
Marc Kupietzc4540a22025-10-14 17:39:53 +0200488 }
Marc Kupietz9894a372025-10-18 14:51:29 +0200489 if (!is.null(replacement) && is.finite(replacement)) {
490 replacement <- min(replacement, fallback_min)
491 } else {
492 replacement <- fallback_min
493 }
494 if (!is.finite(replacement)) {
495 replacement <- 0
496 }
497 if (any(is.na(x))) {
498 x[is.na(x)] <- replacement
499 }
500 if (any(is.na(y))) {
501 y[is.na(y)] <- replacement
502 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200503 list(x = x, y = y)
504 }
505
Marc Kupietz130a2a22025-10-18 16:09:23 +0200506 fill_percentiles <- function(x, y, left_pct_col, right_pct_col) {
507 replacement_left <- percentile_replacements[[left_pct_col]]
508 if (is.null(replacement_left) || !is.finite(replacement_left)) {
509 replacement_left <- 0
510 }
511 replacement_right <- percentile_replacements[[right_pct_col]]
512 if (is.null(replacement_right) || !is.finite(replacement_right)) {
513 replacement_right <- 0
514 }
515 if (any(is.na(x))) {
516 x[is.na(x)] <- replacement_left
517 }
518 if (any(is.na(y))) {
519 y[is.na(y)] <- replacement_right
520 }
521 list(x = x, y = y)
522 }
523
Marc Kupietz28a29842025-10-18 12:25:09 +0200524 fill_ranks <- function(x, y, left_rank_col, right_rank_col) {
525 fallback <- nrow(comparison) + 1
526 replacement_left <- rank_replacements[[left_rank_col]]
527 if (is.null(replacement_left) || !is.finite(replacement_left)) {
528 replacement_left <- fallback
Marc Kupietzc4540a22025-10-14 17:39:53 +0200529 }
Marc Kupietz28a29842025-10-18 12:25:09 +0200530 replacement_right <- rank_replacements[[right_rank_col]]
531 if (is.null(replacement_right) || !is.finite(replacement_right)) {
532 replacement_right <- fallback
533 }
534 if (any(is.na(x))) {
535 x[is.na(x)] <- replacement_left
536 }
537 if (any(is.na(y))) {
538 y[is.na(y)] <- replacement_right
539 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200540 list(x = x, y = y)
541 }
542
543 left_label <- labels[1]
544 right_label <- labels[2]
545
546 for (col in score_cols) {
547 left_col <- paste0(col, "_", left_label)
548 right_col <- paste0(col, "_", right_label)
549 if (!all(c(left_col, right_col) %in% names(comparison))) {
550 next
551 }
Marc Kupietz9894a372025-10-18 14:51:29 +0200552 filled <- fill_scores(comparison[[left_col]], comparison[[right_col]], col)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200553 comparison[[left_col]] <- filled$x
554 comparison[[right_col]] <- filled$y
Marc Kupietzc4540a22025-10-14 17:39:53 +0200555 comparison[[paste0("delta_", col)]] <- filled$x - filled$y
Marc Kupietz28a29842025-10-18 12:25:09 +0200556 rank_left <- paste0("rank_", left_label, "_", col)
557 rank_right <- paste0("rank_", right_label, "_", col)
558 if (all(c(rank_left, rank_right) %in% names(comparison))) {
559 filled_rank <- fill_ranks(
560 comparison[[rank_left]],
561 comparison[[rank_right]],
562 rank_left,
563 rank_right
564 )
565 comparison[[paste0("delta_rank_", col)]] <- filled_rank$x - filled_rank$y
566 }
Marc Kupietz130a2a22025-10-18 16:09:23 +0200567 pct_left <- paste0("percentile_rank_", left_label, "_", col)
568 pct_right <- paste0("percentile_rank_", right_label, "_", col)
569 if (all(c(pct_left, pct_right) %in% names(comparison))) {
570 filled_pct <- fill_percentiles(
571 comparison[[pct_left]],
572 comparison[[pct_right]],
573 pct_left,
574 pct_right
575 )
576 comparison[[paste0("delta_percentile_rank_", col)]] <- filled_pct$x - filled_pct$y
577 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200578 }
579 }
580
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200581 for (col in score_cols) {
582 value_cols <- paste0(col, "_", labels)
583 existing <- value_cols %in% names(comparison)
584 if (!any(existing)) {
585 next
586 }
587 value_cols <- value_cols[existing]
588 safe_labels <- labels[existing]
589
590 score_values <- comparison[, value_cols, drop = FALSE]
591
592 winner_label_col <- paste0("winner_", col)
593 winner_value_col <- paste0("winner_", col, "_value")
594 runner_label_col <- paste0("runner_up_", col)
595 runner_value_col <- paste0("runner_up_", col, "_value")
Marc Kupietzb2862d42025-10-18 10:17:49 +0200596 loser_label_col <- paste0("loser_", col)
597 loser_value_col <- paste0("loser_", col, "_value")
598 max_delta_col <- paste0("max_delta_", col)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200599
600 if (nrow(score_values) == 0) {
601 comparison[[winner_label_col]] <- character(0)
602 comparison[[winner_value_col]] <- numeric(0)
603 comparison[[runner_label_col]] <- character(0)
604 comparison[[runner_value_col]] <- numeric(0)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200605 comparison[[loser_label_col]] <- character(0)
606 comparison[[loser_value_col]] <- numeric(0)
607 comparison[[max_delta_col]] <- numeric(0)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200608 next
609 }
610
611 score_matrix <- as.matrix(score_values)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200612 storage.mode(score_matrix) <- "numeric"
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200613
Marc Kupietzb2862d42025-10-18 10:17:49 +0200614 n_rows <- nrow(score_matrix)
615 winner_labels <- rep(NA_character_, n_rows)
616 winner_values <- rep(NA_real_, n_rows)
617 runner_labels <- rep(NA_character_, n_rows)
618 runner_values <- rep(NA_real_, n_rows)
619 loser_labels <- rep(NA_character_, n_rows)
620 loser_values <- rep(NA_real_, n_rows)
621 max_deltas <- rep(NA_real_, n_rows)
622
Marc Kupietzb2862d42025-10-18 10:17:49 +0200623 if (n_rows > 0) {
624 for (i in seq_len(n_rows)) {
625 numeric_row <- as.numeric(score_matrix[i, ])
626 if (all(is.na(numeric_row))) {
627 next
628 }
629
Marc Kupietz9894a372025-10-18 14:51:29 +0200630 replacement <- score_replacements[[col]]
631 fallback_min <- suppressWarnings(min(numeric_row, na.rm = TRUE))
632 if (!is.finite(fallback_min)) {
633 fallback_min <- 0
Marc Kupietzb2862d42025-10-18 10:17:49 +0200634 }
Marc Kupietz9894a372025-10-18 14:51:29 +0200635 if (!is.null(replacement) && is.finite(replacement)) {
636 replacement <- min(replacement, fallback_min)
637 } else {
638 replacement <- fallback_min
639 }
640 if (!is.finite(replacement)) {
641 replacement <- 0
642 }
643 if (any(is.na(numeric_row))) {
644 numeric_row[is.na(numeric_row)] <- replacement
645 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200646 score_matrix[i, ] <- numeric_row
647
648 max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
649 max_idx <- which(numeric_row == max_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200650 winner_labels[i] <- collapse_label_values(max_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200651 winner_values[i] <- max_val
652
653 unique_vals <- sort(unique(numeric_row), decreasing = TRUE)
654 if (length(unique_vals) >= 2) {
655 runner_val <- unique_vals[2]
656 runner_idx <- which(numeric_row == runner_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200657 runner_labels[i] <- collapse_label_values(runner_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200658 runner_values[i] <- runner_val
659 }
660
661 min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
662 min_idx <- which(numeric_row == min_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200663 loser_labels[i] <- collapse_label_values(min_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200664 loser_values[i] <- min_val
665
666 if (is.finite(max_val) && is.finite(min_val)) {
667 max_deltas[i] <- max_val - min_val
668 }
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200669 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200670 }
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200671
Marc Kupietzb2862d42025-10-18 10:17:49 +0200672 comparison[, value_cols] <- score_matrix
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200673 comparison[[winner_label_col]] <- winner_labels
674 comparison[[winner_value_col]] <- winner_values
675 comparison[[runner_label_col]] <- runner_labels
676 comparison[[runner_value_col]] <- runner_values
Marc Kupietzb2862d42025-10-18 10:17:49 +0200677 comparison[[loser_label_col]] <- loser_labels
678 comparison[[loser_value_col]] <- loser_values
679 comparison[[max_delta_col]] <- max_deltas
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200680 }
681
Marc Kupietz28a29842025-10-18 12:25:09 +0200682 for (col in score_cols) {
683 rank_cols <- paste0("rank_", labels, "_", col)
684 existing <- rank_cols %in% names(comparison)
685 if (!any(existing)) {
686 next
687 }
688 rank_cols <- rank_cols[existing]
689 safe_labels <- labels[existing]
690 rank_values <- comparison[, rank_cols, drop = FALSE]
691
692 winner_rank_label_col <- paste0("winner_rank_", col)
693 winner_rank_value_col <- paste0("winner_rank_", col, "_value")
694 runner_rank_label_col <- paste0("runner_up_rank_", col)
695 runner_rank_value_col <- paste0("runner_up_rank_", col, "_value")
696 loser_rank_label_col <- paste0("loser_rank_", col)
697 loser_rank_value_col <- paste0("loser_rank_", col, "_value")
698 max_delta_rank_col <- paste0("max_delta_rank_", col)
699
700 if (nrow(rank_values) == 0) {
701 comparison[[winner_rank_label_col]] <- character(0)
702 comparison[[winner_rank_value_col]] <- numeric(0)
703 comparison[[runner_rank_label_col]] <- character(0)
704 comparison[[runner_rank_value_col]] <- numeric(0)
705 comparison[[loser_rank_label_col]] <- character(0)
706 comparison[[loser_rank_value_col]] <- numeric(0)
707 comparison[[max_delta_rank_col]] <- numeric(0)
708 next
709 }
710
711 rank_matrix <- as.matrix(rank_values)
712 storage.mode(rank_matrix) <- "numeric"
713
714 n_rows <- nrow(rank_matrix)
715 winner_labels <- rep(NA_character_, n_rows)
716 winner_values <- rep(NA_real_, n_rows)
717 runner_labels <- rep(NA_character_, n_rows)
718 runner_values <- rep(NA_real_, n_rows)
719 loser_labels <- rep(NA_character_, n_rows)
720 loser_values <- rep(NA_real_, n_rows)
721 max_deltas <- rep(NA_real_, n_rows)
722
723 for (i in seq_len(n_rows)) {
724 numeric_row <- as.numeric(rank_matrix[i, ])
725 if (all(is.na(numeric_row))) {
726 next
727 }
728
729 if (length(rank_cols) > 0) {
730 replacement_vec <- rank_replacements[rank_cols]
731 replacement_vec[is.na(replacement_vec)] <- nrow(comparison) + 1
732 missing_idx <- which(is.na(numeric_row))
733 if (length(missing_idx) > 0) {
734 numeric_row[missing_idx] <- replacement_vec[missing_idx]
735 }
736 }
737
738 valid_idx <- seq_along(numeric_row)
739 valid_values <- numeric_row[valid_idx]
740 min_val <- suppressWarnings(min(valid_values, na.rm = TRUE))
741 min_positions <- valid_idx[which(valid_values == min_val)]
742 winner_labels[i] <- collapse_label_values(min_positions, safe_labels)
743 winner_values[i] <- min_val
744
745 ordered_vals <- sort(unique(valid_values), decreasing = FALSE)
746 if (length(ordered_vals) >= 2) {
747 runner_val <- ordered_vals[2]
748 runner_positions <- valid_idx[which(valid_values == runner_val)]
749 runner_labels[i] <- collapse_label_values(runner_positions, safe_labels)
750 runner_values[i] <- runner_val
751 }
752
753 max_val <- suppressWarnings(max(valid_values, na.rm = TRUE))
754 max_positions <- valid_idx[which(valid_values == max_val)]
755 loser_labels[i] <- collapse_label_values(max_positions, safe_labels)
756 loser_values[i] <- max_val
757
758 if (is.finite(max_val) && is.finite(min_val)) {
759 max_deltas[i] <- max_val - min_val
760 }
761 }
762
763 comparison[[winner_rank_label_col]] <- winner_labels
764 comparison[[winner_rank_value_col]] <- winner_values
765 comparison[[runner_rank_label_col]] <- runner_labels
766 comparison[[runner_rank_value_col]] <- runner_values
767 comparison[[loser_rank_label_col]] <- loser_labels
768 comparison[[loser_rank_value_col]] <- loser_values
769 comparison[[max_delta_rank_col]] <- max_deltas
770 }
771
Marc Kupietz130a2a22025-10-18 16:09:23 +0200772 for (col in score_cols) {
773 pct_cols <- paste0("percentile_rank_", labels, "_", col)
774 existing <- pct_cols %in% names(comparison)
775 if (!any(existing)) {
776 next
777 }
778 pct_cols <- pct_cols[existing]
779 safe_labels <- labels[existing]
780 pct_values <- comparison[, pct_cols, drop = FALSE]
781
782 winner_pct_label_col <- paste0("winner_percentile_rank_", col)
783 winner_pct_value_col <- paste0("winner_percentile_rank_", col, "_value")
784 runner_pct_label_col <- paste0("runner_up_percentile_rank_", col)
785 runner_pct_value_col <- paste0("runner_up_percentile_rank_", col, "_value")
786 loser_pct_label_col <- paste0("loser_percentile_rank_", col)
787 loser_pct_value_col <- paste0("loser_percentile_rank_", col, "_value")
788 max_delta_pct_col <- paste0("max_delta_percentile_rank_", col)
789
790 if (nrow(pct_values) == 0) {
791 comparison[[winner_pct_label_col]] <- character(0)
792 comparison[[winner_pct_value_col]] <- numeric(0)
793 comparison[[runner_pct_label_col]] <- character(0)
794 comparison[[runner_pct_value_col]] <- numeric(0)
795 comparison[[loser_pct_label_col]] <- character(0)
796 comparison[[loser_pct_value_col]] <- numeric(0)
797 comparison[[max_delta_pct_col]] <- numeric(0)
798 next
799 }
800
801 pct_matrix <- as.matrix(pct_values)
802 storage.mode(pct_matrix) <- "numeric"
803
804 n_rows <- nrow(pct_matrix)
805 winner_labels <- rep(NA_character_, n_rows)
806 winner_values <- rep(NA_real_, n_rows)
807 runner_labels <- rep(NA_character_, n_rows)
808 runner_values <- rep(NA_real_, n_rows)
809 loser_labels <- rep(NA_character_, n_rows)
810 loser_values <- rep(NA_real_, n_rows)
811 max_deltas <- rep(NA_real_, n_rows)
812
813 if (n_rows > 0) {
814 for (i in seq_len(n_rows)) {
815 numeric_row <- as.numeric(pct_matrix[i, ])
816 if (all(is.na(numeric_row))) {
817 next
818 }
819
820 if (any(is.na(numeric_row))) {
821 numeric_row[is.na(numeric_row)] <- 0
822 }
823 pct_matrix[i, ] <- numeric_row
824
825 max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
826 max_idx <- which(numeric_row == max_val)
827 winner_labels[i] <- collapse_label_values(max_idx, safe_labels)
828 winner_values[i] <- max_val
829
830 unique_vals <- sort(unique(numeric_row), decreasing = TRUE)
831 if (length(unique_vals) >= 2) {
832 runner_val <- unique_vals[2]
833 runner_idx <- which(numeric_row == runner_val)
834 runner_labels[i] <- collapse_label_values(runner_idx, safe_labels)
835 runner_values[i] <- runner_val
836 }
837
838 min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
839 min_idx <- which(numeric_row == min_val)
840 loser_labels[i] <- collapse_label_values(min_idx, safe_labels)
841 loser_values[i] <- min_val
842
843 if (is.finite(max_val) && is.finite(min_val)) {
844 max_deltas[i] <- max_val - min_val
845 }
846 }
847 }
848
849 comparison[, pct_cols] <- pct_matrix
850 comparison[[winner_pct_label_col]] <- winner_labels
851 comparison[[winner_pct_value_col]] <- winner_values
852 comparison[[runner_pct_label_col]] <- runner_labels
853 comparison[[runner_pct_value_col]] <- runner_values
854 comparison[[loser_pct_label_col]] <- loser_labels
855 comparison[[loser_pct_value_col]] <- loser_values
856 comparison[[max_delta_pct_col]] <- max_deltas
857 }
858
Marc Kupietzc4540a22025-10-14 17:39:53 +0200859 dplyr::left_join(result, comparison, by = c("node", "collocate"))
860}
861
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200862#' @importFrom magrittr debug_pipe
Marc Kupietz2b17b212023-08-27 17:47:26 +0200863#' @importFrom stringr str_detect
864#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
865#'
866matches2FreqTable <- function(matches,
867 index = 0,
868 minOccur = 5,
869 leftContextSize = 5,
870 rightContextSize = 5,
871 ignoreCollocateCase = FALSE,
872 stopwords = c(),
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200873 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietz2b17b212023-08-27 17:47:26 +0200874 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
875 verbose = TRUE) {
876 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
877 frequency <- NULL
878
879 if (nrow(matches) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200880 dplyr::tibble(word = c(), frequency = c())
Marc Kupietz2b17b212023-08-27 17:47:26 +0200881 } else if (index == 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200882 if (!"tokens" %in% colnames(matches) || !is.list(matches$tokens)) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200883 log_info(verbose, "Outdated KorAP server: Falling back to client side tokenization.\n")
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200884 return(snippet2FreqTable(matches$snippet, minOccur, leftContextSize, rightContextSize,
885 ignoreCollocateCase = ignoreCollocateCase,
886 stopwords = stopwords, oldTable = oldTable, verbose = verbose
887 ))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200888 }
889 log_info(verbose, paste("Joining", nrow(matches), "kwics\n"))
Marc Kupietza25fbd92025-10-14 17:38:09 +0200890 for (i in seq_len(nrow(matches))) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200891 oldTable <- matches2FreqTable(
892 matches,
893 i,
894 leftContextSize = leftContextSize,
895 rightContextSize = rightContextSize,
896 collocateFilterRegex = collocateFilterRegex,
897 oldTable = oldTable,
898 stopwords = stopwords
899 )
900 }
901 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200902 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100903 group_by(word) |>
904 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200905 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +0200906 arrange(desc(frequency))
907 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200908 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietz2b17b212023-08-27 17:47:26 +0200909
910 left <- tail(unlist(matches$tokens$left[index]), leftContextSize)
911
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200912 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200913
914 right <- head(unlist(matches$tokens$right[index]), rightContextSize)
915
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200916 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200917
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200918 if (length(left) + length(right) == 0) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200919 oldTable
920 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100921 table(c(left, right)) |>
922 dplyr::as_tibble(.name_repair = "minimal") |>
923 dplyr::rename(word = 1, frequency = 2) |>
924 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200925 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +0200926 dplyr::bind_rows(oldTable)
927 }
928 }
929}
930
931#' @importFrom magrittr debug_pipe
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200932#' @importFrom stringr str_match str_split str_detect
933#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
934#'
935snippet2FreqTable <- function(snippet,
936 minOccur = 5,
937 leftContextSize = 5,
938 rightContextSize = 5,
939 ignoreCollocateCase = FALSE,
940 stopwords = c(),
941 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200942 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200943 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
944 verbose = TRUE) {
945 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
946 frequency <- NULL
947
948 if (length(snippet) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200949 dplyr::tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200950 } else if (length(snippet) > 1) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200951 log_info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200952 for (s in snippet) {
953 oldTable <- snippet2FreqTable(
954 s,
955 leftContextSize = leftContextSize,
956 rightContextSize = rightContextSize,
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100957 collocateFilterRegex = collocateFilterRegex,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200958 oldTable = oldTable,
959 stopwords = stopwords
960 )
961 }
Marc Kupietza47d1502023-04-18 15:26:47 +0200962 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200963 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100964 group_by(word) |>
965 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200966 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200967 arrange(desc(frequency))
968 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200969 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200970 match <-
971 str_match(
972 snippet,
973 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
974 )
975
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200976 left <- if (leftContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200977 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200978 } else {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200979 ""
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200980 }
981 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200982
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200983 right <- if (rightContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200984 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200985 } else {
986 ""
987 }
988 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200989
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200990 if (is.na(left[1]) || is.na(right[1]) || length(left) + length(right) == 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200991 oldTable
992 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100993 table(c(left, right)) |>
994 dplyr::as_tibble(.name_repair = "minimal") |>
995 dplyr::rename(word = 1, frequency = 2) |>
996 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200997 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200998 dplyr::bind_rows(oldTable)
999 }
1000 }
1001}
1002
1003#' Preliminary synsemantic stopwords function
1004#'
1005#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +02001006#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001007#'
1008#' Preliminary synsemantic stopwords function to be used in collocation analysis.
1009#'
1010#' @details
1011#' Currently only suitable for German. See stopwords package for other languages.
1012#'
1013#' @param ... future arguments for language detection
1014#'
1015#' @family collocation analysis functions
1016#' @return Vector of synsemantic stopwords.
1017#' @export
1018synsemanticStopwords <- function(...) {
1019 res <- c(
1020 "der",
1021 "die",
1022 "und",
1023 "in",
1024 "den",
1025 "von",
1026 "mit",
1027 "das",
1028 "zu",
1029 "im",
1030 "ist",
1031 "auf",
1032 "sich",
1033 "Die",
1034 "des",
1035 "dem",
1036 "nicht",
1037 "ein",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +01001038 "Ein",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001039 "eine",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +01001040 "Eine",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001041 "es",
1042 "auch",
1043 "an",
1044 "als",
1045 "am",
1046 "aus",
1047 "Der",
1048 "bei",
1049 "er",
1050 "dass",
1051 "sie",
1052 "nach",
1053 "um",
1054 "Das",
1055 "zum",
1056 "noch",
1057 "war",
1058 "einen",
1059 "einer",
1060 "wie",
1061 "einem",
1062 "vor",
1063 "bis",
1064 "\u00fcber",
1065 "so",
1066 "aber",
1067 "Eine",
1068 "diese",
1069 "Diese",
Marc Kupietz130a2a22025-10-18 16:09:23 +02001070 "oder",
1071 "Es",
1072 "Und"
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001073 )
1074 return(res)
1075}
1076
Marc Kupietz5a336b62021-11-27 17:51:35 +01001077
Marc Kupietz76b05592021-12-19 16:26:15 +01001078# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +01001079findExample <-
1080 function(kco,
1081 query,
1082 vc = "",
1083 matchOnly = TRUE) {
1084 out <- character(length = length(query))
1085
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001086 if (length(vc) < length(query)) {
Marc Kupietz5a336b62021-11-27 17:51:35 +01001087 vc <- rep(vc, length(query))
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001088 }
Marc Kupietz5a336b62021-11-27 17:51:35 +01001089
1090 for (i in seq_along(query)) {
1091 q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001092 if (q@totalResults > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001093 q <- fetchNext(q, maxFetch = 50, randomizePageOrder = F)
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001094 example <- as.character((q@collectedMatches)$snippet[1])
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001095 out[i] <- if (matchOnly) {
1096 gsub(".*<mark>(.+)</mark>.*", "\\1", example)
Marc Kupietz5a336b62021-11-27 17:51:35 +01001097 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001098 stringr::str_replace(example, "<[^>]*>", "")
Marc Kupietz5a336b62021-11-27 17:51:35 +01001099 }
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001100 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001101 out[i] <- ""
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001102 }
Marc Kupietz5a336b62021-11-27 17:51:35 +01001103 }
1104 out
1105 }
1106
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001107collocatesQuery <-
1108 function(kco,
1109 query,
1110 vc = "",
1111 minOccur = 5,
1112 leftContextSize = 5,
1113 rightContextSize = 5,
1114 searchHitsSampleLimit = 20000,
1115 ignoreCollocateCase = FALSE,
1116 stopwords = c(),
Marc Kupietzb2862d42025-10-18 10:17:49 +02001117 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001118 ...) {
1119 frequency <- NULL
1120 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001121 if (q@totalResults == 0) {
1122 tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001123 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001124 q <- fetchNext(q, maxFetch = searchHitsSampleLimit, randomizePageOrder = TRUE)
1125 matches2FreqTable(q@collectedMatches,
1126 0,
1127 minOccur = minOccur,
1128 leftContextSize = leftContextSize,
1129 rightContextSize = rightContextSize,
1130 ignoreCollocateCase = ignoreCollocateCase,
1131 stopwords = stopwords,
Marc Kupietzb2862d42025-10-18 10:17:49 +02001132 collocateFilterRegex = collocateFilterRegex,
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001133 ...,
1134 verbose = kco@verbose
1135 ) |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +01001136 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001137 filter(frequency >= minOccur)
1138 }
1139 }