blob: 8435c6c13f5307c61cfa4a54184fa5ba67ed4c3f [file] [log] [blame]
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001#' @include logging.R
2setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02003
4#' Collocation analysis
5#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02006#' @family collocation analysis functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +02007#' @aliases collocationAnalysis
8#'
9#' @description
Marc Kupietzdbd431a2021-08-29 12:17:45 +020010#'
11#' Performs a collocation analysis for the given node (or query)
12#' in the given virtual corpus.
13#'
14#' @details
15#' The collocation analysis is currently implemented on the client side, as some of the
16#' functionality is not yet provided by the KorAP backend. Mainly for this reason
17#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
18#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
19#' and look for expression-internal collocates using the focus function (see examples and demo).
20#'
21#' To increase speed at the cost of accuracy and possible false negatives,
22#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
23#'
Marc Kupietze7f0d682025-02-19 10:50:59 +010024#' Note that some outdated non-DeReKo back-ends might not yet support returning tokenized matches (warning issued).
25#' In this case, the client library will fall back to client-side tokenization which might be slightly less accurate.
26#' This might lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
Marc Kupietzdbd431a2021-08-29 12:17:45 +020027#' user interface.
28#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020029#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020030#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietzdbd431a2021-08-29 12:17:45 +020031#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
32#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
33#' @param searchHitsSampleLimit limit the size of the search hits sample
34#' @param stopwords vector of stopwords not to be considered as collocates
Marc Kupietz6bd9cad2024-12-18 15:57:26 +010035#' @param withinSpan KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`.
Marc Kupietzdbd431a2021-08-29 12:17:45 +020036#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
37#' @param seed seed for random page collecting order
Marc Kupietz67edcb52021-09-20 21:54:24 +020038#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
Marc Kupietz7d400e02021-12-19 16:39:36 +010039#' @param maxRecurse apply collocation analysis recursively `maxRecurse` times
40#' @param addExamples If TRUE, examples for instances of collocations will be added in a column `example`. This makes a difference in particular if `node` is given as a lemma query.
41#' @param thresholdScore association score function (see \code{\link{association-score-functions}}) to use for computing the threshold that is applied for recursive collocation analysis calls
42#' @param threshold minimum value of `thresholdScore` function call to apply collocation analysis recursively
43#' @param localStopwords vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010044#' @param collocateFilterRegex allow only collocates matching the regular expression
Marc Kupietzde679ea2025-10-19 13:14:51 +020045#' @param queryMissingScores if TRUE, attempt to retrieve corpus-based association scores for vc/collocate combinations that would otherwise be imputed, by re-querying the KorAP backend without applying the collocate frequency threshold
Marc Kupietz4cbb5472025-10-19 12:15:25 +020046#' @param missingScoreQuantile lower quantile (evaluated per association measure) that anchors the adaptive floor used for imputing missing scores between virtual corpora; a robust spread is subtracted from this anchor so the imputed values stay below the weakest observed scores
Marc Kupietze34a8be2025-10-17 20:13:42 +020047#' @param vcLabel optional label override for the current virtual corpus (used internally when named VC collections are expanded)
Marc Kupietz67edcb52021-09-20 21:54:24 +020048#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietzdbd431a2021-08-29 12:17:45 +020049#' @inheritParams collocationScoreQuery,KorAPConnection-method
Marc Kupietz130a2a22025-10-18 16:09:23 +020050#' @return
51#' A tibble where each row represents a candidate collocate for the requested node.
52#' Columns include (depending on the selected association measures):
53#'
54#' \itemize{
55#' \item \code{node}, \code{collocate}, \code{vc}, \code{label}: identifiers for the query node, collocate, virtual corpus, and optional label.
56#' \item Frequency and contingency information such as \code{frequency}, \code{O}, \code{O1}, \code{O2}, \code{E}, \code{leftContextSize}, \code{rightContextSize}, and \code{w}.
57#' \item Association measures (e.g. \code{logDice}, \code{ll}, \code{mi}, ...), one column per requested scorer.
58#' \item Per-labelled association scores produced by multi-VC comparisons using the pattern \code{<measure>_<label>}.
59#' \item Ranks per label/measure with the pattern \code{rank_<label>_<measure>} (1 is best) and the corresponding percentile ranks \code{percentile_rank_<label>_<measure>}.
60#' \item Pairwise contrasts for two-label comparisons, e.g. \code{delta_<measure>}, \code{delta_rank_<measure>}, and \code{delta_percentile_rank_<measure>}.
61#' \item Summary columns describing the strongest labels per measure (\code{winner_*}, \code{runner_up_*}, \code{loser_*}, and \code{max_delta_*}).
62#' \item Optional helper columns such as \code{query}, \code{example}, or \code{url} when example retrieval is requested.
63#' }
Marc Kupietzc4540a22025-10-14 17:39:53 +020064#' @importFrom dplyr arrange desc slice_head bind_rows group_by mutate ungroup left_join select row_number all_of first
Marc Kupietzdbd431a2021-08-29 12:17:45 +020065#' @importFrom purrr pmap
Marc Kupietzc4540a22025-10-14 17:39:53 +020066#' @importFrom tidyr expand_grid pivot_wider
67#' @importFrom rlang sym
Marc Kupietzdbd431a2021-08-29 12:17:45 +020068#'
69#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020070#' \dontrun{
71#'
Marc Kupietz6dfeed92025-06-03 11:58:06 +020072#' # Find top collocates of "Packung" inside and outside the sports domain.
73#' KorAPConnection(verbose = TRUE) |>
74#' collocationAnalysis("Packung",
75#' vc = c("textClass=sport", "textClass!=sport"),
76#' leftContextSize = 1, rightContextSize = 1, topCollocatesLimit = 20
77#' ) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020078#' dplyr::filter(logDice >= 5)
79#' }
80#'
Marc Kupietz6ae76052021-09-21 10:34:00 +020081#' \dontrun{
82#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020083#' # Identify the most prominent light verb construction with "in ... setzen".
84#' # Note that, currently, the use of focus function disallows exactFrequencies.
Marc Kupietz4cd066d2025-02-28 15:48:23 +010085#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020086#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
Marc Kupietz6dfeed92025-06-03 11:58:06 +020087#' leftContextSize = 1, rightContextSize = 0, exactFrequencies = FALSE, topCollocatesLimit = 20
88#' )
Marc Kupietzdbd431a2021-08-29 12:17:45 +020089#' }
90#'
91#' @export
Marc Kupietz6dfeed92025-06-03 11:58:06 +020092setMethod(
93 "collocationAnalysis", "KorAPConnection",
94 function(kco,
95 node,
96 vc = "",
97 lemmatizeNodeQuery = FALSE,
98 minOccur = 5,
99 leftContextSize = 5,
100 rightContextSize = 5,
101 topCollocatesLimit = 200,
102 searchHitsSampleLimit = 20000,
103 ignoreCollocateCase = FALSE,
104 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
105 exactFrequencies = TRUE,
106 stopwords = append(RKorAPClient::synsemanticStopwords(), node),
107 seed = 7,
108 expand = length(vc) != length(node),
109 maxRecurse = 0,
110 addExamples = FALSE,
111 thresholdScore = "logDice",
112 threshold = 2.0,
113 localStopwords = c(),
114 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzde679ea2025-10-19 13:14:51 +0200115 queryMissingScores = FALSE,
Marc Kupietz9894a372025-10-18 14:51:29 +0200116 missingScoreQuantile = 0.05,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200117 vcLabel = NA_character_,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200118 ...) {
Marc Kupietzb2862d42025-10-18 10:17:49 +0200119 word <- frequency <- O <- NULL
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200120
Marc Kupietzb2862d42025-10-18 10:17:49 +0200121 if (!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nzchar(withinSpan))) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200122 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
123 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200124
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200125 warnIfNotAuthorized(kco)
Marc Kupietz581a29b2021-09-04 20:51:04 +0200126
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200127 if (lemmatizeNodeQuery) {
128 node <- lemmatizeWordQuery(node)
129 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200130
Marc Kupietze34a8be2025-10-17 20:13:42 +0200131 vcNames <- names(vc)
Marc Kupietze34a8be2025-10-17 20:13:42 +0200132 if (is.null(vcNames)) {
133 vcNames <- rep(NA_character_, length(vc))
Marc Kupietze34a8be2025-10-17 20:13:42 +0200134 }
135
136 label_lookup <- NULL
Marc Kupietzb2862d42025-10-18 10:17:49 +0200137 if (!is.null(names(vc)) && length(vc) > 0) {
138 raw_names <- names(vc)
139 if (any(!is.na(raw_names) & raw_names != "")) {
140 label_lookup <- stats::setNames(raw_names, vc)
141 }
Marc Kupietze34a8be2025-10-17 20:13:42 +0200142 }
143
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200144 result <- if (length(node) > 1 || length(vc) > 1) {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200145 grid <- if (expand) {
Marc Kupietzb2862d42025-10-18 10:17:49 +0200146 tmp_grid <- tidyr::expand_grid(node = node, idx = seq_along(vc))
147 tmp_grid$vc <- vc[tmp_grid$idx]
148 tmp_grid$vcLabel <- vcNames[tmp_grid$idx]
149 tmp_grid[, c("node", "vc", "vcLabel"), drop = FALSE]
Marc Kupietze34a8be2025-10-17 20:13:42 +0200150 } else {
151 tibble(node = node, vc = vc, vcLabel = vcNames)
152 }
153
154 multi_result <- purrr::pmap(grid, function(node, vc, vcLabel, ...) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200155 collocationAnalysis(kco,
156 node = node,
157 vc = vc,
158 minOccur = minOccur,
159 leftContextSize = leftContextSize,
160 rightContextSize = rightContextSize,
161 topCollocatesLimit = topCollocatesLimit,
162 searchHitsSampleLimit = searchHitsSampleLimit,
163 ignoreCollocateCase = ignoreCollocateCase,
164 withinSpan = withinSpan,
165 exactFrequencies = exactFrequencies,
166 stopwords = stopwords,
167 addExamples = TRUE,
168 localStopwords = localStopwords,
169 seed = seed,
170 expand = expand,
Marc Kupietz9894a372025-10-18 14:51:29 +0200171 missingScoreQuantile = missingScoreQuantile,
Marc Kupietzde679ea2025-10-19 13:14:51 +0200172 queryMissingScores = queryMissingScores,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200173 collocateFilterRegex = collocateFilterRegex,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200174 vcLabel = vcLabel,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200175 ...
176 )
177 }) |>
Marc Kupietze31322e2025-10-17 18:55:36 +0200178 bind_rows()
179
180 if (!"vc" %in% names(multi_result) || nrow(multi_result) == 0) {
181 multi_result
182 } else {
Marc Kupietzde679ea2025-10-19 13:14:51 +0200183 if (queryMissingScores) {
184 multi_result <- backfill_missing_scores(
185 multi_result,
186 grid = grid,
187 kco = kco,
188 ignoreCollocateCase = ignoreCollocateCase,
189 ...
190 )
191 }
192
Marc Kupietze34a8be2025-10-17 20:13:42 +0200193 if (!"label" %in% names(multi_result)) {
194 multi_result$label <- NA_character_
195 }
196
197 if (!is.null(label_lookup)) {
198 override <- unname(label_lookup[multi_result$vc])
199 missing_idx <- is.na(multi_result$label) | multi_result$label == ""
200 if (any(missing_idx)) {
201 multi_result$label[missing_idx] <- override[missing_idx]
202 }
203 }
204
205 missing_idx <- is.na(multi_result$label) | multi_result$label == ""
206 if (any(missing_idx)) {
207 multi_result$label[missing_idx] <- queryStringToLabel(multi_result$vc[missing_idx])
208 }
209
Marc Kupietze31322e2025-10-17 18:55:36 +0200210 multi_result |>
Marc Kupietz9894a372025-10-18 14:51:29 +0200211 add_multi_vc_comparisons(
Marc Kupietz9894a372025-10-18 14:51:29 +0200212 missingScoreQuantile = missingScoreQuantile
213 )
Marc Kupietze31322e2025-10-17 18:55:36 +0200214 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200215 } else {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200216 if ((is.na(vcLabel) || vcLabel == "") && length(vcNames) >= 1) {
217 vcLabel <- vcNames[1]
218 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200219
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200220 set.seed(seed)
221 candidates <- collocatesQuery(
222 kco,
223 node,
224 vc = vc,
225 minOccur = minOccur,
226 leftContextSize = leftContextSize,
227 rightContextSize = rightContextSize,
228 searchHitsSampleLimit = searchHitsSampleLimit,
229 ignoreCollocateCase = ignoreCollocateCase,
230 stopwords = append(stopwords, localStopwords),
Marc Kupietzb2862d42025-10-18 10:17:49 +0200231 collocateFilterRegex = collocateFilterRegex,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200232 ...
233 )
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200234
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200235 if (nrow(candidates) > 0) {
236 candidates <- candidates |>
237 filter(frequency >= minOccur) |>
238 slice_head(n = topCollocatesLimit)
239 collocationScoreQuery(
240 kco,
241 node = node,
242 collocate = candidates$word,
243 vc = vc,
244 leftContextSize = leftContextSize,
245 rightContextSize = rightContextSize,
246 observed = if (exactFrequencies) NA else candidates$frequency,
247 ignoreCollocateCase = ignoreCollocateCase,
248 withinSpan = withinSpan,
249 ...
250 ) |>
251 filter(O >= minOccur) |>
252 dplyr::arrange(dplyr::desc(logDice))
253 } else {
254 tibble()
255 }
256 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200257
258 if (!is.na(vcLabel) && vcLabel != "" && "label" %in% names(result)) {
259 result$label <- rep(vcLabel, nrow(result))
260 }
261
262 threshold_col <- thresholdScore
263 if (maxRecurse > 0 && nrow(result) > 0 && threshold_col %in% names(result)) {
264 threshold_values <- result[[threshold_col]]
265 eligible_idx <- which(!is.na(threshold_values) & threshold_values >= threshold)
266 if (length(eligible_idx) > 0) {
267 recurseWith <- result[eligible_idx, , drop = FALSE]
268 result <- collocationAnalysis(
269 kco,
270 node = paste0("(", buildCollocationQuery(
271 removeWithinSpan(recurseWith$node, withinSpan),
272 recurseWith$collocate,
273 leftContextSize = leftContextSize,
274 rightContextSize = rightContextSize,
275 withinSpan = ""
276 ), ")"),
277 vc = vc,
278 minOccur = minOccur,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200279 leftContextSize = leftContextSize,
280 rightContextSize = rightContextSize,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200281 withinSpan = withinSpan,
282 maxRecurse = maxRecurse - 1,
283 stopwords = stopwords,
284 localStopwords = recurseWith$collocate,
285 exactFrequencies = exactFrequencies,
286 searchHitsSampleLimit = searchHitsSampleLimit,
287 topCollocatesLimit = topCollocatesLimit,
288 addExamples = FALSE,
Marc Kupietz9894a372025-10-18 14:51:29 +0200289 missingScoreQuantile = missingScoreQuantile,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200290 collocateFilterRegex = collocateFilterRegex,
Marc Kupietzde679ea2025-10-19 13:14:51 +0200291 queryMissingScores = queryMissingScores,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200292 vcLabel = vcLabel
293 ) |>
294 bind_rows(result) |>
295 filter(logDice >= 2) |>
296 filter(O >= minOccur) |>
297 dplyr::arrange(dplyr::desc(logDice))
298 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200299 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200300
301 if (addExamples && nrow(result) > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200302 result$query <- buildCollocationQuery(
303 result$node,
304 result$collocate,
305 leftContextSize = leftContextSize,
306 rightContextSize = rightContextSize,
307 withinSpan = withinSpan
308 )
309 result$example <- findExample(
310 kco,
311 query = result$query,
312 vc = result$vc
313 )
314 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200315
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200316 result
317 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200318)
319
Marc Kupietz76b05592021-12-19 16:26:15 +0100320# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100321removeWithinSpan <- function(query, withinSpan) {
322 if (withinSpan == "") {
323 return(query)
324 }
325 needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200326 res <- gsub(needle, "\\1", query)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100327 needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200328 res <- gsub(needle, "\\1", res)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100329 return(res)
330}
331
Marc Kupietzde679ea2025-10-19 13:14:51 +0200332backfill_missing_scores <- function(result,
333 grid,
334 kco,
335 ignoreCollocateCase,
336 ...) {
337 if (!"vc" %in% names(result) || !"node" %in% names(result) || !"collocate" %in% names(result)) {
338 return(result)
339 }
340
341 if (nrow(result) == 0) {
342 return(result)
343 }
344
345 distinct_pairs <- dplyr::distinct(result, node, collocate)
346 if (nrow(distinct_pairs) == 0) {
347 return(result)
348 }
349
350 collocates_by_node <- split(as.character(distinct_pairs$collocate), distinct_pairs$node)
351 if (length(collocates_by_node) == 0) {
352 return(result)
353 }
354
355 required_combinations <- unique(as.data.frame(grid[, c("node", "vc", "vcLabel")], drop = FALSE))
356 for (i in seq_len(nrow(required_combinations))) {
357 node_value <- required_combinations$node[i]
358 vc_value <- required_combinations$vc[i]
359
360 collocate_pool <- collocates_by_node[[node_value]]
361 if (is.null(collocate_pool) || length(collocate_pool) == 0) {
362 next
363 }
364
365 existing_idx <- result$node == node_value & result$vc == vc_value
366 existing_collocates <- unique(as.character(result$collocate[existing_idx]))
367 missing_collocates <- setdiff(unique(collocate_pool), existing_collocates)
368 missing_collocates <- missing_collocates[!is.na(missing_collocates) & nzchar(missing_collocates)]
369
370 if (length(missing_collocates) == 0) {
371 next
372 }
373
374 context_rows <- result[result$node == node_value & result$vc == vc_value, , drop = FALSE]
375 if (nrow(context_rows) == 0) {
376 context_rows <- result[result$node == node_value, , drop = FALSE]
377 }
378
379 left_size <- context_rows$leftContextSize[!is.na(context_rows$leftContextSize)][1]
380 if (is.na(left_size) || length(left_size) == 0) {
381 left_size <- result$leftContextSize[!is.na(result$leftContextSize)][1]
382 }
383 if (is.na(left_size) || length(left_size) == 0) {
384 left_size <- 5
385 }
386
387 right_size <- context_rows$rightContextSize[!is.na(context_rows$rightContextSize)][1]
388 if (is.na(right_size) || length(right_size) == 0) {
389 right_size <- result$rightContextSize[!is.na(result$rightContextSize)][1]
390 }
391 if (is.na(right_size) || length(right_size) == 0) {
392 right_size <- 5
393 }
394
395 within_span_value <- ""
396 if ("query" %in% names(context_rows)) {
397 query_candidate <- context_rows$query[!is.na(context_rows$query) & nzchar(context_rows$query)][1]
398 if (!is.na(query_candidate) && nzchar(query_candidate)) {
399 match_one <- regexec("^\\(*contains\\(<([^>]+)>,", query_candidate)
400 matches <- regmatches(query_candidate, match_one)
401 if (length(matches) >= 1 && length(matches[[1]]) >= 2) {
402 within_span_value <- matches[[1]][2]
403 }
404 }
405 }
406
407 new_rows <- collocationScoreQuery(
408 kco,
409 node = node_value,
410 collocate = missing_collocates,
411 vc = vc_value,
412 leftContextSize = left_size,
413 rightContextSize = right_size,
414 ignoreCollocateCase = ignoreCollocateCase,
415 withinSpan = within_span_value,
416 ...
417 )
418
419 if (nrow(new_rows) == 0) {
420 next
421 }
422
423 if (!is.null(required_combinations$vcLabel[i]) && !is.na(required_combinations$vcLabel[i]) && required_combinations$vcLabel[i] != "" && "label" %in% names(new_rows)) {
424 new_rows$label <- required_combinations$vcLabel[i]
425 }
426
427 result <- dplyr::bind_rows(result, new_rows)
428 }
429
430 result
431}
432
Marc Kupietz77852b22025-10-19 11:35:34 +0200433add_multi_vc_comparisons <- function(result, missingScoreQuantile = 0.05) {
Marc Kupietz28a29842025-10-18 12:25:09 +0200434 label <- node <- collocate <- NULL
Marc Kupietzc4540a22025-10-14 17:39:53 +0200435
436 if (!"label" %in% names(result) || dplyr::n_distinct(result$label) < 2) {
437 return(result)
438 }
439
440 numeric_cols <- names(result)[vapply(result, is.numeric, logical(1))]
441 non_score_cols <- c("N", "O", "O1", "O2", "E", "w", "leftContextSize", "rightContextSize", "frequency")
442 score_cols <- setdiff(numeric_cols, non_score_cols)
443
444 if (length(score_cols) == 0) {
445 return(result)
446 }
447
Marc Kupietz9894a372025-10-18 14:51:29 +0200448 compute_score_floor <- function(values) {
Marc Kupietz4cbb5472025-10-19 12:15:25 +0200449 # Estimate a conservative floor so missing scores can be imputed without favoring any label
Marc Kupietz9894a372025-10-18 14:51:29 +0200450 finite_values <- values[is.finite(values)]
451 if (length(finite_values) == 0) {
452 return(0)
453 }
454
455 prob <- min(max(missingScoreQuantile, 0), 0.5)
Marc Kupietz4cbb5472025-10-19 12:15:25 +0200456 # Use a lower quantile as the anchor to stay near the weakest attested scores
Marc Kupietz9894a372025-10-18 14:51:29 +0200457 q_val <- suppressWarnings(stats::quantile(finite_values,
458 probs = prob,
459 names = FALSE,
460 type = 7
461 ))
462
463 if (!is.finite(q_val)) {
464 q_val <- suppressWarnings(min(finite_values, na.rm = TRUE))
465 }
466
467 min_val <- suppressWarnings(min(finite_values, na.rm = TRUE))
468 if (!is.finite(min_val)) {
469 min_val <- 0
470 }
471
472 spread_candidates <- c(
473 suppressWarnings(stats::IQR(finite_values, na.rm = TRUE, type = 7)),
474 stats::sd(finite_values, na.rm = TRUE),
475 abs(q_val) * 0.1,
476 abs(min_val - q_val)
477 )
478 spread_candidates <- spread_candidates[is.finite(spread_candidates)]
479
480 spread <- 0
481 if (length(spread_candidates) > 0) {
482 spread <- max(spread_candidates)
483 }
484 if (!is.finite(spread) || spread == 0) {
485 spread <- max(abs(q_val), abs(min_val), 1e-06)
486 }
487
Marc Kupietz4cbb5472025-10-19 12:15:25 +0200488 # Step away from the anchor by a robust spread estimate to avoid ties with real scores
Marc Kupietz9894a372025-10-18 14:51:29 +0200489 candidate <- q_val - spread
490 if (!is.finite(candidate)) {
491 candidate <- min_val
492 }
493
494 floor_value <- suppressWarnings(min(c(candidate, min_val), na.rm = TRUE))
495 if (!is.finite(floor_value)) {
496 floor_value <- min_val
497 }
498 if (!is.finite(floor_value)) {
499 floor_value <- 0
500 }
501
502 floor_value
503 }
504
505 score_replacements <- stats::setNames(
506 vapply(score_cols, function(col) {
507 compute_score_floor(result[[col]])
508 }, numeric(1)),
509 score_cols
510 )
511
Marc Kupietzc4540a22025-10-14 17:39:53 +0200512 comparison <- result |>
Marc Kupietz28a29842025-10-18 12:25:09 +0200513 dplyr::select(node, collocate, label, dplyr::all_of(score_cols)) |>
514 tidyr::pivot_wider(
Marc Kupietzc4540a22025-10-14 17:39:53 +0200515 names_from = label,
Marc Kupietz28a29842025-10-18 12:25:09 +0200516 values_from = dplyr::all_of(score_cols),
Marc Kupietzc4540a22025-10-14 17:39:53 +0200517 names_glue = "{.value}_{make.names(label)}",
518 values_fn = dplyr::first
519 )
520
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200521 raw_labels <- unique(result$label)
522 labels <- make.names(raw_labels)
523 label_map <- stats::setNames(raw_labels, labels)
Marc Kupietzc4540a22025-10-14 17:39:53 +0200524
Marc Kupietz28a29842025-10-18 12:25:09 +0200525 rank_data <- result |>
526 dplyr::distinct(node, collocate)
527
528 for (i in seq_along(raw_labels)) {
529 raw_lab <- raw_labels[i]
530 safe_lab <- labels[i]
531 label_df <- result[result$label == raw_lab, c("node", "collocate", score_cols), drop = FALSE]
532 if (nrow(label_df) == 0) {
533 next
534 }
535 label_df <- dplyr::distinct(label_df)
536 rank_tbl <- label_df[, c("node", "collocate"), drop = FALSE]
537 for (col in score_cols) {
538 rank_col_name <- paste0("rank_", safe_lab, "_", col)
Marc Kupietz130a2a22025-10-18 16:09:23 +0200539 percentile_col_name <- paste0("percentile_rank_", safe_lab, "_", col)
Marc Kupietz28a29842025-10-18 12:25:09 +0200540 values <- label_df[[col]]
541 ranks <- rep(NA_real_, length(values))
Marc Kupietz130a2a22025-10-18 16:09:23 +0200542 percentiles <- rep(NA_real_, length(values))
Marc Kupietz28a29842025-10-18 12:25:09 +0200543 valid_idx <- which(!is.na(values))
544 if (length(valid_idx) > 0) {
545 ranks[valid_idx] <- rank(-values[valid_idx], ties.method = "first")
Marc Kupietz130a2a22025-10-18 16:09:23 +0200546 total <- length(valid_idx)
547 percentiles[valid_idx] <- 1 - (ranks[valid_idx] - 1) / total
Marc Kupietz28a29842025-10-18 12:25:09 +0200548 }
549 rank_tbl[[rank_col_name]] <- ranks
Marc Kupietz130a2a22025-10-18 16:09:23 +0200550 rank_tbl[[percentile_col_name]] <- percentiles
Marc Kupietz28a29842025-10-18 12:25:09 +0200551 }
552 rank_data <- dplyr::left_join(rank_data, rank_tbl, by = c("node", "collocate"))
553 }
554
555 comparison <- dplyr::left_join(comparison, rank_data, by = c("node", "collocate"))
556
557 rank_replacements <- numeric(0)
558 rank_column_names <- grep("^rank_", names(comparison), value = TRUE)
559 if (length(rank_column_names) > 0) {
560 rank_replacements <- stats::setNames(
561 vapply(rank_column_names, function(col) {
562 col_values <- comparison[[col]]
563 valid_values <- col_values[!is.na(col_values)]
564 if (length(valid_values) == 0) {
565 nrow(comparison) + 1
566 } else {
567 suppressWarnings(max(valid_values, na.rm = TRUE)) + 1
568 }
569 }, numeric(1)),
570 rank_column_names
571 )
572 }
573
Marc Kupietz130a2a22025-10-18 16:09:23 +0200574 percentile_replacements <- numeric(0)
575 percentile_column_names <- grep("^percentile_rank_", names(comparison), value = TRUE)
576 if (length(percentile_column_names) > 0) {
577 percentile_replacements <- stats::setNames(
578 rep(0, length(percentile_column_names)),
579 percentile_column_names
580 )
581 }
582
Marc Kupietz28a29842025-10-18 12:25:09 +0200583 collapse_label_values <- function(indices, safe_labels_vec) {
584 if (length(indices) == 0) {
585 return(NA_character_)
586 }
587 labs <- label_map[safe_labels_vec[indices]]
588 fallback <- safe_labels_vec[indices]
589 labs[is.na(labs) | labs == ""] <- fallback[is.na(labs) | labs == ""]
590 labs <- labs[!is.na(labs) & labs != ""]
591 if (length(labs) == 0) {
592 return(NA_character_)
593 }
594 paste(unique(labs), collapse = ", ")
595 }
596
Marc Kupietzc4540a22025-10-14 17:39:53 +0200597 if (length(labels) == 2) {
Marc Kupietz9894a372025-10-18 14:51:29 +0200598 fill_scores <- function(x, y, measure_col) {
599 replacement <- score_replacements[[measure_col]]
600 fallback_min <- suppressWarnings(min(c(x, y), na.rm = TRUE))
601 if (!is.finite(fallback_min)) {
602 fallback_min <- 0
Marc Kupietzc4540a22025-10-14 17:39:53 +0200603 }
Marc Kupietz9894a372025-10-18 14:51:29 +0200604 if (!is.null(replacement) && is.finite(replacement)) {
605 replacement <- min(replacement, fallback_min)
606 } else {
607 replacement <- fallback_min
608 }
609 if (!is.finite(replacement)) {
610 replacement <- 0
611 }
612 if (any(is.na(x))) {
613 x[is.na(x)] <- replacement
614 }
615 if (any(is.na(y))) {
616 y[is.na(y)] <- replacement
617 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200618 list(x = x, y = y)
619 }
620
Marc Kupietz130a2a22025-10-18 16:09:23 +0200621 fill_percentiles <- function(x, y, left_pct_col, right_pct_col) {
622 replacement_left <- percentile_replacements[[left_pct_col]]
623 if (is.null(replacement_left) || !is.finite(replacement_left)) {
624 replacement_left <- 0
625 }
626 replacement_right <- percentile_replacements[[right_pct_col]]
627 if (is.null(replacement_right) || !is.finite(replacement_right)) {
628 replacement_right <- 0
629 }
630 if (any(is.na(x))) {
631 x[is.na(x)] <- replacement_left
632 }
633 if (any(is.na(y))) {
634 y[is.na(y)] <- replacement_right
635 }
636 list(x = x, y = y)
637 }
638
Marc Kupietz28a29842025-10-18 12:25:09 +0200639 fill_ranks <- function(x, y, left_rank_col, right_rank_col) {
640 fallback <- nrow(comparison) + 1
641 replacement_left <- rank_replacements[[left_rank_col]]
642 if (is.null(replacement_left) || !is.finite(replacement_left)) {
643 replacement_left <- fallback
Marc Kupietzc4540a22025-10-14 17:39:53 +0200644 }
Marc Kupietz28a29842025-10-18 12:25:09 +0200645 replacement_right <- rank_replacements[[right_rank_col]]
646 if (is.null(replacement_right) || !is.finite(replacement_right)) {
647 replacement_right <- fallback
648 }
649 if (any(is.na(x))) {
650 x[is.na(x)] <- replacement_left
651 }
652 if (any(is.na(y))) {
653 y[is.na(y)] <- replacement_right
654 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200655 list(x = x, y = y)
656 }
657
658 left_label <- labels[1]
659 right_label <- labels[2]
660
661 for (col in score_cols) {
662 left_col <- paste0(col, "_", left_label)
663 right_col <- paste0(col, "_", right_label)
664 if (!all(c(left_col, right_col) %in% names(comparison))) {
665 next
666 }
Marc Kupietz9894a372025-10-18 14:51:29 +0200667 filled <- fill_scores(comparison[[left_col]], comparison[[right_col]], col)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200668 comparison[[left_col]] <- filled$x
669 comparison[[right_col]] <- filled$y
Marc Kupietzc4540a22025-10-14 17:39:53 +0200670 comparison[[paste0("delta_", col)]] <- filled$x - filled$y
Marc Kupietz28a29842025-10-18 12:25:09 +0200671 rank_left <- paste0("rank_", left_label, "_", col)
672 rank_right <- paste0("rank_", right_label, "_", col)
673 if (all(c(rank_left, rank_right) %in% names(comparison))) {
674 filled_rank <- fill_ranks(
675 comparison[[rank_left]],
676 comparison[[rank_right]],
677 rank_left,
678 rank_right
679 )
680 comparison[[paste0("delta_rank_", col)]] <- filled_rank$x - filled_rank$y
681 }
Marc Kupietz130a2a22025-10-18 16:09:23 +0200682 pct_left <- paste0("percentile_rank_", left_label, "_", col)
683 pct_right <- paste0("percentile_rank_", right_label, "_", col)
684 if (all(c(pct_left, pct_right) %in% names(comparison))) {
685 filled_pct <- fill_percentiles(
686 comparison[[pct_left]],
687 comparison[[pct_right]],
688 pct_left,
689 pct_right
690 )
691 comparison[[paste0("delta_percentile_rank_", col)]] <- filled_pct$x - filled_pct$y
692 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200693 }
694 }
695
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200696 for (col in score_cols) {
697 value_cols <- paste0(col, "_", labels)
698 existing <- value_cols %in% names(comparison)
699 if (!any(existing)) {
700 next
701 }
702 value_cols <- value_cols[existing]
703 safe_labels <- labels[existing]
704
705 score_values <- comparison[, value_cols, drop = FALSE]
706
707 winner_label_col <- paste0("winner_", col)
708 winner_value_col <- paste0("winner_", col, "_value")
709 runner_label_col <- paste0("runner_up_", col)
710 runner_value_col <- paste0("runner_up_", col, "_value")
Marc Kupietzb2862d42025-10-18 10:17:49 +0200711 loser_label_col <- paste0("loser_", col)
712 loser_value_col <- paste0("loser_", col, "_value")
713 max_delta_col <- paste0("max_delta_", col)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200714
715 if (nrow(score_values) == 0) {
716 comparison[[winner_label_col]] <- character(0)
717 comparison[[winner_value_col]] <- numeric(0)
718 comparison[[runner_label_col]] <- character(0)
719 comparison[[runner_value_col]] <- numeric(0)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200720 comparison[[loser_label_col]] <- character(0)
721 comparison[[loser_value_col]] <- numeric(0)
722 comparison[[max_delta_col]] <- numeric(0)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200723 next
724 }
725
726 score_matrix <- as.matrix(score_values)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200727 storage.mode(score_matrix) <- "numeric"
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200728
Marc Kupietzb2862d42025-10-18 10:17:49 +0200729 n_rows <- nrow(score_matrix)
730 winner_labels <- rep(NA_character_, n_rows)
731 winner_values <- rep(NA_real_, n_rows)
732 runner_labels <- rep(NA_character_, n_rows)
733 runner_values <- rep(NA_real_, n_rows)
734 loser_labels <- rep(NA_character_, n_rows)
735 loser_values <- rep(NA_real_, n_rows)
736 max_deltas <- rep(NA_real_, n_rows)
737
Marc Kupietzb2862d42025-10-18 10:17:49 +0200738 if (n_rows > 0) {
739 for (i in seq_len(n_rows)) {
740 numeric_row <- as.numeric(score_matrix[i, ])
741 if (all(is.na(numeric_row))) {
742 next
743 }
744
Marc Kupietz9894a372025-10-18 14:51:29 +0200745 replacement <- score_replacements[[col]]
746 fallback_min <- suppressWarnings(min(numeric_row, na.rm = TRUE))
747 if (!is.finite(fallback_min)) {
748 fallback_min <- 0
Marc Kupietzb2862d42025-10-18 10:17:49 +0200749 }
Marc Kupietz9894a372025-10-18 14:51:29 +0200750 if (!is.null(replacement) && is.finite(replacement)) {
751 replacement <- min(replacement, fallback_min)
752 } else {
753 replacement <- fallback_min
754 }
755 if (!is.finite(replacement)) {
756 replacement <- 0
757 }
758 if (any(is.na(numeric_row))) {
759 numeric_row[is.na(numeric_row)] <- replacement
760 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200761 score_matrix[i, ] <- numeric_row
762
763 max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
764 max_idx <- which(numeric_row == max_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200765 winner_labels[i] <- collapse_label_values(max_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200766 winner_values[i] <- max_val
767
768 unique_vals <- sort(unique(numeric_row), decreasing = TRUE)
769 if (length(unique_vals) >= 2) {
770 runner_val <- unique_vals[2]
771 runner_idx <- which(numeric_row == runner_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200772 runner_labels[i] <- collapse_label_values(runner_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200773 runner_values[i] <- runner_val
774 }
775
776 min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
777 min_idx <- which(numeric_row == min_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200778 loser_labels[i] <- collapse_label_values(min_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200779 loser_values[i] <- min_val
780
781 if (is.finite(max_val) && is.finite(min_val)) {
782 max_deltas[i] <- max_val - min_val
783 }
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200784 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200785 }
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200786
Marc Kupietzb2862d42025-10-18 10:17:49 +0200787 comparison[, value_cols] <- score_matrix
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200788 comparison[[winner_label_col]] <- winner_labels
789 comparison[[winner_value_col]] <- winner_values
790 comparison[[runner_label_col]] <- runner_labels
791 comparison[[runner_value_col]] <- runner_values
Marc Kupietzb2862d42025-10-18 10:17:49 +0200792 comparison[[loser_label_col]] <- loser_labels
793 comparison[[loser_value_col]] <- loser_values
794 comparison[[max_delta_col]] <- max_deltas
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200795 }
796
Marc Kupietz28a29842025-10-18 12:25:09 +0200797 for (col in score_cols) {
798 rank_cols <- paste0("rank_", labels, "_", col)
799 existing <- rank_cols %in% names(comparison)
800 if (!any(existing)) {
801 next
802 }
803 rank_cols <- rank_cols[existing]
804 safe_labels <- labels[existing]
805 rank_values <- comparison[, rank_cols, drop = FALSE]
806
807 winner_rank_label_col <- paste0("winner_rank_", col)
808 winner_rank_value_col <- paste0("winner_rank_", col, "_value")
809 runner_rank_label_col <- paste0("runner_up_rank_", col)
810 runner_rank_value_col <- paste0("runner_up_rank_", col, "_value")
811 loser_rank_label_col <- paste0("loser_rank_", col)
812 loser_rank_value_col <- paste0("loser_rank_", col, "_value")
813 max_delta_rank_col <- paste0("max_delta_rank_", col)
814
815 if (nrow(rank_values) == 0) {
816 comparison[[winner_rank_label_col]] <- character(0)
817 comparison[[winner_rank_value_col]] <- numeric(0)
818 comparison[[runner_rank_label_col]] <- character(0)
819 comparison[[runner_rank_value_col]] <- numeric(0)
820 comparison[[loser_rank_label_col]] <- character(0)
821 comparison[[loser_rank_value_col]] <- numeric(0)
822 comparison[[max_delta_rank_col]] <- numeric(0)
823 next
824 }
825
826 rank_matrix <- as.matrix(rank_values)
827 storage.mode(rank_matrix) <- "numeric"
828
829 n_rows <- nrow(rank_matrix)
830 winner_labels <- rep(NA_character_, n_rows)
831 winner_values <- rep(NA_real_, n_rows)
832 runner_labels <- rep(NA_character_, n_rows)
833 runner_values <- rep(NA_real_, n_rows)
834 loser_labels <- rep(NA_character_, n_rows)
835 loser_values <- rep(NA_real_, n_rows)
836 max_deltas <- rep(NA_real_, n_rows)
837
838 for (i in seq_len(n_rows)) {
839 numeric_row <- as.numeric(rank_matrix[i, ])
840 if (all(is.na(numeric_row))) {
841 next
842 }
843
844 if (length(rank_cols) > 0) {
845 replacement_vec <- rank_replacements[rank_cols]
846 replacement_vec[is.na(replacement_vec)] <- nrow(comparison) + 1
847 missing_idx <- which(is.na(numeric_row))
848 if (length(missing_idx) > 0) {
849 numeric_row[missing_idx] <- replacement_vec[missing_idx]
850 }
851 }
852
853 valid_idx <- seq_along(numeric_row)
854 valid_values <- numeric_row[valid_idx]
855 min_val <- suppressWarnings(min(valid_values, na.rm = TRUE))
856 min_positions <- valid_idx[which(valid_values == min_val)]
857 winner_labels[i] <- collapse_label_values(min_positions, safe_labels)
858 winner_values[i] <- min_val
859
860 ordered_vals <- sort(unique(valid_values), decreasing = FALSE)
861 if (length(ordered_vals) >= 2) {
862 runner_val <- ordered_vals[2]
863 runner_positions <- valid_idx[which(valid_values == runner_val)]
864 runner_labels[i] <- collapse_label_values(runner_positions, safe_labels)
865 runner_values[i] <- runner_val
866 }
867
868 max_val <- suppressWarnings(max(valid_values, na.rm = TRUE))
869 max_positions <- valid_idx[which(valid_values == max_val)]
870 loser_labels[i] <- collapse_label_values(max_positions, safe_labels)
871 loser_values[i] <- max_val
872
873 if (is.finite(max_val) && is.finite(min_val)) {
874 max_deltas[i] <- max_val - min_val
875 }
876 }
877
878 comparison[[winner_rank_label_col]] <- winner_labels
879 comparison[[winner_rank_value_col]] <- winner_values
880 comparison[[runner_rank_label_col]] <- runner_labels
881 comparison[[runner_rank_value_col]] <- runner_values
882 comparison[[loser_rank_label_col]] <- loser_labels
883 comparison[[loser_rank_value_col]] <- loser_values
884 comparison[[max_delta_rank_col]] <- max_deltas
885 }
886
Marc Kupietz130a2a22025-10-18 16:09:23 +0200887 for (col in score_cols) {
888 pct_cols <- paste0("percentile_rank_", labels, "_", col)
889 existing <- pct_cols %in% names(comparison)
890 if (!any(existing)) {
891 next
892 }
893 pct_cols <- pct_cols[existing]
894 safe_labels <- labels[existing]
895 pct_values <- comparison[, pct_cols, drop = FALSE]
896
897 winner_pct_label_col <- paste0("winner_percentile_rank_", col)
898 winner_pct_value_col <- paste0("winner_percentile_rank_", col, "_value")
899 runner_pct_label_col <- paste0("runner_up_percentile_rank_", col)
900 runner_pct_value_col <- paste0("runner_up_percentile_rank_", col, "_value")
901 loser_pct_label_col <- paste0("loser_percentile_rank_", col)
902 loser_pct_value_col <- paste0("loser_percentile_rank_", col, "_value")
903 max_delta_pct_col <- paste0("max_delta_percentile_rank_", col)
904
905 if (nrow(pct_values) == 0) {
906 comparison[[winner_pct_label_col]] <- character(0)
907 comparison[[winner_pct_value_col]] <- numeric(0)
908 comparison[[runner_pct_label_col]] <- character(0)
909 comparison[[runner_pct_value_col]] <- numeric(0)
910 comparison[[loser_pct_label_col]] <- character(0)
911 comparison[[loser_pct_value_col]] <- numeric(0)
912 comparison[[max_delta_pct_col]] <- numeric(0)
913 next
914 }
915
916 pct_matrix <- as.matrix(pct_values)
917 storage.mode(pct_matrix) <- "numeric"
918
919 n_rows <- nrow(pct_matrix)
920 winner_labels <- rep(NA_character_, n_rows)
921 winner_values <- rep(NA_real_, n_rows)
922 runner_labels <- rep(NA_character_, n_rows)
923 runner_values <- rep(NA_real_, n_rows)
924 loser_labels <- rep(NA_character_, n_rows)
925 loser_values <- rep(NA_real_, n_rows)
926 max_deltas <- rep(NA_real_, n_rows)
927
928 if (n_rows > 0) {
929 for (i in seq_len(n_rows)) {
930 numeric_row <- as.numeric(pct_matrix[i, ])
931 if (all(is.na(numeric_row))) {
932 next
933 }
934
935 if (any(is.na(numeric_row))) {
936 numeric_row[is.na(numeric_row)] <- 0
937 }
938 pct_matrix[i, ] <- numeric_row
939
940 max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
941 max_idx <- which(numeric_row == max_val)
942 winner_labels[i] <- collapse_label_values(max_idx, safe_labels)
943 winner_values[i] <- max_val
944
945 unique_vals <- sort(unique(numeric_row), decreasing = TRUE)
946 if (length(unique_vals) >= 2) {
947 runner_val <- unique_vals[2]
948 runner_idx <- which(numeric_row == runner_val)
949 runner_labels[i] <- collapse_label_values(runner_idx, safe_labels)
950 runner_values[i] <- runner_val
951 }
952
953 min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
954 min_idx <- which(numeric_row == min_val)
955 loser_labels[i] <- collapse_label_values(min_idx, safe_labels)
956 loser_values[i] <- min_val
957
958 if (is.finite(max_val) && is.finite(min_val)) {
959 max_deltas[i] <- max_val - min_val
960 }
961 }
962 }
963
964 comparison[, pct_cols] <- pct_matrix
965 comparison[[winner_pct_label_col]] <- winner_labels
966 comparison[[winner_pct_value_col]] <- winner_values
967 comparison[[runner_pct_label_col]] <- runner_labels
968 comparison[[runner_pct_value_col]] <- runner_values
969 comparison[[loser_pct_label_col]] <- loser_labels
970 comparison[[loser_pct_value_col]] <- loser_values
971 comparison[[max_delta_pct_col]] <- max_deltas
972 }
973
Marc Kupietzc4540a22025-10-14 17:39:53 +0200974 dplyr::left_join(result, comparison, by = c("node", "collocate"))
975}
976
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200977#' @importFrom magrittr debug_pipe
Marc Kupietz2b17b212023-08-27 17:47:26 +0200978#' @importFrom stringr str_detect
979#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
980#'
981matches2FreqTable <- function(matches,
982 index = 0,
983 minOccur = 5,
984 leftContextSize = 5,
985 rightContextSize = 5,
986 ignoreCollocateCase = FALSE,
987 stopwords = c(),
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200988 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietz2b17b212023-08-27 17:47:26 +0200989 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
990 verbose = TRUE) {
991 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
992 frequency <- NULL
993
994 if (nrow(matches) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200995 dplyr::tibble(word = c(), frequency = c())
Marc Kupietz2b17b212023-08-27 17:47:26 +0200996 } else if (index == 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200997 if (!"tokens" %in% colnames(matches) || !is.list(matches$tokens)) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200998 log_info(verbose, "Outdated KorAP server: Falling back to client side tokenization.\n")
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200999 return(snippet2FreqTable(matches$snippet, minOccur, leftContextSize, rightContextSize,
1000 ignoreCollocateCase = ignoreCollocateCase,
1001 stopwords = stopwords, oldTable = oldTable, verbose = verbose
1002 ))
Marc Kupietz2b17b212023-08-27 17:47:26 +02001003 }
1004 log_info(verbose, paste("Joining", nrow(matches), "kwics\n"))
Marc Kupietza25fbd92025-10-14 17:38:09 +02001005 for (i in seq_len(nrow(matches))) {
Marc Kupietz2b17b212023-08-27 17:47:26 +02001006 oldTable <- matches2FreqTable(
1007 matches,
1008 i,
1009 leftContextSize = leftContextSize,
1010 rightContextSize = rightContextSize,
1011 collocateFilterRegex = collocateFilterRegex,
1012 oldTable = oldTable,
1013 stopwords = stopwords
1014 )
1015 }
1016 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001017 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +01001018 group_by(word) |>
1019 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001020 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +02001021 arrange(desc(frequency))
1022 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001023 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietz2b17b212023-08-27 17:47:26 +02001024
1025 left <- tail(unlist(matches$tokens$left[index]), leftContextSize)
1026
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001027 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +02001028
1029 right <- head(unlist(matches$tokens$right[index]), rightContextSize)
1030
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001031 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +02001032
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001033 if (length(left) + length(right) == 0) {
Marc Kupietz2b17b212023-08-27 17:47:26 +02001034 oldTable
1035 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +01001036 table(c(left, right)) |>
1037 dplyr::as_tibble(.name_repair = "minimal") |>
1038 dplyr::rename(word = 1, frequency = 2) |>
1039 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001040 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +02001041 dplyr::bind_rows(oldTable)
1042 }
1043 }
1044}
1045
1046#' @importFrom magrittr debug_pipe
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001047#' @importFrom stringr str_match str_split str_detect
1048#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
1049#'
1050snippet2FreqTable <- function(snippet,
1051 minOccur = 5,
1052 leftContextSize = 5,
1053 rightContextSize = 5,
1054 ignoreCollocateCase = FALSE,
1055 stopwords = c(),
1056 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001057 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001058 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
1059 verbose = TRUE) {
1060 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
1061 frequency <- NULL
1062
1063 if (length(snippet) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001064 dplyr::tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001065 } else if (length(snippet) > 1) {
Marc Kupietza47d1502023-04-18 15:26:47 +02001066 log_info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001067 for (s in snippet) {
1068 oldTable <- snippet2FreqTable(
1069 s,
1070 leftContextSize = leftContextSize,
1071 rightContextSize = rightContextSize,
Marc Kupietz47d0d2b2021-12-19 16:38:52 +01001072 collocateFilterRegex = collocateFilterRegex,
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001073 oldTable = oldTable,
1074 stopwords = stopwords
1075 )
1076 }
Marc Kupietza47d1502023-04-18 15:26:47 +02001077 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001078 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +01001079 group_by(word) |>
1080 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001081 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001082 arrange(desc(frequency))
1083 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001084 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001085 match <-
1086 str_match(
1087 snippet,
1088 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
1089 )
1090
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001091 left <- if (leftContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001092 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001093 } else {
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001094 ""
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001095 }
1096 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001097
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001098 right <- if (rightContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001099 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001100 } else {
1101 ""
1102 }
1103 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001104
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001105 if (is.na(left[1]) || is.na(right[1]) || length(left) + length(right) == 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001106 oldTable
1107 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +01001108 table(c(left, right)) |>
1109 dplyr::as_tibble(.name_repair = "minimal") |>
1110 dplyr::rename(word = 1, frequency = 2) |>
1111 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001112 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001113 dplyr::bind_rows(oldTable)
1114 }
1115 }
1116}
1117
1118#' Preliminary synsemantic stopwords function
1119#'
1120#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +02001121#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001122#'
1123#' Preliminary synsemantic stopwords function to be used in collocation analysis.
1124#'
1125#' @details
1126#' Currently only suitable for German. See stopwords package for other languages.
1127#'
1128#' @param ... future arguments for language detection
1129#'
1130#' @family collocation analysis functions
1131#' @return Vector of synsemantic stopwords.
1132#' @export
1133synsemanticStopwords <- function(...) {
Marc Kupietzc79155b2025-10-19 13:42:55 +02001134 base <- c(
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001135 "der",
1136 "die",
1137 "und",
1138 "in",
1139 "den",
1140 "von",
1141 "mit",
1142 "das",
1143 "zu",
1144 "im",
1145 "ist",
1146 "auf",
1147 "sich",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001148 "des",
1149 "dem",
1150 "nicht",
1151 "ein",
1152 "eine",
1153 "es",
1154 "auch",
1155 "an",
1156 "als",
1157 "am",
1158 "aus",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001159 "bei",
1160 "er",
1161 "dass",
1162 "sie",
1163 "nach",
1164 "um",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001165 "zum",
1166 "noch",
1167 "war",
1168 "einen",
1169 "einer",
1170 "wie",
1171 "einem",
1172 "vor",
1173 "bis",
1174 "\u00fcber",
1175 "so",
1176 "aber",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001177 "diese",
Marc Kupietzc79155b2025-10-19 13:42:55 +02001178 "oder"
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001179 )
Marc Kupietzc79155b2025-10-19 13:42:55 +02001180
1181 lower <- unique(tolower(base))
1182 capitalized <- paste0(toupper(substr(lower, 1, 1)), substring(lower, 2))
1183
1184 unique(c(lower, capitalized))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001185}
1186
Marc Kupietz5a336b62021-11-27 17:51:35 +01001187
Marc Kupietz76b05592021-12-19 16:26:15 +01001188# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +01001189findExample <-
1190 function(kco,
1191 query,
1192 vc = "",
1193 matchOnly = TRUE) {
1194 out <- character(length = length(query))
1195
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001196 if (length(vc) < length(query)) {
Marc Kupietz5a336b62021-11-27 17:51:35 +01001197 vc <- rep(vc, length(query))
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001198 }
Marc Kupietz5a336b62021-11-27 17:51:35 +01001199
1200 for (i in seq_along(query)) {
1201 q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001202 if (q@totalResults > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001203 q <- fetchNext(q, maxFetch = 50, randomizePageOrder = F)
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001204 example <- as.character((q@collectedMatches)$snippet[1])
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001205 out[i] <- if (matchOnly) {
1206 gsub(".*<mark>(.+)</mark>.*", "\\1", example)
Marc Kupietz5a336b62021-11-27 17:51:35 +01001207 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001208 stringr::str_replace(example, "<[^>]*>", "")
Marc Kupietz5a336b62021-11-27 17:51:35 +01001209 }
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001210 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001211 out[i] <- ""
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001212 }
Marc Kupietz5a336b62021-11-27 17:51:35 +01001213 }
1214 out
1215 }
1216
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001217collocatesQuery <-
1218 function(kco,
1219 query,
1220 vc = "",
1221 minOccur = 5,
1222 leftContextSize = 5,
1223 rightContextSize = 5,
1224 searchHitsSampleLimit = 20000,
1225 ignoreCollocateCase = FALSE,
1226 stopwords = c(),
Marc Kupietzb2862d42025-10-18 10:17:49 +02001227 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001228 ...) {
1229 frequency <- NULL
1230 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001231 if (q@totalResults == 0) {
1232 tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001233 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001234 q <- fetchNext(q, maxFetch = searchHitsSampleLimit, randomizePageOrder = TRUE)
1235 matches2FreqTable(q@collectedMatches,
1236 0,
1237 minOccur = minOccur,
1238 leftContextSize = leftContextSize,
1239 rightContextSize = rightContextSize,
1240 ignoreCollocateCase = ignoreCollocateCase,
1241 stopwords = stopwords,
Marc Kupietzb2862d42025-10-18 10:17:49 +02001242 collocateFilterRegex = collocateFilterRegex,
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001243 ...,
1244 verbose = kco@verbose
1245 ) |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +01001246 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001247 filter(frequency >= minOccur)
1248 }
1249 }