blob: d314b684be862e21ed8a4cd5043e26c3f8529b78 [file] [log] [blame]
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001#' @include logging.R
2setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02003
4#' Collocation analysis
5#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02006#' @family collocation analysis functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +02007#' @aliases collocationAnalysis
8#'
9#' @description
Marc Kupietzdbd431a2021-08-29 12:17:45 +020010#'
11#' Performs a collocation analysis for the given node (or query)
12#' in the given virtual corpus.
13#'
14#' @details
15#' The collocation analysis is currently implemented on the client side, as some of the
16#' functionality is not yet provided by the KorAP backend. Mainly for this reason
17#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
18#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
19#' and look for expression-internal collocates using the focus function (see examples and demo).
20#'
21#' To increase speed at the cost of accuracy and possible false negatives,
22#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
23#'
Marc Kupietze7f0d682025-02-19 10:50:59 +010024#' Note that some outdated non-DeReKo back-ends might not yet support returning tokenized matches (warning issued).
25#' In this case, the client library will fall back to client-side tokenization which might be slightly less accurate.
26#' This might lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
Marc Kupietzdbd431a2021-08-29 12:17:45 +020027#' user interface.
28#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020029#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020030#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietzdbd431a2021-08-29 12:17:45 +020031#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
32#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
33#' @param searchHitsSampleLimit limit the size of the search hits sample
34#' @param stopwords vector of stopwords not to be considered as collocates
Marc Kupietz6bd9cad2024-12-18 15:57:26 +010035#' @param withinSpan KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`.
Marc Kupietzdbd431a2021-08-29 12:17:45 +020036#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
37#' @param seed seed for random page collecting order
Marc Kupietz67edcb52021-09-20 21:54:24 +020038#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
Marc Kupietz7d400e02021-12-19 16:39:36 +010039#' @param maxRecurse apply collocation analysis recursively `maxRecurse` times
40#' @param addExamples If TRUE, examples for instances of collocations will be added in a column `example`. This makes a difference in particular if `node` is given as a lemma query.
41#' @param thresholdScore association score function (see \code{\link{association-score-functions}}) to use for computing the threshold that is applied for recursive collocation analysis calls
42#' @param threshold minimum value of `thresholdScore` function call to apply collocation analysis recursively
43#' @param localStopwords vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010044#' @param collocateFilterRegex allow only collocates matching the regular expression
Marc Kupietzde679ea2025-10-19 13:14:51 +020045#' @param queryMissingScores if TRUE, attempt to retrieve corpus-based association scores for vc/collocate combinations that would otherwise be imputed, by re-querying the KorAP backend without applying the collocate frequency threshold
Marc Kupietz4cbb5472025-10-19 12:15:25 +020046#' @param missingScoreQuantile lower quantile (evaluated per association measure) that anchors the adaptive floor used for imputing missing scores between virtual corpora; a robust spread is subtracted from this anchor so the imputed values stay below the weakest observed scores
Marc Kupietze34a8be2025-10-17 20:13:42 +020047#' @param vcLabel optional label override for the current virtual corpus (used internally when named VC collections are expanded)
Marc Kupietz67edcb52021-09-20 21:54:24 +020048#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietzdbd431a2021-08-29 12:17:45 +020049#' @inheritParams collocationScoreQuery,KorAPConnection-method
Marc Kupietz130a2a22025-10-18 16:09:23 +020050#' @return
51#' A tibble where each row represents a candidate collocate for the requested node.
52#' Columns include (depending on the selected association measures):
53#'
54#' \itemize{
55#' \item \code{node}, \code{collocate}, \code{vc}, \code{label}: identifiers for the query node, collocate, virtual corpus, and optional label.
56#' \item Frequency and contingency information such as \code{frequency}, \code{O}, \code{O1}, \code{O2}, \code{E}, \code{leftContextSize}, \code{rightContextSize}, and \code{w}.
57#' \item Association measures (e.g. \code{logDice}, \code{ll}, \code{mi}, ...), one column per requested scorer.
58#' \item Per-labelled association scores produced by multi-VC comparisons using the pattern \code{<measure>_<label>}.
59#' \item Ranks per label/measure with the pattern \code{rank_<label>_<measure>} (1 is best) and the corresponding percentile ranks \code{percentile_rank_<label>_<measure>}.
60#' \item Pairwise contrasts for two-label comparisons, e.g. \code{delta_<measure>}, \code{delta_rank_<measure>}, and \code{delta_percentile_rank_<measure>}.
61#' \item Summary columns describing the strongest labels per measure (\code{winner_*}, \code{runner_up_*}, \code{loser_*}, and \code{max_delta_*}).
62#' \item Optional helper columns such as \code{query}, \code{example}, or \code{url} when example retrieval is requested.
63#' }
Marc Kupietzc4540a22025-10-14 17:39:53 +020064#' @importFrom dplyr arrange desc slice_head bind_rows group_by mutate ungroup left_join select row_number all_of first
Marc Kupietzdbd431a2021-08-29 12:17:45 +020065#' @importFrom purrr pmap
Marc Kupietzc4540a22025-10-14 17:39:53 +020066#' @importFrom tidyr expand_grid pivot_wider
67#' @importFrom rlang sym
Marc Kupietzdbd431a2021-08-29 12:17:45 +020068#'
69#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020070#' \dontrun{
71#'
Marc Kupietz6dfeed92025-06-03 11:58:06 +020072#' # Find top collocates of "Packung" inside and outside the sports domain.
73#' KorAPConnection(verbose = TRUE) |>
74#' collocationAnalysis("Packung",
75#' vc = c("textClass=sport", "textClass!=sport"),
76#' leftContextSize = 1, rightContextSize = 1, topCollocatesLimit = 20
77#' ) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020078#' dplyr::filter(logDice >= 5)
79#' }
80#'
Marc Kupietz6ae76052021-09-21 10:34:00 +020081#' \dontrun{
82#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020083#' # Identify the most prominent light verb construction with "in ... setzen".
84#' # Note that, currently, the use of focus function disallows exactFrequencies.
Marc Kupietz4cd066d2025-02-28 15:48:23 +010085#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020086#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
Marc Kupietz6dfeed92025-06-03 11:58:06 +020087#' leftContextSize = 1, rightContextSize = 0, exactFrequencies = FALSE, topCollocatesLimit = 20
88#' )
Marc Kupietzdbd431a2021-08-29 12:17:45 +020089#' }
90#'
91#' @export
Marc Kupietz6dfeed92025-06-03 11:58:06 +020092setMethod(
93 "collocationAnalysis", "KorAPConnection",
94 function(kco,
95 node,
96 vc = "",
97 lemmatizeNodeQuery = FALSE,
98 minOccur = 5,
99 leftContextSize = 5,
100 rightContextSize = 5,
101 topCollocatesLimit = 200,
102 searchHitsSampleLimit = 20000,
103 ignoreCollocateCase = FALSE,
104 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
105 exactFrequencies = TRUE,
106 stopwords = append(RKorAPClient::synsemanticStopwords(), node),
107 seed = 7,
108 expand = length(vc) != length(node),
109 maxRecurse = 0,
110 addExamples = FALSE,
111 thresholdScore = "logDice",
112 threshold = 2.0,
113 localStopwords = c(),
114 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzde679ea2025-10-19 13:14:51 +0200115 queryMissingScores = FALSE,
Marc Kupietz9894a372025-10-18 14:51:29 +0200116 missingScoreQuantile = 0.05,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200117 vcLabel = NA_character_,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200118 ...) {
Marc Kupietzb2862d42025-10-18 10:17:49 +0200119 word <- frequency <- O <- NULL
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200120
Marc Kupietzb2862d42025-10-18 10:17:49 +0200121 if (!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nzchar(withinSpan))) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200122 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
123 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200124
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200125 warnIfNotAuthorized(kco)
Marc Kupietz581a29b2021-09-04 20:51:04 +0200126
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200127 if (lemmatizeNodeQuery) {
128 node <- lemmatizeWordQuery(node)
129 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200130
Marc Kupietze34a8be2025-10-17 20:13:42 +0200131 vcNames <- names(vc)
Marc Kupietze34a8be2025-10-17 20:13:42 +0200132 if (is.null(vcNames)) {
133 vcNames <- rep(NA_character_, length(vc))
Marc Kupietze34a8be2025-10-17 20:13:42 +0200134 }
135
136 label_lookup <- NULL
Marc Kupietzb2862d42025-10-18 10:17:49 +0200137 if (!is.null(names(vc)) && length(vc) > 0) {
138 raw_names <- names(vc)
139 if (any(!is.na(raw_names) & raw_names != "")) {
140 label_lookup <- stats::setNames(raw_names, vc)
141 }
Marc Kupietze34a8be2025-10-17 20:13:42 +0200142 }
143
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200144 result <- if (length(node) > 1 || length(vc) > 1) {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200145 grid <- if (expand) {
Marc Kupietzb2862d42025-10-18 10:17:49 +0200146 tmp_grid <- tidyr::expand_grid(node = node, idx = seq_along(vc))
147 tmp_grid$vc <- vc[tmp_grid$idx]
148 tmp_grid$vcLabel <- vcNames[tmp_grid$idx]
149 tmp_grid[, c("node", "vc", "vcLabel"), drop = FALSE]
Marc Kupietze34a8be2025-10-17 20:13:42 +0200150 } else {
151 tibble(node = node, vc = vc, vcLabel = vcNames)
152 }
153
154 multi_result <- purrr::pmap(grid, function(node, vc, vcLabel, ...) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200155 collocationAnalysis(kco,
156 node = node,
157 vc = vc,
158 minOccur = minOccur,
159 leftContextSize = leftContextSize,
160 rightContextSize = rightContextSize,
161 topCollocatesLimit = topCollocatesLimit,
162 searchHitsSampleLimit = searchHitsSampleLimit,
163 ignoreCollocateCase = ignoreCollocateCase,
164 withinSpan = withinSpan,
165 exactFrequencies = exactFrequencies,
166 stopwords = stopwords,
167 addExamples = TRUE,
168 localStopwords = localStopwords,
169 seed = seed,
170 expand = expand,
Marc Kupietz9894a372025-10-18 14:51:29 +0200171 missingScoreQuantile = missingScoreQuantile,
Marc Kupietzde679ea2025-10-19 13:14:51 +0200172 queryMissingScores = queryMissingScores,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200173 collocateFilterRegex = collocateFilterRegex,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200174 vcLabel = vcLabel,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200175 ...
176 )
177 }) |>
Marc Kupietze31322e2025-10-17 18:55:36 +0200178 bind_rows()
179
180 if (!"vc" %in% names(multi_result) || nrow(multi_result) == 0) {
181 multi_result
182 } else {
Marc Kupietzde679ea2025-10-19 13:14:51 +0200183 if (queryMissingScores) {
184 multi_result <- backfill_missing_scores(
185 multi_result,
186 grid = grid,
187 kco = kco,
188 ignoreCollocateCase = ignoreCollocateCase,
189 ...
190 )
191 }
192
Marc Kupietze34a8be2025-10-17 20:13:42 +0200193 if (!"label" %in% names(multi_result)) {
194 multi_result$label <- NA_character_
195 }
196
197 if (!is.null(label_lookup)) {
198 override <- unname(label_lookup[multi_result$vc])
199 missing_idx <- is.na(multi_result$label) | multi_result$label == ""
200 if (any(missing_idx)) {
201 multi_result$label[missing_idx] <- override[missing_idx]
202 }
203 }
204
205 missing_idx <- is.na(multi_result$label) | multi_result$label == ""
206 if (any(missing_idx)) {
207 multi_result$label[missing_idx] <- queryStringToLabel(multi_result$vc[missing_idx])
208 }
209
Marc Kupietze31322e2025-10-17 18:55:36 +0200210 multi_result |>
Marc Kupietz9894a372025-10-18 14:51:29 +0200211 add_multi_vc_comparisons(
Marc Kupietz9894a372025-10-18 14:51:29 +0200212 missingScoreQuantile = missingScoreQuantile
213 )
Marc Kupietze31322e2025-10-17 18:55:36 +0200214 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200215 } else {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200216 if ((is.na(vcLabel) || vcLabel == "") && length(vcNames) >= 1) {
217 vcLabel <- vcNames[1]
218 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200219
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200220 set.seed(seed)
221 candidates <- collocatesQuery(
222 kco,
223 node,
224 vc = vc,
225 minOccur = minOccur,
226 leftContextSize = leftContextSize,
227 rightContextSize = rightContextSize,
228 searchHitsSampleLimit = searchHitsSampleLimit,
229 ignoreCollocateCase = ignoreCollocateCase,
230 stopwords = append(stopwords, localStopwords),
Marc Kupietzb2862d42025-10-18 10:17:49 +0200231 collocateFilterRegex = collocateFilterRegex,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200232 ...
233 )
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200234
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200235 if (nrow(candidates) > 0) {
236 candidates <- candidates |>
237 filter(frequency >= minOccur) |>
238 slice_head(n = topCollocatesLimit)
239 collocationScoreQuery(
240 kco,
241 node = node,
242 collocate = candidates$word,
243 vc = vc,
244 leftContextSize = leftContextSize,
245 rightContextSize = rightContextSize,
246 observed = if (exactFrequencies) NA else candidates$frequency,
247 ignoreCollocateCase = ignoreCollocateCase,
248 withinSpan = withinSpan,
249 ...
250 ) |>
251 filter(O >= minOccur) |>
252 dplyr::arrange(dplyr::desc(logDice))
253 } else {
254 tibble()
255 }
256 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200257
258 if (!is.na(vcLabel) && vcLabel != "" && "label" %in% names(result)) {
259 result$label <- rep(vcLabel, nrow(result))
260 }
261
262 threshold_col <- thresholdScore
263 if (maxRecurse > 0 && nrow(result) > 0 && threshold_col %in% names(result)) {
264 threshold_values <- result[[threshold_col]]
265 eligible_idx <- which(!is.na(threshold_values) & threshold_values >= threshold)
266 if (length(eligible_idx) > 0) {
267 recurseWith <- result[eligible_idx, , drop = FALSE]
268 result <- collocationAnalysis(
269 kco,
270 node = paste0("(", buildCollocationQuery(
271 removeWithinSpan(recurseWith$node, withinSpan),
272 recurseWith$collocate,
273 leftContextSize = leftContextSize,
274 rightContextSize = rightContextSize,
275 withinSpan = ""
276 ), ")"),
277 vc = vc,
278 minOccur = minOccur,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200279 leftContextSize = leftContextSize,
280 rightContextSize = rightContextSize,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200281 withinSpan = withinSpan,
282 maxRecurse = maxRecurse - 1,
283 stopwords = stopwords,
284 localStopwords = recurseWith$collocate,
285 exactFrequencies = exactFrequencies,
286 searchHitsSampleLimit = searchHitsSampleLimit,
287 topCollocatesLimit = topCollocatesLimit,
288 addExamples = FALSE,
Marc Kupietz9894a372025-10-18 14:51:29 +0200289 missingScoreQuantile = missingScoreQuantile,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200290 collocateFilterRegex = collocateFilterRegex,
Marc Kupietzde679ea2025-10-19 13:14:51 +0200291 queryMissingScores = queryMissingScores,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200292 vcLabel = vcLabel
293 ) |>
294 bind_rows(result) |>
295 filter(logDice >= 2) |>
296 filter(O >= minOccur) |>
297 dplyr::arrange(dplyr::desc(logDice))
298 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200299 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200300
301 if (addExamples && nrow(result) > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200302 result$query <- buildCollocationQuery(
303 result$node,
304 result$collocate,
305 leftContextSize = leftContextSize,
306 rightContextSize = rightContextSize,
307 withinSpan = withinSpan
308 )
309 result$example <- findExample(
310 kco,
311 query = result$query,
312 vc = result$vc
313 )
314 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200315
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200316 result
317 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200318)
319
Marc Kupietz76b05592021-12-19 16:26:15 +0100320# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100321removeWithinSpan <- function(query, withinSpan) {
322 if (withinSpan == "") {
323 return(query)
324 }
325 needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200326 res <- gsub(needle, "\\1", query)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100327 needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200328 res <- gsub(needle, "\\1", res)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100329 return(res)
330}
331
Marc Kupietzde679ea2025-10-19 13:14:51 +0200332backfill_missing_scores <- function(result,
333 grid,
334 kco,
335 ignoreCollocateCase,
336 ...) {
337 if (!"vc" %in% names(result) || !"node" %in% names(result) || !"collocate" %in% names(result)) {
338 return(result)
339 }
340
341 if (nrow(result) == 0) {
342 return(result)
343 }
344
345 distinct_pairs <- dplyr::distinct(result, node, collocate)
346 if (nrow(distinct_pairs) == 0) {
347 return(result)
348 }
349
350 collocates_by_node <- split(as.character(distinct_pairs$collocate), distinct_pairs$node)
351 if (length(collocates_by_node) == 0) {
352 return(result)
353 }
354
355 required_combinations <- unique(as.data.frame(grid[, c("node", "vc", "vcLabel")], drop = FALSE))
356 for (i in seq_len(nrow(required_combinations))) {
357 node_value <- required_combinations$node[i]
358 vc_value <- required_combinations$vc[i]
359
360 collocate_pool <- collocates_by_node[[node_value]]
361 if (is.null(collocate_pool) || length(collocate_pool) == 0) {
362 next
363 }
364
365 existing_idx <- result$node == node_value & result$vc == vc_value
366 existing_collocates <- unique(as.character(result$collocate[existing_idx]))
367 missing_collocates <- setdiff(unique(collocate_pool), existing_collocates)
368 missing_collocates <- missing_collocates[!is.na(missing_collocates) & nzchar(missing_collocates)]
369
370 if (length(missing_collocates) == 0) {
371 next
372 }
373
374 context_rows <- result[result$node == node_value & result$vc == vc_value, , drop = FALSE]
375 if (nrow(context_rows) == 0) {
376 context_rows <- result[result$node == node_value, , drop = FALSE]
377 }
378
379 left_size <- context_rows$leftContextSize[!is.na(context_rows$leftContextSize)][1]
380 if (is.na(left_size) || length(left_size) == 0) {
381 left_size <- result$leftContextSize[!is.na(result$leftContextSize)][1]
382 }
383 if (is.na(left_size) || length(left_size) == 0) {
384 left_size <- 5
385 }
386
387 right_size <- context_rows$rightContextSize[!is.na(context_rows$rightContextSize)][1]
388 if (is.na(right_size) || length(right_size) == 0) {
389 right_size <- result$rightContextSize[!is.na(result$rightContextSize)][1]
390 }
391 if (is.na(right_size) || length(right_size) == 0) {
392 right_size <- 5
393 }
394
395 within_span_value <- ""
396 if ("query" %in% names(context_rows)) {
397 query_candidate <- context_rows$query[!is.na(context_rows$query) & nzchar(context_rows$query)][1]
398 if (!is.na(query_candidate) && nzchar(query_candidate)) {
399 match_one <- regexec("^\\(*contains\\(<([^>]+)>,", query_candidate)
400 matches <- regmatches(query_candidate, match_one)
401 if (length(matches) >= 1 && length(matches[[1]]) >= 2) {
402 within_span_value <- matches[[1]][2]
403 }
404 }
405 }
406
407 new_rows <- collocationScoreQuery(
408 kco,
409 node = node_value,
410 collocate = missing_collocates,
411 vc = vc_value,
412 leftContextSize = left_size,
413 rightContextSize = right_size,
414 ignoreCollocateCase = ignoreCollocateCase,
415 withinSpan = within_span_value,
416 ...
417 )
418
419 if (nrow(new_rows) == 0) {
420 next
421 }
422
423 if (!is.null(required_combinations$vcLabel[i]) && !is.na(required_combinations$vcLabel[i]) && required_combinations$vcLabel[i] != "" && "label" %in% names(new_rows)) {
424 new_rows$label <- required_combinations$vcLabel[i]
425 }
426
427 result <- dplyr::bind_rows(result, new_rows)
428 }
429
430 result
431}
432
Marc Kupietz77852b22025-10-19 11:35:34 +0200433add_multi_vc_comparisons <- function(result, missingScoreQuantile = 0.05) {
Marc Kupietz28a29842025-10-18 12:25:09 +0200434 label <- node <- collocate <- NULL
Marc Kupietzc4540a22025-10-14 17:39:53 +0200435
436 if (!"label" %in% names(result) || dplyr::n_distinct(result$label) < 2) {
437 return(result)
438 }
439
440 numeric_cols <- names(result)[vapply(result, is.numeric, logical(1))]
441 non_score_cols <- c("N", "O", "O1", "O2", "E", "w", "leftContextSize", "rightContextSize", "frequency")
442 score_cols <- setdiff(numeric_cols, non_score_cols)
443
444 if (length(score_cols) == 0) {
445 return(result)
446 }
447
Marc Kupietz9894a372025-10-18 14:51:29 +0200448 compute_score_floor <- function(values) {
Marc Kupietz4cbb5472025-10-19 12:15:25 +0200449 # Estimate a conservative floor so missing scores can be imputed without favoring any label
Marc Kupietz9894a372025-10-18 14:51:29 +0200450 finite_values <- values[is.finite(values)]
451 if (length(finite_values) == 0) {
452 return(0)
453 }
454
455 prob <- min(max(missingScoreQuantile, 0), 0.5)
Marc Kupietz4cbb5472025-10-19 12:15:25 +0200456 # Use a lower quantile as the anchor to stay near the weakest attested scores
Marc Kupietz9894a372025-10-18 14:51:29 +0200457 q_val <- suppressWarnings(stats::quantile(finite_values,
458 probs = prob,
459 names = FALSE,
460 type = 7
461 ))
462
463 if (!is.finite(q_val)) {
464 q_val <- suppressWarnings(min(finite_values, na.rm = TRUE))
465 }
466
467 min_val <- suppressWarnings(min(finite_values, na.rm = TRUE))
468 if (!is.finite(min_val)) {
469 min_val <- 0
470 }
471
472 spread_candidates <- c(
473 suppressWarnings(stats::IQR(finite_values, na.rm = TRUE, type = 7)),
474 stats::sd(finite_values, na.rm = TRUE),
475 abs(q_val) * 0.1,
476 abs(min_val - q_val)
477 )
478 spread_candidates <- spread_candidates[is.finite(spread_candidates)]
479
480 spread <- 0
481 if (length(spread_candidates) > 0) {
482 spread <- max(spread_candidates)
483 }
484 if (!is.finite(spread) || spread == 0) {
485 spread <- max(abs(q_val), abs(min_val), 1e-06)
486 }
487
Marc Kupietz4cbb5472025-10-19 12:15:25 +0200488 # Step away from the anchor by a robust spread estimate to avoid ties with real scores
Marc Kupietz9894a372025-10-18 14:51:29 +0200489 candidate <- q_val - spread
490 if (!is.finite(candidate)) {
491 candidate <- min_val
492 }
493
494 floor_value <- suppressWarnings(min(c(candidate, min_val), na.rm = TRUE))
495 if (!is.finite(floor_value)) {
496 floor_value <- min_val
497 }
498 if (!is.finite(floor_value)) {
499 floor_value <- 0
500 }
501
502 floor_value
503 }
504
505 score_replacements <- stats::setNames(
506 vapply(score_cols, function(col) {
507 compute_score_floor(result[[col]])
508 }, numeric(1)),
509 score_cols
510 )
511
Marc Kupietzc4540a22025-10-14 17:39:53 +0200512 comparison <- result |>
Marc Kupietz28a29842025-10-18 12:25:09 +0200513 dplyr::select(node, collocate, label, dplyr::all_of(score_cols)) |>
514 tidyr::pivot_wider(
Marc Kupietzc4540a22025-10-14 17:39:53 +0200515 names_from = label,
Marc Kupietz28a29842025-10-18 12:25:09 +0200516 values_from = dplyr::all_of(score_cols),
Marc Kupietzc4540a22025-10-14 17:39:53 +0200517 names_glue = "{.value}_{make.names(label)}",
518 values_fn = dplyr::first
519 )
520
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200521 raw_labels <- unique(result$label)
522 labels <- make.names(raw_labels)
523 label_map <- stats::setNames(raw_labels, labels)
Marc Kupietzc4540a22025-10-14 17:39:53 +0200524
Marc Kupietz28a29842025-10-18 12:25:09 +0200525 rank_data <- result |>
526 dplyr::distinct(node, collocate)
527
528 for (i in seq_along(raw_labels)) {
529 raw_lab <- raw_labels[i]
530 safe_lab <- labels[i]
531 label_df <- result[result$label == raw_lab, c("node", "collocate", score_cols), drop = FALSE]
532 if (nrow(label_df) == 0) {
533 next
534 }
535 label_df <- dplyr::distinct(label_df)
536 rank_tbl <- label_df[, c("node", "collocate"), drop = FALSE]
537 for (col in score_cols) {
538 rank_col_name <- paste0("rank_", safe_lab, "_", col)
Marc Kupietz130a2a22025-10-18 16:09:23 +0200539 percentile_col_name <- paste0("percentile_rank_", safe_lab, "_", col)
Marc Kupietz28a29842025-10-18 12:25:09 +0200540 values <- label_df[[col]]
541 ranks <- rep(NA_real_, length(values))
Marc Kupietz130a2a22025-10-18 16:09:23 +0200542 percentiles <- rep(NA_real_, length(values))
Marc Kupietz28a29842025-10-18 12:25:09 +0200543 valid_idx <- which(!is.na(values))
544 if (length(valid_idx) > 0) {
545 ranks[valid_idx] <- rank(-values[valid_idx], ties.method = "first")
Marc Kupietz130a2a22025-10-18 16:09:23 +0200546 total <- length(valid_idx)
547 percentiles[valid_idx] <- 1 - (ranks[valid_idx] - 1) / total
Marc Kupietz28a29842025-10-18 12:25:09 +0200548 }
549 rank_tbl[[rank_col_name]] <- ranks
Marc Kupietz130a2a22025-10-18 16:09:23 +0200550 rank_tbl[[percentile_col_name]] <- percentiles
Marc Kupietz28a29842025-10-18 12:25:09 +0200551 }
552 rank_data <- dplyr::left_join(rank_data, rank_tbl, by = c("node", "collocate"))
553 }
554
555 comparison <- dplyr::left_join(comparison, rank_data, by = c("node", "collocate"))
556
557 rank_replacements <- numeric(0)
558 rank_column_names <- grep("^rank_", names(comparison), value = TRUE)
559 if (length(rank_column_names) > 0) {
560 rank_replacements <- stats::setNames(
561 vapply(rank_column_names, function(col) {
562 col_values <- comparison[[col]]
563 valid_values <- col_values[!is.na(col_values)]
564 if (length(valid_values) == 0) {
565 nrow(comparison) + 1
566 } else {
567 suppressWarnings(max(valid_values, na.rm = TRUE)) + 1
568 }
569 }, numeric(1)),
570 rank_column_names
571 )
572 }
573
Marc Kupietz130a2a22025-10-18 16:09:23 +0200574 percentile_replacements <- numeric(0)
575 percentile_column_names <- grep("^percentile_rank_", names(comparison), value = TRUE)
576 if (length(percentile_column_names) > 0) {
577 percentile_replacements <- stats::setNames(
578 rep(0, length(percentile_column_names)),
579 percentile_column_names
580 )
581 }
582
Marc Kupietz28a29842025-10-18 12:25:09 +0200583 collapse_label_values <- function(indices, safe_labels_vec) {
584 if (length(indices) == 0) {
585 return(NA_character_)
586 }
587 labs <- label_map[safe_labels_vec[indices]]
588 fallback <- safe_labels_vec[indices]
589 labs[is.na(labs) | labs == ""] <- fallback[is.na(labs) | labs == ""]
590 labs <- labs[!is.na(labs) & labs != ""]
591 if (length(labs) == 0) {
592 return(NA_character_)
593 }
594 paste(unique(labs), collapse = ", ")
595 }
596
Marc Kupietzc4540a22025-10-14 17:39:53 +0200597 if (length(labels) == 2) {
Marc Kupietz9894a372025-10-18 14:51:29 +0200598 fill_scores <- function(x, y, measure_col) {
599 replacement <- score_replacements[[measure_col]]
600 fallback_min <- suppressWarnings(min(c(x, y), na.rm = TRUE))
601 if (!is.finite(fallback_min)) {
602 fallback_min <- 0
Marc Kupietzc4540a22025-10-14 17:39:53 +0200603 }
Marc Kupietz9894a372025-10-18 14:51:29 +0200604 if (!is.null(replacement) && is.finite(replacement)) {
605 replacement <- min(replacement, fallback_min)
606 } else {
607 replacement <- fallback_min
608 }
609 if (!is.finite(replacement)) {
610 replacement <- 0
611 }
612 if (any(is.na(x))) {
613 x[is.na(x)] <- replacement
614 }
615 if (any(is.na(y))) {
616 y[is.na(y)] <- replacement
617 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200618 list(x = x, y = y)
619 }
620
Marc Kupietz130a2a22025-10-18 16:09:23 +0200621 fill_percentiles <- function(x, y, left_pct_col, right_pct_col) {
622 replacement_left <- percentile_replacements[[left_pct_col]]
623 if (is.null(replacement_left) || !is.finite(replacement_left)) {
624 replacement_left <- 0
625 }
626 replacement_right <- percentile_replacements[[right_pct_col]]
627 if (is.null(replacement_right) || !is.finite(replacement_right)) {
628 replacement_right <- 0
629 }
630 if (any(is.na(x))) {
631 x[is.na(x)] <- replacement_left
632 }
633 if (any(is.na(y))) {
634 y[is.na(y)] <- replacement_right
635 }
636 list(x = x, y = y)
637 }
638
Marc Kupietz28a29842025-10-18 12:25:09 +0200639 fill_ranks <- function(x, y, left_rank_col, right_rank_col) {
640 fallback <- nrow(comparison) + 1
641 replacement_left <- rank_replacements[[left_rank_col]]
642 if (is.null(replacement_left) || !is.finite(replacement_left)) {
643 replacement_left <- fallback
Marc Kupietzc4540a22025-10-14 17:39:53 +0200644 }
Marc Kupietz28a29842025-10-18 12:25:09 +0200645 replacement_right <- rank_replacements[[right_rank_col]]
646 if (is.null(replacement_right) || !is.finite(replacement_right)) {
647 replacement_right <- fallback
648 }
649 if (any(is.na(x))) {
650 x[is.na(x)] <- replacement_left
651 }
652 if (any(is.na(y))) {
653 y[is.na(y)] <- replacement_right
654 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200655 list(x = x, y = y)
656 }
657
658 left_label <- labels[1]
659 right_label <- labels[2]
660
661 for (col in score_cols) {
662 left_col <- paste0(col, "_", left_label)
663 right_col <- paste0(col, "_", right_label)
664 if (!all(c(left_col, right_col) %in% names(comparison))) {
665 next
666 }
Marc Kupietz9894a372025-10-18 14:51:29 +0200667 filled <- fill_scores(comparison[[left_col]], comparison[[right_col]], col)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200668 comparison[[left_col]] <- filled$x
669 comparison[[right_col]] <- filled$y
Marc Kupietzc4540a22025-10-14 17:39:53 +0200670 comparison[[paste0("delta_", col)]] <- filled$x - filled$y
Marc Kupietz28a29842025-10-18 12:25:09 +0200671 rank_left <- paste0("rank_", left_label, "_", col)
672 rank_right <- paste0("rank_", right_label, "_", col)
673 if (all(c(rank_left, rank_right) %in% names(comparison))) {
674 filled_rank <- fill_ranks(
675 comparison[[rank_left]],
676 comparison[[rank_right]],
677 rank_left,
678 rank_right
679 )
680 comparison[[paste0("delta_rank_", col)]] <- filled_rank$x - filled_rank$y
681 }
Marc Kupietz130a2a22025-10-18 16:09:23 +0200682 pct_left <- paste0("percentile_rank_", left_label, "_", col)
683 pct_right <- paste0("percentile_rank_", right_label, "_", col)
684 if (all(c(pct_left, pct_right) %in% names(comparison))) {
685 filled_pct <- fill_percentiles(
686 comparison[[pct_left]],
687 comparison[[pct_right]],
688 pct_left,
689 pct_right
690 )
691 comparison[[paste0("delta_percentile_rank_", col)]] <- filled_pct$x - filled_pct$y
692 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200693 }
694 }
695
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200696 for (col in score_cols) {
697 value_cols <- paste0(col, "_", labels)
698 existing <- value_cols %in% names(comparison)
699 if (!any(existing)) {
700 next
701 }
702 value_cols <- value_cols[existing]
703 safe_labels <- labels[existing]
704
705 score_values <- comparison[, value_cols, drop = FALSE]
706
707 winner_label_col <- paste0("winner_", col)
708 winner_value_col <- paste0("winner_", col, "_value")
709 runner_label_col <- paste0("runner_up_", col)
710 runner_value_col <- paste0("runner_up_", col, "_value")
Marc Kupietzb2862d42025-10-18 10:17:49 +0200711 loser_label_col <- paste0("loser_", col)
712 loser_value_col <- paste0("loser_", col, "_value")
713 max_delta_col <- paste0("max_delta_", col)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200714
715 if (nrow(score_values) == 0) {
716 comparison[[winner_label_col]] <- character(0)
717 comparison[[winner_value_col]] <- numeric(0)
718 comparison[[runner_label_col]] <- character(0)
719 comparison[[runner_value_col]] <- numeric(0)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200720 comparison[[loser_label_col]] <- character(0)
721 comparison[[loser_value_col]] <- numeric(0)
722 comparison[[max_delta_col]] <- numeric(0)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200723 next
724 }
725
726 score_matrix <- as.matrix(score_values)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200727 storage.mode(score_matrix) <- "numeric"
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200728
Marc Kupietzb2862d42025-10-18 10:17:49 +0200729 n_rows <- nrow(score_matrix)
730 winner_labels <- rep(NA_character_, n_rows)
731 winner_values <- rep(NA_real_, n_rows)
732 runner_labels <- rep(NA_character_, n_rows)
733 runner_values <- rep(NA_real_, n_rows)
734 loser_labels <- rep(NA_character_, n_rows)
735 loser_values <- rep(NA_real_, n_rows)
736 max_deltas <- rep(NA_real_, n_rows)
737
Marc Kupietzb2862d42025-10-18 10:17:49 +0200738 if (n_rows > 0) {
739 for (i in seq_len(n_rows)) {
740 numeric_row <- as.numeric(score_matrix[i, ])
741 if (all(is.na(numeric_row))) {
742 next
743 }
744
Marc Kupietz9894a372025-10-18 14:51:29 +0200745 replacement <- score_replacements[[col]]
746 fallback_min <- suppressWarnings(min(numeric_row, na.rm = TRUE))
747 if (!is.finite(fallback_min)) {
748 fallback_min <- 0
Marc Kupietzb2862d42025-10-18 10:17:49 +0200749 }
Marc Kupietz9894a372025-10-18 14:51:29 +0200750 if (!is.null(replacement) && is.finite(replacement)) {
751 replacement <- min(replacement, fallback_min)
752 } else {
753 replacement <- fallback_min
754 }
755 if (!is.finite(replacement)) {
756 replacement <- 0
757 }
758 if (any(is.na(numeric_row))) {
759 numeric_row[is.na(numeric_row)] <- replacement
760 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200761 score_matrix[i, ] <- numeric_row
762
763 max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
764 max_idx <- which(numeric_row == max_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200765 winner_labels[i] <- collapse_label_values(max_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200766 winner_values[i] <- max_val
767
768 unique_vals <- sort(unique(numeric_row), decreasing = TRUE)
769 if (length(unique_vals) >= 2) {
770 runner_val <- unique_vals[2]
771 runner_idx <- which(numeric_row == runner_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200772 runner_labels[i] <- collapse_label_values(runner_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200773 runner_values[i] <- runner_val
774 }
775
776 min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
777 min_idx <- which(numeric_row == min_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200778 loser_labels[i] <- collapse_label_values(min_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200779 loser_values[i] <- min_val
780
781 if (is.finite(max_val) && is.finite(min_val)) {
782 max_deltas[i] <- max_val - min_val
783 }
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200784 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200785 }
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200786
Marc Kupietzb2862d42025-10-18 10:17:49 +0200787 comparison[, value_cols] <- score_matrix
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200788 comparison[[winner_label_col]] <- winner_labels
789 comparison[[winner_value_col]] <- winner_values
790 comparison[[runner_label_col]] <- runner_labels
791 comparison[[runner_value_col]] <- runner_values
Marc Kupietzb2862d42025-10-18 10:17:49 +0200792 comparison[[loser_label_col]] <- loser_labels
793 comparison[[loser_value_col]] <- loser_values
794 comparison[[max_delta_col]] <- max_deltas
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200795 }
796
Marc Kupietz28a29842025-10-18 12:25:09 +0200797 for (col in score_cols) {
798 rank_cols <- paste0("rank_", labels, "_", col)
799 existing <- rank_cols %in% names(comparison)
800 if (!any(existing)) {
801 next
802 }
803 rank_cols <- rank_cols[existing]
804 safe_labels <- labels[existing]
805 rank_values <- comparison[, rank_cols, drop = FALSE]
806
807 winner_rank_label_col <- paste0("winner_rank_", col)
808 winner_rank_value_col <- paste0("winner_rank_", col, "_value")
809 runner_rank_label_col <- paste0("runner_up_rank_", col)
810 runner_rank_value_col <- paste0("runner_up_rank_", col, "_value")
811 loser_rank_label_col <- paste0("loser_rank_", col)
812 loser_rank_value_col <- paste0("loser_rank_", col, "_value")
813 max_delta_rank_col <- paste0("max_delta_rank_", col)
814
815 if (nrow(rank_values) == 0) {
816 comparison[[winner_rank_label_col]] <- character(0)
817 comparison[[winner_rank_value_col]] <- numeric(0)
818 comparison[[runner_rank_label_col]] <- character(0)
819 comparison[[runner_rank_value_col]] <- numeric(0)
820 comparison[[loser_rank_label_col]] <- character(0)
821 comparison[[loser_rank_value_col]] <- numeric(0)
822 comparison[[max_delta_rank_col]] <- numeric(0)
823 next
824 }
825
826 rank_matrix <- as.matrix(rank_values)
827 storage.mode(rank_matrix) <- "numeric"
828
829 n_rows <- nrow(rank_matrix)
830 winner_labels <- rep(NA_character_, n_rows)
831 winner_values <- rep(NA_real_, n_rows)
832 runner_labels <- rep(NA_character_, n_rows)
833 runner_values <- rep(NA_real_, n_rows)
834 loser_labels <- rep(NA_character_, n_rows)
835 loser_values <- rep(NA_real_, n_rows)
836 max_deltas <- rep(NA_real_, n_rows)
837
838 for (i in seq_len(n_rows)) {
839 numeric_row <- as.numeric(rank_matrix[i, ])
840 if (all(is.na(numeric_row))) {
841 next
842 }
843
844 if (length(rank_cols) > 0) {
845 replacement_vec <- rank_replacements[rank_cols]
846 replacement_vec[is.na(replacement_vec)] <- nrow(comparison) + 1
847 missing_idx <- which(is.na(numeric_row))
848 if (length(missing_idx) > 0) {
849 numeric_row[missing_idx] <- replacement_vec[missing_idx]
850 }
851 }
852
853 valid_idx <- seq_along(numeric_row)
854 valid_values <- numeric_row[valid_idx]
855 min_val <- suppressWarnings(min(valid_values, na.rm = TRUE))
856 min_positions <- valid_idx[which(valid_values == min_val)]
857 winner_labels[i] <- collapse_label_values(min_positions, safe_labels)
858 winner_values[i] <- min_val
859
860 ordered_vals <- sort(unique(valid_values), decreasing = FALSE)
861 if (length(ordered_vals) >= 2) {
862 runner_val <- ordered_vals[2]
863 runner_positions <- valid_idx[which(valid_values == runner_val)]
864 runner_labels[i] <- collapse_label_values(runner_positions, safe_labels)
865 runner_values[i] <- runner_val
866 }
867
868 max_val <- suppressWarnings(max(valid_values, na.rm = TRUE))
869 max_positions <- valid_idx[which(valid_values == max_val)]
870 loser_labels[i] <- collapse_label_values(max_positions, safe_labels)
871 loser_values[i] <- max_val
872
873 if (is.finite(max_val) && is.finite(min_val)) {
874 max_deltas[i] <- max_val - min_val
875 }
876 }
877
878 comparison[[winner_rank_label_col]] <- winner_labels
879 comparison[[winner_rank_value_col]] <- winner_values
880 comparison[[runner_rank_label_col]] <- runner_labels
881 comparison[[runner_rank_value_col]] <- runner_values
882 comparison[[loser_rank_label_col]] <- loser_labels
883 comparison[[loser_rank_value_col]] <- loser_values
884 comparison[[max_delta_rank_col]] <- max_deltas
885 }
886
Marc Kupietz130a2a22025-10-18 16:09:23 +0200887 for (col in score_cols) {
888 pct_cols <- paste0("percentile_rank_", labels, "_", col)
889 existing <- pct_cols %in% names(comparison)
890 if (!any(existing)) {
891 next
892 }
893 pct_cols <- pct_cols[existing]
894 safe_labels <- labels[existing]
895 pct_values <- comparison[, pct_cols, drop = FALSE]
896
897 winner_pct_label_col <- paste0("winner_percentile_rank_", col)
898 winner_pct_value_col <- paste0("winner_percentile_rank_", col, "_value")
899 runner_pct_label_col <- paste0("runner_up_percentile_rank_", col)
900 runner_pct_value_col <- paste0("runner_up_percentile_rank_", col, "_value")
901 loser_pct_label_col <- paste0("loser_percentile_rank_", col)
902 loser_pct_value_col <- paste0("loser_percentile_rank_", col, "_value")
903 max_delta_pct_col <- paste0("max_delta_percentile_rank_", col)
904
905 if (nrow(pct_values) == 0) {
906 comparison[[winner_pct_label_col]] <- character(0)
907 comparison[[winner_pct_value_col]] <- numeric(0)
908 comparison[[runner_pct_label_col]] <- character(0)
909 comparison[[runner_pct_value_col]] <- numeric(0)
910 comparison[[loser_pct_label_col]] <- character(0)
911 comparison[[loser_pct_value_col]] <- numeric(0)
912 comparison[[max_delta_pct_col]] <- numeric(0)
913 next
914 }
915
916 pct_matrix <- as.matrix(pct_values)
917 storage.mode(pct_matrix) <- "numeric"
918
919 n_rows <- nrow(pct_matrix)
920 winner_labels <- rep(NA_character_, n_rows)
921 winner_values <- rep(NA_real_, n_rows)
922 runner_labels <- rep(NA_character_, n_rows)
923 runner_values <- rep(NA_real_, n_rows)
924 loser_labels <- rep(NA_character_, n_rows)
925 loser_values <- rep(NA_real_, n_rows)
926 max_deltas <- rep(NA_real_, n_rows)
927
928 if (n_rows > 0) {
929 for (i in seq_len(n_rows)) {
930 numeric_row <- as.numeric(pct_matrix[i, ])
931 if (all(is.na(numeric_row))) {
932 next
933 }
934
935 if (any(is.na(numeric_row))) {
936 numeric_row[is.na(numeric_row)] <- 0
937 }
938 pct_matrix[i, ] <- numeric_row
939
940 max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
941 max_idx <- which(numeric_row == max_val)
942 winner_labels[i] <- collapse_label_values(max_idx, safe_labels)
943 winner_values[i] <- max_val
944
945 unique_vals <- sort(unique(numeric_row), decreasing = TRUE)
946 if (length(unique_vals) >= 2) {
947 runner_val <- unique_vals[2]
948 runner_idx <- which(numeric_row == runner_val)
949 runner_labels[i] <- collapse_label_values(runner_idx, safe_labels)
950 runner_values[i] <- runner_val
951 }
952
953 min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
954 min_idx <- which(numeric_row == min_val)
955 loser_labels[i] <- collapse_label_values(min_idx, safe_labels)
956 loser_values[i] <- min_val
957
958 if (is.finite(max_val) && is.finite(min_val)) {
959 max_deltas[i] <- max_val - min_val
960 }
961 }
962 }
963
964 comparison[, pct_cols] <- pct_matrix
965 comparison[[winner_pct_label_col]] <- winner_labels
966 comparison[[winner_pct_value_col]] <- winner_values
967 comparison[[runner_pct_label_col]] <- runner_labels
968 comparison[[runner_pct_value_col]] <- runner_values
969 comparison[[loser_pct_label_col]] <- loser_labels
970 comparison[[loser_pct_value_col]] <- loser_values
971 comparison[[max_delta_pct_col]] <- max_deltas
972 }
973
Marc Kupietzc4540a22025-10-14 17:39:53 +0200974 dplyr::left_join(result, comparison, by = c("node", "collocate"))
975}
976
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200977#' @importFrom magrittr debug_pipe
Marc Kupietz2b17b212023-08-27 17:47:26 +0200978#' @importFrom stringr str_detect
979#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
980#'
981matches2FreqTable <- function(matches,
982 index = 0,
983 minOccur = 5,
984 leftContextSize = 5,
985 rightContextSize = 5,
986 ignoreCollocateCase = FALSE,
987 stopwords = c(),
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200988 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietz2b17b212023-08-27 17:47:26 +0200989 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
990 verbose = TRUE) {
991 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
992 frequency <- NULL
993
994 if (nrow(matches) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200995 dplyr::tibble(word = c(), frequency = c())
Marc Kupietz2b17b212023-08-27 17:47:26 +0200996 } else if (index == 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200997 if (!"tokens" %in% colnames(matches) || !is.list(matches$tokens)) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200998 log_info(verbose, "Outdated KorAP server: Falling back to client side tokenization.\n")
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200999 return(snippet2FreqTable(matches$snippet, minOccur, leftContextSize, rightContextSize,
1000 ignoreCollocateCase = ignoreCollocateCase,
1001 stopwords = stopwords, oldTable = oldTable, verbose = verbose
1002 ))
Marc Kupietz2b17b212023-08-27 17:47:26 +02001003 }
1004 log_info(verbose, paste("Joining", nrow(matches), "kwics\n"))
Marc Kupietza25fbd92025-10-14 17:38:09 +02001005 for (i in seq_len(nrow(matches))) {
Marc Kupietz2b17b212023-08-27 17:47:26 +02001006 oldTable <- matches2FreqTable(
1007 matches,
1008 i,
1009 leftContextSize = leftContextSize,
1010 rightContextSize = rightContextSize,
1011 collocateFilterRegex = collocateFilterRegex,
1012 oldTable = oldTable,
1013 stopwords = stopwords
1014 )
1015 }
1016 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001017 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +01001018 group_by(word) |>
1019 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001020 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +02001021 arrange(desc(frequency))
1022 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001023 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietz2b17b212023-08-27 17:47:26 +02001024
1025 left <- tail(unlist(matches$tokens$left[index]), leftContextSize)
1026
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001027 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +02001028
1029 right <- head(unlist(matches$tokens$right[index]), rightContextSize)
1030
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001031 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +02001032
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001033 if (length(left) + length(right) == 0) {
Marc Kupietz2b17b212023-08-27 17:47:26 +02001034 oldTable
1035 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +01001036 table(c(left, right)) |>
1037 dplyr::as_tibble(.name_repair = "minimal") |>
1038 dplyr::rename(word = 1, frequency = 2) |>
1039 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001040 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +02001041 dplyr::bind_rows(oldTable)
1042 }
1043 }
1044}
1045
1046#' @importFrom magrittr debug_pipe
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001047#' @importFrom stringr str_match str_split str_detect
1048#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
1049#'
1050snippet2FreqTable <- function(snippet,
1051 minOccur = 5,
1052 leftContextSize = 5,
1053 rightContextSize = 5,
1054 ignoreCollocateCase = FALSE,
1055 stopwords = c(),
1056 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001057 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001058 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
1059 verbose = TRUE) {
1060 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
1061 frequency <- NULL
1062
1063 if (length(snippet) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001064 dplyr::tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001065 } else if (length(snippet) > 1) {
Marc Kupietza47d1502023-04-18 15:26:47 +02001066 log_info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001067 for (s in snippet) {
1068 oldTable <- snippet2FreqTable(
1069 s,
1070 leftContextSize = leftContextSize,
1071 rightContextSize = rightContextSize,
Marc Kupietz47d0d2b2021-12-19 16:38:52 +01001072 collocateFilterRegex = collocateFilterRegex,
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001073 oldTable = oldTable,
1074 stopwords = stopwords
1075 )
1076 }
Marc Kupietza47d1502023-04-18 15:26:47 +02001077 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001078 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +01001079 group_by(word) |>
1080 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001081 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001082 arrange(desc(frequency))
1083 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001084 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001085 match <-
1086 str_match(
1087 snippet,
1088 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
1089 )
1090
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001091 left <- if (leftContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001092 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001093 } else {
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001094 ""
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001095 }
1096 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001097
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001098 right <- if (rightContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001099 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001100 } else {
1101 ""
1102 }
1103 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001104
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001105 if (is.na(left[1]) || is.na(right[1]) || length(left) + length(right) == 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001106 oldTable
1107 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +01001108 table(c(left, right)) |>
1109 dplyr::as_tibble(.name_repair = "minimal") |>
1110 dplyr::rename(word = 1, frequency = 2) |>
1111 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001112 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001113 dplyr::bind_rows(oldTable)
1114 }
1115 }
1116}
1117
1118#' Preliminary synsemantic stopwords function
1119#'
1120#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +02001121#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001122#'
1123#' Preliminary synsemantic stopwords function to be used in collocation analysis.
1124#'
1125#' @details
1126#' Currently only suitable for German. See stopwords package for other languages.
1127#'
1128#' @param ... future arguments for language detection
1129#'
1130#' @family collocation analysis functions
1131#' @return Vector of synsemantic stopwords.
1132#' @export
1133synsemanticStopwords <- function(...) {
1134 res <- c(
1135 "der",
1136 "die",
1137 "und",
1138 "in",
1139 "den",
1140 "von",
1141 "mit",
1142 "das",
1143 "zu",
1144 "im",
1145 "ist",
1146 "auf",
1147 "sich",
1148 "Die",
1149 "des",
1150 "dem",
1151 "nicht",
1152 "ein",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +01001153 "Ein",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001154 "eine",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +01001155 "Eine",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001156 "es",
1157 "auch",
1158 "an",
1159 "als",
1160 "am",
1161 "aus",
1162 "Der",
1163 "bei",
1164 "er",
1165 "dass",
1166 "sie",
1167 "nach",
1168 "um",
1169 "Das",
1170 "zum",
1171 "noch",
1172 "war",
1173 "einen",
1174 "einer",
1175 "wie",
1176 "einem",
1177 "vor",
1178 "bis",
1179 "\u00fcber",
1180 "so",
1181 "aber",
1182 "Eine",
1183 "diese",
1184 "Diese",
Marc Kupietz130a2a22025-10-18 16:09:23 +02001185 "oder",
1186 "Es",
1187 "Und"
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001188 )
1189 return(res)
1190}
1191
Marc Kupietz5a336b62021-11-27 17:51:35 +01001192
Marc Kupietz76b05592021-12-19 16:26:15 +01001193# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +01001194findExample <-
1195 function(kco,
1196 query,
1197 vc = "",
1198 matchOnly = TRUE) {
1199 out <- character(length = length(query))
1200
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001201 if (length(vc) < length(query)) {
Marc Kupietz5a336b62021-11-27 17:51:35 +01001202 vc <- rep(vc, length(query))
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001203 }
Marc Kupietz5a336b62021-11-27 17:51:35 +01001204
1205 for (i in seq_along(query)) {
1206 q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001207 if (q@totalResults > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001208 q <- fetchNext(q, maxFetch = 50, randomizePageOrder = F)
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001209 example <- as.character((q@collectedMatches)$snippet[1])
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001210 out[i] <- if (matchOnly) {
1211 gsub(".*<mark>(.+)</mark>.*", "\\1", example)
Marc Kupietz5a336b62021-11-27 17:51:35 +01001212 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001213 stringr::str_replace(example, "<[^>]*>", "")
Marc Kupietz5a336b62021-11-27 17:51:35 +01001214 }
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001215 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001216 out[i] <- ""
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001217 }
Marc Kupietz5a336b62021-11-27 17:51:35 +01001218 }
1219 out
1220 }
1221
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001222collocatesQuery <-
1223 function(kco,
1224 query,
1225 vc = "",
1226 minOccur = 5,
1227 leftContextSize = 5,
1228 rightContextSize = 5,
1229 searchHitsSampleLimit = 20000,
1230 ignoreCollocateCase = FALSE,
1231 stopwords = c(),
Marc Kupietzb2862d42025-10-18 10:17:49 +02001232 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001233 ...) {
1234 frequency <- NULL
1235 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001236 if (q@totalResults == 0) {
1237 tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001238 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001239 q <- fetchNext(q, maxFetch = searchHitsSampleLimit, randomizePageOrder = TRUE)
1240 matches2FreqTable(q@collectedMatches,
1241 0,
1242 minOccur = minOccur,
1243 leftContextSize = leftContextSize,
1244 rightContextSize = rightContextSize,
1245 ignoreCollocateCase = ignoreCollocateCase,
1246 stopwords = stopwords,
Marc Kupietzb2862d42025-10-18 10:17:49 +02001247 collocateFilterRegex = collocateFilterRegex,
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001248 ...,
1249 verbose = kco@verbose
1250 ) |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +01001251 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001252 filter(frequency >= minOccur)
1253 }
1254 }