blob: ab8d2f8f02584342c53e96bd03ea0048741eb8e1 [file] [log] [blame]
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001#' @include logging.R
2setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02003
4#' Collocation analysis
5#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02006#' @family collocation analysis functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +02007#' @aliases collocationAnalysis
8#'
9#' @description
Marc Kupietzdbd431a2021-08-29 12:17:45 +020010#'
11#' Performs a collocation analysis for the given node (or query)
12#' in the given virtual corpus.
13#'
14#' @details
15#' The collocation analysis is currently implemented on the client side, as some of the
16#' functionality is not yet provided by the KorAP backend. Mainly for this reason
17#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
18#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
19#' and look for expression-internal collocates using the focus function (see examples and demo).
20#'
21#' To increase speed at the cost of accuracy and possible false negatives,
22#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
23#'
Marc Kupietze7f0d682025-02-19 10:50:59 +010024#' Note that some outdated non-DeReKo back-ends might not yet support returning tokenized matches (warning issued).
25#' In this case, the client library will fall back to client-side tokenization which might be slightly less accurate.
26#' This might lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
Marc Kupietzdbd431a2021-08-29 12:17:45 +020027#' user interface.
28#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020029#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020030#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietzdbd431a2021-08-29 12:17:45 +020031#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
32#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
33#' @param searchHitsSampleLimit limit the size of the search hits sample
34#' @param stopwords vector of stopwords not to be considered as collocates
Marc Kupietz6bd9cad2024-12-18 15:57:26 +010035#' @param withinSpan KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`.
Marc Kupietzdbd431a2021-08-29 12:17:45 +020036#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
37#' @param seed seed for random page collecting order
Marc Kupietz67edcb52021-09-20 21:54:24 +020038#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
Marc Kupietz7d400e02021-12-19 16:39:36 +010039#' @param maxRecurse apply collocation analysis recursively `maxRecurse` times
40#' @param addExamples If TRUE, examples for instances of collocations will be added in a column `example`. This makes a difference in particular if `node` is given as a lemma query.
41#' @param thresholdScore association score function (see \code{\link{association-score-functions}}) to use for computing the threshold that is applied for recursive collocation analysis calls
42#' @param threshold minimum value of `thresholdScore` function call to apply collocation analysis recursively
43#' @param localStopwords vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010044#' @param collocateFilterRegex allow only collocates matching the regular expression
Marc Kupietzde679ea2025-10-19 13:14:51 +020045#' @param queryMissingScores if TRUE, attempt to retrieve corpus-based association scores for vc/collocate combinations that would otherwise be imputed, by re-querying the KorAP backend without applying the collocate frequency threshold
Marc Kupietz4cbb5472025-10-19 12:15:25 +020046#' @param missingScoreQuantile lower quantile (evaluated per association measure) that anchors the adaptive floor used for imputing missing scores between virtual corpora; a robust spread is subtracted from this anchor so the imputed values stay below the weakest observed scores
Marc Kupietze34a8be2025-10-17 20:13:42 +020047#' @param vcLabel optional label override for the current virtual corpus (used internally when named VC collections are expanded)
Marc Kupietz67edcb52021-09-20 21:54:24 +020048#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietzdbd431a2021-08-29 12:17:45 +020049#' @inheritParams collocationScoreQuery,KorAPConnection-method
Marc Kupietz130a2a22025-10-18 16:09:23 +020050#' @return
51#' A tibble where each row represents a candidate collocate for the requested node.
52#' Columns include (depending on the selected association measures):
53#'
54#' \itemize{
55#' \item \code{node}, \code{collocate}, \code{vc}, \code{label}: identifiers for the query node, collocate, virtual corpus, and optional label.
56#' \item Frequency and contingency information such as \code{frequency}, \code{O}, \code{O1}, \code{O2}, \code{E}, \code{leftContextSize}, \code{rightContextSize}, and \code{w}.
57#' \item Association measures (e.g. \code{logDice}, \code{ll}, \code{mi}, ...), one column per requested scorer.
58#' \item Per-labelled association scores produced by multi-VC comparisons using the pattern \code{<measure>_<label>}.
59#' \item Ranks per label/measure with the pattern \code{rank_<label>_<measure>} (1 is best) and the corresponding percentile ranks \code{percentile_rank_<label>_<measure>}.
60#' \item Pairwise contrasts for two-label comparisons, e.g. \code{delta_<measure>}, \code{delta_rank_<measure>}, and \code{delta_percentile_rank_<measure>}.
61#' \item Summary columns describing the strongest labels per measure (\code{winner_*}, \code{runner_up_*}, \code{loser_*}, and \code{max_delta_*}).
62#' \item Optional helper columns such as \code{query}, \code{example}, or \code{url} when example retrieval is requested.
63#' }
Marc Kupietzc4540a22025-10-14 17:39:53 +020064#' @importFrom dplyr arrange desc slice_head bind_rows group_by mutate ungroup left_join select row_number all_of first
Marc Kupietzdbd431a2021-08-29 12:17:45 +020065#' @importFrom purrr pmap
Marc Kupietzc4540a22025-10-14 17:39:53 +020066#' @importFrom tidyr expand_grid pivot_wider
67#' @importFrom rlang sym
Marc Kupietzdbd431a2021-08-29 12:17:45 +020068#'
69#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020070#' \dontrun{
71#'
Marc Kupietz6dfeed92025-06-03 11:58:06 +020072#' # Find top collocates of "Packung" inside and outside the sports domain.
73#' KorAPConnection(verbose = TRUE) |>
74#' collocationAnalysis("Packung",
75#' vc = c("textClass=sport", "textClass!=sport"),
76#' leftContextSize = 1, rightContextSize = 1, topCollocatesLimit = 20
77#' ) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020078#' dplyr::filter(logDice >= 5)
79#' }
80#'
Marc Kupietz6ae76052021-09-21 10:34:00 +020081#' \dontrun{
82#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020083#' # Identify the most prominent light verb construction with "in ... setzen".
84#' # Note that, currently, the use of focus function disallows exactFrequencies.
Marc Kupietz4cd066d2025-02-28 15:48:23 +010085#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020086#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
Marc Kupietz6dfeed92025-06-03 11:58:06 +020087#' leftContextSize = 1, rightContextSize = 0, exactFrequencies = FALSE, topCollocatesLimit = 20
88#' )
Marc Kupietzdbd431a2021-08-29 12:17:45 +020089#' }
90#'
91#' @export
Marc Kupietz6dfeed92025-06-03 11:58:06 +020092setMethod(
93 "collocationAnalysis", "KorAPConnection",
94 function(kco,
95 node,
96 vc = "",
97 lemmatizeNodeQuery = FALSE,
98 minOccur = 5,
99 leftContextSize = 5,
100 rightContextSize = 5,
101 topCollocatesLimit = 200,
102 searchHitsSampleLimit = 20000,
103 ignoreCollocateCase = FALSE,
104 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
105 exactFrequencies = TRUE,
106 stopwords = append(RKorAPClient::synsemanticStopwords(), node),
107 seed = 7,
108 expand = length(vc) != length(node),
109 maxRecurse = 0,
110 addExamples = FALSE,
111 thresholdScore = "logDice",
112 threshold = 2.0,
113 localStopwords = c(),
114 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzde679ea2025-10-19 13:14:51 +0200115 queryMissingScores = FALSE,
Marc Kupietz9894a372025-10-18 14:51:29 +0200116 missingScoreQuantile = 0.05,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200117 vcLabel = NA_character_,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200118 ...) {
Marc Kupietzb2862d42025-10-18 10:17:49 +0200119 word <- frequency <- O <- NULL
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200120
Marc Kupietzb2862d42025-10-18 10:17:49 +0200121 if (!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nzchar(withinSpan))) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200122 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
123 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200124
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200125 warnIfNotAuthorized(kco)
Marc Kupietz581a29b2021-09-04 20:51:04 +0200126
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200127 if (lemmatizeNodeQuery) {
128 node <- lemmatizeWordQuery(node)
129 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200130
Marc Kupietze34a8be2025-10-17 20:13:42 +0200131 vcNames <- names(vc)
Marc Kupietze34a8be2025-10-17 20:13:42 +0200132 if (is.null(vcNames)) {
133 vcNames <- rep(NA_character_, length(vc))
Marc Kupietze34a8be2025-10-17 20:13:42 +0200134 }
135
136 label_lookup <- NULL
Marc Kupietzb2862d42025-10-18 10:17:49 +0200137 if (!is.null(names(vc)) && length(vc) > 0) {
138 raw_names <- names(vc)
139 if (any(!is.na(raw_names) & raw_names != "")) {
140 label_lookup <- stats::setNames(raw_names, vc)
141 }
Marc Kupietze34a8be2025-10-17 20:13:42 +0200142 }
143
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200144 result <- if (length(node) > 1 || length(vc) > 1) {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200145 grid <- if (expand) {
Marc Kupietzb2862d42025-10-18 10:17:49 +0200146 tmp_grid <- tidyr::expand_grid(node = node, idx = seq_along(vc))
147 tmp_grid$vc <- vc[tmp_grid$idx]
148 tmp_grid$vcLabel <- vcNames[tmp_grid$idx]
149 tmp_grid[, c("node", "vc", "vcLabel"), drop = FALSE]
Marc Kupietze34a8be2025-10-17 20:13:42 +0200150 } else {
151 tibble(node = node, vc = vc, vcLabel = vcNames)
152 }
153
154 multi_result <- purrr::pmap(grid, function(node, vc, vcLabel, ...) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200155 collocationAnalysis(kco,
156 node = node,
157 vc = vc,
158 minOccur = minOccur,
159 leftContextSize = leftContextSize,
160 rightContextSize = rightContextSize,
161 topCollocatesLimit = topCollocatesLimit,
162 searchHitsSampleLimit = searchHitsSampleLimit,
163 ignoreCollocateCase = ignoreCollocateCase,
164 withinSpan = withinSpan,
165 exactFrequencies = exactFrequencies,
166 stopwords = stopwords,
167 addExamples = TRUE,
168 localStopwords = localStopwords,
169 seed = seed,
170 expand = expand,
Marc Kupietz9894a372025-10-18 14:51:29 +0200171 missingScoreQuantile = missingScoreQuantile,
Marc Kupietzde679ea2025-10-19 13:14:51 +0200172 queryMissingScores = queryMissingScores,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200173 collocateFilterRegex = collocateFilterRegex,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200174 vcLabel = vcLabel,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200175 ...
176 )
177 }) |>
Marc Kupietze31322e2025-10-17 18:55:36 +0200178 bind_rows()
179
180 if (!"vc" %in% names(multi_result) || nrow(multi_result) == 0) {
181 multi_result
182 } else {
Marc Kupietzde679ea2025-10-19 13:14:51 +0200183 if (queryMissingScores) {
184 multi_result <- backfill_missing_scores(
185 multi_result,
186 grid = grid,
187 kco = kco,
188 ignoreCollocateCase = ignoreCollocateCase,
189 ...
190 )
191 }
192
Marc Kupietze34a8be2025-10-17 20:13:42 +0200193 if (!"label" %in% names(multi_result)) {
194 multi_result$label <- NA_character_
195 }
196
197 if (!is.null(label_lookup)) {
198 override <- unname(label_lookup[multi_result$vc])
199 missing_idx <- is.na(multi_result$label) | multi_result$label == ""
200 if (any(missing_idx)) {
201 multi_result$label[missing_idx] <- override[missing_idx]
202 }
203 }
204
205 missing_idx <- is.na(multi_result$label) | multi_result$label == ""
206 if (any(missing_idx)) {
207 multi_result$label[missing_idx] <- queryStringToLabel(multi_result$vc[missing_idx])
208 }
209
Marc Kupietze31322e2025-10-17 18:55:36 +0200210 multi_result |>
Marc Kupietz9894a372025-10-18 14:51:29 +0200211 add_multi_vc_comparisons(
Marc Kupietz9894a372025-10-18 14:51:29 +0200212 missingScoreQuantile = missingScoreQuantile
213 )
Marc Kupietze31322e2025-10-17 18:55:36 +0200214 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200215 } else {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200216 if ((is.na(vcLabel) || vcLabel == "") && length(vcNames) >= 1) {
217 vcLabel <- vcNames[1]
218 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200219
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200220 set.seed(seed)
221 candidates <- collocatesQuery(
222 kco,
223 node,
224 vc = vc,
225 minOccur = minOccur,
226 leftContextSize = leftContextSize,
227 rightContextSize = rightContextSize,
228 searchHitsSampleLimit = searchHitsSampleLimit,
229 ignoreCollocateCase = ignoreCollocateCase,
230 stopwords = append(stopwords, localStopwords),
Marc Kupietzb2862d42025-10-18 10:17:49 +0200231 collocateFilterRegex = collocateFilterRegex,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200232 ...
233 )
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200234
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200235 if (nrow(candidates) > 0) {
236 candidates <- candidates |>
237 filter(frequency >= minOccur) |>
238 slice_head(n = topCollocatesLimit)
239 collocationScoreQuery(
240 kco,
241 node = node,
242 collocate = candidates$word,
243 vc = vc,
244 leftContextSize = leftContextSize,
245 rightContextSize = rightContextSize,
246 observed = if (exactFrequencies) NA else candidates$frequency,
247 ignoreCollocateCase = ignoreCollocateCase,
248 withinSpan = withinSpan,
249 ...
250 ) |>
251 filter(O >= minOccur) |>
252 dplyr::arrange(dplyr::desc(logDice))
253 } else {
254 tibble()
255 }
256 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200257
258 if (!is.na(vcLabel) && vcLabel != "" && "label" %in% names(result)) {
259 result$label <- rep(vcLabel, nrow(result))
260 }
261
262 threshold_col <- thresholdScore
263 if (maxRecurse > 0 && nrow(result) > 0 && threshold_col %in% names(result)) {
264 threshold_values <- result[[threshold_col]]
265 eligible_idx <- which(!is.na(threshold_values) & threshold_values >= threshold)
266 if (length(eligible_idx) > 0) {
267 recurseWith <- result[eligible_idx, , drop = FALSE]
268 result <- collocationAnalysis(
269 kco,
270 node = paste0("(", buildCollocationQuery(
271 removeWithinSpan(recurseWith$node, withinSpan),
272 recurseWith$collocate,
273 leftContextSize = leftContextSize,
274 rightContextSize = rightContextSize,
275 withinSpan = ""
276 ), ")"),
277 vc = vc,
278 minOccur = minOccur,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200279 leftContextSize = leftContextSize,
280 rightContextSize = rightContextSize,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200281 withinSpan = withinSpan,
282 maxRecurse = maxRecurse - 1,
283 stopwords = stopwords,
284 localStopwords = recurseWith$collocate,
285 exactFrequencies = exactFrequencies,
286 searchHitsSampleLimit = searchHitsSampleLimit,
287 topCollocatesLimit = topCollocatesLimit,
288 addExamples = FALSE,
Marc Kupietz9894a372025-10-18 14:51:29 +0200289 missingScoreQuantile = missingScoreQuantile,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200290 collocateFilterRegex = collocateFilterRegex,
Marc Kupietzde679ea2025-10-19 13:14:51 +0200291 queryMissingScores = queryMissingScores,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200292 vcLabel = vcLabel
293 ) |>
294 bind_rows(result) |>
295 filter(logDice >= 2) |>
296 filter(O >= minOccur) |>
297 dplyr::arrange(dplyr::desc(logDice))
298 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200299 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200300
301 if (addExamples && nrow(result) > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200302 result$query <- buildCollocationQuery(
303 result$node,
304 result$collocate,
305 leftContextSize = leftContextSize,
306 rightContextSize = rightContextSize,
307 withinSpan = withinSpan
308 )
309 result$example <- findExample(
310 kco,
311 query = result$query,
312 vc = result$vc
313 )
314 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200315
Marc Kupietz0a292632025-10-19 14:04:36 +0200316 if (!is.null(withinSpan) && !is.na(withinSpan) && nzchar(withinSpan) &&
317 nrow(result) > 0 &&
318 "webUIRequestUrl" %in% names(result) &&
319 "query" %in% names(result)) {
320 candidate_rows <- which(!is.na(result$node) &
321 !grepl("focus\\(", result$node, perl = TRUE) &
322 !is.na(result$query) & nzchar(result$query))
323
324 if (length(candidate_rows) > 0) {
325 focused_queries <- vapply(
326 result$query[candidate_rows],
327 inject_focus_into_query,
328 character(1)
329 )
330
331 changed <- focused_queries != result$query[candidate_rows]
332 if (any(changed)) {
333 indices <- candidate_rows[changed]
334 vc_values <- as.character(result$vc)
335 vc_values[is.na(vc_values)] <- ""
336
337 result$webUIRequestUrl[indices] <- mapply(
338 function(new_query, vc_value) {
339 buildWebUIRequestUrlFromString(
340 kco@KorAPUrl,
341 new_query,
342 vc = vc_value,
343 ql = "poliqarp"
344 )
345 },
346 focused_queries[changed],
347 vc_values[indices],
348 USE.NAMES = FALSE
349 )
350 }
351 }
352 }
353
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200354 result
355 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200356)
357
Marc Kupietz76b05592021-12-19 16:26:15 +0100358# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100359removeWithinSpan <- function(query, withinSpan) {
360 if (withinSpan == "") {
361 return(query)
362 }
363 needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200364 res <- gsub(needle, "\\1", query)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100365 needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200366 res <- gsub(needle, "\\1", res)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100367 return(res)
368}
369
Marc Kupietzde679ea2025-10-19 13:14:51 +0200370backfill_missing_scores <- function(result,
371 grid,
372 kco,
373 ignoreCollocateCase,
374 ...) {
375 if (!"vc" %in% names(result) || !"node" %in% names(result) || !"collocate" %in% names(result)) {
376 return(result)
377 }
378
379 if (nrow(result) == 0) {
380 return(result)
381 }
382
383 distinct_pairs <- dplyr::distinct(result, node, collocate)
384 if (nrow(distinct_pairs) == 0) {
385 return(result)
386 }
387
388 collocates_by_node <- split(as.character(distinct_pairs$collocate), distinct_pairs$node)
389 if (length(collocates_by_node) == 0) {
390 return(result)
391 }
392
393 required_combinations <- unique(as.data.frame(grid[, c("node", "vc", "vcLabel")], drop = FALSE))
394 for (i in seq_len(nrow(required_combinations))) {
395 node_value <- required_combinations$node[i]
396 vc_value <- required_combinations$vc[i]
397
398 collocate_pool <- collocates_by_node[[node_value]]
399 if (is.null(collocate_pool) || length(collocate_pool) == 0) {
400 next
401 }
402
403 existing_idx <- result$node == node_value & result$vc == vc_value
404 existing_collocates <- unique(as.character(result$collocate[existing_idx]))
405 missing_collocates <- setdiff(unique(collocate_pool), existing_collocates)
406 missing_collocates <- missing_collocates[!is.na(missing_collocates) & nzchar(missing_collocates)]
407
408 if (length(missing_collocates) == 0) {
409 next
410 }
411
412 context_rows <- result[result$node == node_value & result$vc == vc_value, , drop = FALSE]
413 if (nrow(context_rows) == 0) {
414 context_rows <- result[result$node == node_value, , drop = FALSE]
415 }
416
417 left_size <- context_rows$leftContextSize[!is.na(context_rows$leftContextSize)][1]
418 if (is.na(left_size) || length(left_size) == 0) {
419 left_size <- result$leftContextSize[!is.na(result$leftContextSize)][1]
420 }
421 if (is.na(left_size) || length(left_size) == 0) {
422 left_size <- 5
423 }
424
425 right_size <- context_rows$rightContextSize[!is.na(context_rows$rightContextSize)][1]
426 if (is.na(right_size) || length(right_size) == 0) {
427 right_size <- result$rightContextSize[!is.na(result$rightContextSize)][1]
428 }
429 if (is.na(right_size) || length(right_size) == 0) {
430 right_size <- 5
431 }
432
433 within_span_value <- ""
434 if ("query" %in% names(context_rows)) {
435 query_candidate <- context_rows$query[!is.na(context_rows$query) & nzchar(context_rows$query)][1]
436 if (!is.na(query_candidate) && nzchar(query_candidate)) {
437 match_one <- regexec("^\\(*contains\\(<([^>]+)>,", query_candidate)
438 matches <- regmatches(query_candidate, match_one)
439 if (length(matches) >= 1 && length(matches[[1]]) >= 2) {
440 within_span_value <- matches[[1]][2]
441 }
442 }
443 }
444
445 new_rows <- collocationScoreQuery(
446 kco,
447 node = node_value,
448 collocate = missing_collocates,
449 vc = vc_value,
450 leftContextSize = left_size,
451 rightContextSize = right_size,
452 ignoreCollocateCase = ignoreCollocateCase,
453 withinSpan = within_span_value,
454 ...
455 )
456
457 if (nrow(new_rows) == 0) {
458 next
459 }
460
461 if (!is.null(required_combinations$vcLabel[i]) && !is.na(required_combinations$vcLabel[i]) && required_combinations$vcLabel[i] != "" && "label" %in% names(new_rows)) {
462 new_rows$label <- required_combinations$vcLabel[i]
463 }
464
465 result <- dplyr::bind_rows(result, new_rows)
466 }
467
468 result
469}
470
Marc Kupietz0a292632025-10-19 14:04:36 +0200471inject_focus_into_query <- function(query) {
472 if (is.null(query) || is.na(query)) {
473 return(query)
474 }
475
476 trimmed <- trimws(query)
477 if (!nzchar(trimmed)) {
478 return(query)
479 }
480
481 if (!grepl("^contains\\(<[^>]+>", trimmed, perl = TRUE)) {
482 return(query)
483 }
484
485 if (grepl("focus\\(", trimmed, perl = TRUE)) {
486 return(query)
487 }
488
489 pattern <- "^contains\\(<([^>]+)>\\s*,\\s*\\((.*)\\)\\)\\s*$"
490 matches <- regexec(pattern, trimmed, perl = TRUE)
491 components <- regmatches(trimmed, matches)
492 if (length(components) == 0 || length(components[[1]]) < 3) {
493 return(query)
494 }
495
496 span <- components[[1]][2]
497 inner <- components[[1]][3]
498 parts <- strsplit(inner, "\\|", perl = TRUE)[[1]]
499 parts <- trimws(parts)
500 parts <- parts[nzchar(parts)]
501
502 if (length(parts) == 0) {
503 return(query)
504 }
505
506 focused <- paste0("focus({", parts, "})")
507 combined <- paste(focused, collapse = " | ")
508
509 sprintf("contains(<%s>, (%s))", span, combined)
510}
511
Marc Kupietz77852b22025-10-19 11:35:34 +0200512add_multi_vc_comparisons <- function(result, missingScoreQuantile = 0.05) {
Marc Kupietz28a29842025-10-18 12:25:09 +0200513 label <- node <- collocate <- NULL
Marc Kupietzc4540a22025-10-14 17:39:53 +0200514
515 if (!"label" %in% names(result) || dplyr::n_distinct(result$label) < 2) {
516 return(result)
517 }
518
519 numeric_cols <- names(result)[vapply(result, is.numeric, logical(1))]
520 non_score_cols <- c("N", "O", "O1", "O2", "E", "w", "leftContextSize", "rightContextSize", "frequency")
521 score_cols <- setdiff(numeric_cols, non_score_cols)
522
523 if (length(score_cols) == 0) {
524 return(result)
525 }
526
Marc Kupietz9894a372025-10-18 14:51:29 +0200527 compute_score_floor <- function(values) {
Marc Kupietz4cbb5472025-10-19 12:15:25 +0200528 # Estimate a conservative floor so missing scores can be imputed without favoring any label
Marc Kupietz9894a372025-10-18 14:51:29 +0200529 finite_values <- values[is.finite(values)]
530 if (length(finite_values) == 0) {
531 return(0)
532 }
533
534 prob <- min(max(missingScoreQuantile, 0), 0.5)
Marc Kupietz4cbb5472025-10-19 12:15:25 +0200535 # Use a lower quantile as the anchor to stay near the weakest attested scores
Marc Kupietz9894a372025-10-18 14:51:29 +0200536 q_val <- suppressWarnings(stats::quantile(finite_values,
537 probs = prob,
538 names = FALSE,
539 type = 7
540 ))
541
542 if (!is.finite(q_val)) {
543 q_val <- suppressWarnings(min(finite_values, na.rm = TRUE))
544 }
545
546 min_val <- suppressWarnings(min(finite_values, na.rm = TRUE))
547 if (!is.finite(min_val)) {
548 min_val <- 0
549 }
550
551 spread_candidates <- c(
552 suppressWarnings(stats::IQR(finite_values, na.rm = TRUE, type = 7)),
553 stats::sd(finite_values, na.rm = TRUE),
554 abs(q_val) * 0.1,
555 abs(min_val - q_val)
556 )
557 spread_candidates <- spread_candidates[is.finite(spread_candidates)]
558
559 spread <- 0
560 if (length(spread_candidates) > 0) {
561 spread <- max(spread_candidates)
562 }
563 if (!is.finite(spread) || spread == 0) {
564 spread <- max(abs(q_val), abs(min_val), 1e-06)
565 }
566
Marc Kupietz4cbb5472025-10-19 12:15:25 +0200567 # Step away from the anchor by a robust spread estimate to avoid ties with real scores
Marc Kupietz9894a372025-10-18 14:51:29 +0200568 candidate <- q_val - spread
569 if (!is.finite(candidate)) {
570 candidate <- min_val
571 }
572
573 floor_value <- suppressWarnings(min(c(candidate, min_val), na.rm = TRUE))
574 if (!is.finite(floor_value)) {
575 floor_value <- min_val
576 }
577 if (!is.finite(floor_value)) {
578 floor_value <- 0
579 }
580
581 floor_value
582 }
583
584 score_replacements <- stats::setNames(
585 vapply(score_cols, function(col) {
586 compute_score_floor(result[[col]])
587 }, numeric(1)),
588 score_cols
589 )
590
Marc Kupietzc4540a22025-10-14 17:39:53 +0200591 comparison <- result |>
Marc Kupietz28a29842025-10-18 12:25:09 +0200592 dplyr::select(node, collocate, label, dplyr::all_of(score_cols)) |>
593 tidyr::pivot_wider(
Marc Kupietzc4540a22025-10-14 17:39:53 +0200594 names_from = label,
Marc Kupietz28a29842025-10-18 12:25:09 +0200595 values_from = dplyr::all_of(score_cols),
Marc Kupietzc4540a22025-10-14 17:39:53 +0200596 names_glue = "{.value}_{make.names(label)}",
597 values_fn = dplyr::first
598 )
599
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200600 raw_labels <- unique(result$label)
601 labels <- make.names(raw_labels)
602 label_map <- stats::setNames(raw_labels, labels)
Marc Kupietzc4540a22025-10-14 17:39:53 +0200603
Marc Kupietz28a29842025-10-18 12:25:09 +0200604 rank_data <- result |>
605 dplyr::distinct(node, collocate)
606
607 for (i in seq_along(raw_labels)) {
608 raw_lab <- raw_labels[i]
609 safe_lab <- labels[i]
610 label_df <- result[result$label == raw_lab, c("node", "collocate", score_cols), drop = FALSE]
611 if (nrow(label_df) == 0) {
612 next
613 }
614 label_df <- dplyr::distinct(label_df)
615 rank_tbl <- label_df[, c("node", "collocate"), drop = FALSE]
616 for (col in score_cols) {
617 rank_col_name <- paste0("rank_", safe_lab, "_", col)
Marc Kupietz130a2a22025-10-18 16:09:23 +0200618 percentile_col_name <- paste0("percentile_rank_", safe_lab, "_", col)
Marc Kupietz28a29842025-10-18 12:25:09 +0200619 values <- label_df[[col]]
620 ranks <- rep(NA_real_, length(values))
Marc Kupietz130a2a22025-10-18 16:09:23 +0200621 percentiles <- rep(NA_real_, length(values))
Marc Kupietz28a29842025-10-18 12:25:09 +0200622 valid_idx <- which(!is.na(values))
623 if (length(valid_idx) > 0) {
624 ranks[valid_idx] <- rank(-values[valid_idx], ties.method = "first")
Marc Kupietz130a2a22025-10-18 16:09:23 +0200625 total <- length(valid_idx)
626 percentiles[valid_idx] <- 1 - (ranks[valid_idx] - 1) / total
Marc Kupietz28a29842025-10-18 12:25:09 +0200627 }
628 rank_tbl[[rank_col_name]] <- ranks
Marc Kupietz130a2a22025-10-18 16:09:23 +0200629 rank_tbl[[percentile_col_name]] <- percentiles
Marc Kupietz28a29842025-10-18 12:25:09 +0200630 }
631 rank_data <- dplyr::left_join(rank_data, rank_tbl, by = c("node", "collocate"))
632 }
633
634 comparison <- dplyr::left_join(comparison, rank_data, by = c("node", "collocate"))
635
636 rank_replacements <- numeric(0)
637 rank_column_names <- grep("^rank_", names(comparison), value = TRUE)
638 if (length(rank_column_names) > 0) {
639 rank_replacements <- stats::setNames(
640 vapply(rank_column_names, function(col) {
641 col_values <- comparison[[col]]
642 valid_values <- col_values[!is.na(col_values)]
643 if (length(valid_values) == 0) {
644 nrow(comparison) + 1
645 } else {
646 suppressWarnings(max(valid_values, na.rm = TRUE)) + 1
647 }
648 }, numeric(1)),
649 rank_column_names
650 )
651 }
652
Marc Kupietz130a2a22025-10-18 16:09:23 +0200653 percentile_replacements <- numeric(0)
654 percentile_column_names <- grep("^percentile_rank_", names(comparison), value = TRUE)
655 if (length(percentile_column_names) > 0) {
656 percentile_replacements <- stats::setNames(
657 rep(0, length(percentile_column_names)),
658 percentile_column_names
659 )
660 }
661
Marc Kupietz28a29842025-10-18 12:25:09 +0200662 collapse_label_values <- function(indices, safe_labels_vec) {
663 if (length(indices) == 0) {
664 return(NA_character_)
665 }
666 labs <- label_map[safe_labels_vec[indices]]
667 fallback <- safe_labels_vec[indices]
668 labs[is.na(labs) | labs == ""] <- fallback[is.na(labs) | labs == ""]
669 labs <- labs[!is.na(labs) & labs != ""]
670 if (length(labs) == 0) {
671 return(NA_character_)
672 }
673 paste(unique(labs), collapse = ", ")
674 }
675
Marc Kupietzc4540a22025-10-14 17:39:53 +0200676 if (length(labels) == 2) {
Marc Kupietz9894a372025-10-18 14:51:29 +0200677 fill_scores <- function(x, y, measure_col) {
678 replacement <- score_replacements[[measure_col]]
679 fallback_min <- suppressWarnings(min(c(x, y), na.rm = TRUE))
680 if (!is.finite(fallback_min)) {
681 fallback_min <- 0
Marc Kupietzc4540a22025-10-14 17:39:53 +0200682 }
Marc Kupietz9894a372025-10-18 14:51:29 +0200683 if (!is.null(replacement) && is.finite(replacement)) {
684 replacement <- min(replacement, fallback_min)
685 } else {
686 replacement <- fallback_min
687 }
688 if (!is.finite(replacement)) {
689 replacement <- 0
690 }
691 if (any(is.na(x))) {
692 x[is.na(x)] <- replacement
693 }
694 if (any(is.na(y))) {
695 y[is.na(y)] <- replacement
696 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200697 list(x = x, y = y)
698 }
699
Marc Kupietz130a2a22025-10-18 16:09:23 +0200700 fill_percentiles <- function(x, y, left_pct_col, right_pct_col) {
701 replacement_left <- percentile_replacements[[left_pct_col]]
702 if (is.null(replacement_left) || !is.finite(replacement_left)) {
703 replacement_left <- 0
704 }
705 replacement_right <- percentile_replacements[[right_pct_col]]
706 if (is.null(replacement_right) || !is.finite(replacement_right)) {
707 replacement_right <- 0
708 }
709 if (any(is.na(x))) {
710 x[is.na(x)] <- replacement_left
711 }
712 if (any(is.na(y))) {
713 y[is.na(y)] <- replacement_right
714 }
715 list(x = x, y = y)
716 }
717
Marc Kupietz28a29842025-10-18 12:25:09 +0200718 fill_ranks <- function(x, y, left_rank_col, right_rank_col) {
719 fallback <- nrow(comparison) + 1
720 replacement_left <- rank_replacements[[left_rank_col]]
721 if (is.null(replacement_left) || !is.finite(replacement_left)) {
722 replacement_left <- fallback
Marc Kupietzc4540a22025-10-14 17:39:53 +0200723 }
Marc Kupietz28a29842025-10-18 12:25:09 +0200724 replacement_right <- rank_replacements[[right_rank_col]]
725 if (is.null(replacement_right) || !is.finite(replacement_right)) {
726 replacement_right <- fallback
727 }
728 if (any(is.na(x))) {
729 x[is.na(x)] <- replacement_left
730 }
731 if (any(is.na(y))) {
732 y[is.na(y)] <- replacement_right
733 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200734 list(x = x, y = y)
735 }
736
737 left_label <- labels[1]
738 right_label <- labels[2]
739
740 for (col in score_cols) {
741 left_col <- paste0(col, "_", left_label)
742 right_col <- paste0(col, "_", right_label)
743 if (!all(c(left_col, right_col) %in% names(comparison))) {
744 next
745 }
Marc Kupietz9894a372025-10-18 14:51:29 +0200746 filled <- fill_scores(comparison[[left_col]], comparison[[right_col]], col)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200747 comparison[[left_col]] <- filled$x
748 comparison[[right_col]] <- filled$y
Marc Kupietzc4540a22025-10-14 17:39:53 +0200749 comparison[[paste0("delta_", col)]] <- filled$x - filled$y
Marc Kupietz28a29842025-10-18 12:25:09 +0200750 rank_left <- paste0("rank_", left_label, "_", col)
751 rank_right <- paste0("rank_", right_label, "_", col)
752 if (all(c(rank_left, rank_right) %in% names(comparison))) {
753 filled_rank <- fill_ranks(
754 comparison[[rank_left]],
755 comparison[[rank_right]],
756 rank_left,
757 rank_right
758 )
759 comparison[[paste0("delta_rank_", col)]] <- filled_rank$x - filled_rank$y
760 }
Marc Kupietz130a2a22025-10-18 16:09:23 +0200761 pct_left <- paste0("percentile_rank_", left_label, "_", col)
762 pct_right <- paste0("percentile_rank_", right_label, "_", col)
763 if (all(c(pct_left, pct_right) %in% names(comparison))) {
764 filled_pct <- fill_percentiles(
765 comparison[[pct_left]],
766 comparison[[pct_right]],
767 pct_left,
768 pct_right
769 )
770 comparison[[paste0("delta_percentile_rank_", col)]] <- filled_pct$x - filled_pct$y
771 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200772 }
773 }
774
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200775 for (col in score_cols) {
776 value_cols <- paste0(col, "_", labels)
777 existing <- value_cols %in% names(comparison)
778 if (!any(existing)) {
779 next
780 }
781 value_cols <- value_cols[existing]
782 safe_labels <- labels[existing]
783
784 score_values <- comparison[, value_cols, drop = FALSE]
785
786 winner_label_col <- paste0("winner_", col)
787 winner_value_col <- paste0("winner_", col, "_value")
788 runner_label_col <- paste0("runner_up_", col)
789 runner_value_col <- paste0("runner_up_", col, "_value")
Marc Kupietzb2862d42025-10-18 10:17:49 +0200790 loser_label_col <- paste0("loser_", col)
791 loser_value_col <- paste0("loser_", col, "_value")
792 max_delta_col <- paste0("max_delta_", col)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200793
794 if (nrow(score_values) == 0) {
795 comparison[[winner_label_col]] <- character(0)
796 comparison[[winner_value_col]] <- numeric(0)
797 comparison[[runner_label_col]] <- character(0)
798 comparison[[runner_value_col]] <- numeric(0)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200799 comparison[[loser_label_col]] <- character(0)
800 comparison[[loser_value_col]] <- numeric(0)
801 comparison[[max_delta_col]] <- numeric(0)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200802 next
803 }
804
805 score_matrix <- as.matrix(score_values)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200806 storage.mode(score_matrix) <- "numeric"
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200807
Marc Kupietzb2862d42025-10-18 10:17:49 +0200808 n_rows <- nrow(score_matrix)
809 winner_labels <- rep(NA_character_, n_rows)
810 winner_values <- rep(NA_real_, n_rows)
811 runner_labels <- rep(NA_character_, n_rows)
812 runner_values <- rep(NA_real_, n_rows)
813 loser_labels <- rep(NA_character_, n_rows)
814 loser_values <- rep(NA_real_, n_rows)
815 max_deltas <- rep(NA_real_, n_rows)
816
Marc Kupietzb2862d42025-10-18 10:17:49 +0200817 if (n_rows > 0) {
818 for (i in seq_len(n_rows)) {
819 numeric_row <- as.numeric(score_matrix[i, ])
820 if (all(is.na(numeric_row))) {
821 next
822 }
823
Marc Kupietz9894a372025-10-18 14:51:29 +0200824 replacement <- score_replacements[[col]]
825 fallback_min <- suppressWarnings(min(numeric_row, na.rm = TRUE))
826 if (!is.finite(fallback_min)) {
827 fallback_min <- 0
Marc Kupietzb2862d42025-10-18 10:17:49 +0200828 }
Marc Kupietz9894a372025-10-18 14:51:29 +0200829 if (!is.null(replacement) && is.finite(replacement)) {
830 replacement <- min(replacement, fallback_min)
831 } else {
832 replacement <- fallback_min
833 }
834 if (!is.finite(replacement)) {
835 replacement <- 0
836 }
837 if (any(is.na(numeric_row))) {
838 numeric_row[is.na(numeric_row)] <- replacement
839 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200840 score_matrix[i, ] <- numeric_row
841
842 max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
843 max_idx <- which(numeric_row == max_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200844 winner_labels[i] <- collapse_label_values(max_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200845 winner_values[i] <- max_val
846
847 unique_vals <- sort(unique(numeric_row), decreasing = TRUE)
848 if (length(unique_vals) >= 2) {
849 runner_val <- unique_vals[2]
850 runner_idx <- which(numeric_row == runner_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200851 runner_labels[i] <- collapse_label_values(runner_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200852 runner_values[i] <- runner_val
853 }
854
855 min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
856 min_idx <- which(numeric_row == min_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200857 loser_labels[i] <- collapse_label_values(min_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200858 loser_values[i] <- min_val
859
860 if (is.finite(max_val) && is.finite(min_val)) {
861 max_deltas[i] <- max_val - min_val
862 }
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200863 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200864 }
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200865
Marc Kupietzb2862d42025-10-18 10:17:49 +0200866 comparison[, value_cols] <- score_matrix
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200867 comparison[[winner_label_col]] <- winner_labels
868 comparison[[winner_value_col]] <- winner_values
869 comparison[[runner_label_col]] <- runner_labels
870 comparison[[runner_value_col]] <- runner_values
Marc Kupietzb2862d42025-10-18 10:17:49 +0200871 comparison[[loser_label_col]] <- loser_labels
872 comparison[[loser_value_col]] <- loser_values
873 comparison[[max_delta_col]] <- max_deltas
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200874 }
875
Marc Kupietz28a29842025-10-18 12:25:09 +0200876 for (col in score_cols) {
877 rank_cols <- paste0("rank_", labels, "_", col)
878 existing <- rank_cols %in% names(comparison)
879 if (!any(existing)) {
880 next
881 }
882 rank_cols <- rank_cols[existing]
883 safe_labels <- labels[existing]
884 rank_values <- comparison[, rank_cols, drop = FALSE]
885
886 winner_rank_label_col <- paste0("winner_rank_", col)
887 winner_rank_value_col <- paste0("winner_rank_", col, "_value")
888 runner_rank_label_col <- paste0("runner_up_rank_", col)
889 runner_rank_value_col <- paste0("runner_up_rank_", col, "_value")
890 loser_rank_label_col <- paste0("loser_rank_", col)
891 loser_rank_value_col <- paste0("loser_rank_", col, "_value")
892 max_delta_rank_col <- paste0("max_delta_rank_", col)
893
894 if (nrow(rank_values) == 0) {
895 comparison[[winner_rank_label_col]] <- character(0)
896 comparison[[winner_rank_value_col]] <- numeric(0)
897 comparison[[runner_rank_label_col]] <- character(0)
898 comparison[[runner_rank_value_col]] <- numeric(0)
899 comparison[[loser_rank_label_col]] <- character(0)
900 comparison[[loser_rank_value_col]] <- numeric(0)
901 comparison[[max_delta_rank_col]] <- numeric(0)
902 next
903 }
904
905 rank_matrix <- as.matrix(rank_values)
906 storage.mode(rank_matrix) <- "numeric"
907
908 n_rows <- nrow(rank_matrix)
909 winner_labels <- rep(NA_character_, n_rows)
910 winner_values <- rep(NA_real_, n_rows)
911 runner_labels <- rep(NA_character_, n_rows)
912 runner_values <- rep(NA_real_, n_rows)
913 loser_labels <- rep(NA_character_, n_rows)
914 loser_values <- rep(NA_real_, n_rows)
915 max_deltas <- rep(NA_real_, n_rows)
916
917 for (i in seq_len(n_rows)) {
918 numeric_row <- as.numeric(rank_matrix[i, ])
919 if (all(is.na(numeric_row))) {
920 next
921 }
922
923 if (length(rank_cols) > 0) {
924 replacement_vec <- rank_replacements[rank_cols]
925 replacement_vec[is.na(replacement_vec)] <- nrow(comparison) + 1
926 missing_idx <- which(is.na(numeric_row))
927 if (length(missing_idx) > 0) {
928 numeric_row[missing_idx] <- replacement_vec[missing_idx]
929 }
930 }
931
932 valid_idx <- seq_along(numeric_row)
933 valid_values <- numeric_row[valid_idx]
934 min_val <- suppressWarnings(min(valid_values, na.rm = TRUE))
935 min_positions <- valid_idx[which(valid_values == min_val)]
936 winner_labels[i] <- collapse_label_values(min_positions, safe_labels)
937 winner_values[i] <- min_val
938
939 ordered_vals <- sort(unique(valid_values), decreasing = FALSE)
940 if (length(ordered_vals) >= 2) {
941 runner_val <- ordered_vals[2]
942 runner_positions <- valid_idx[which(valid_values == runner_val)]
943 runner_labels[i] <- collapse_label_values(runner_positions, safe_labels)
944 runner_values[i] <- runner_val
945 }
946
947 max_val <- suppressWarnings(max(valid_values, na.rm = TRUE))
948 max_positions <- valid_idx[which(valid_values == max_val)]
949 loser_labels[i] <- collapse_label_values(max_positions, safe_labels)
950 loser_values[i] <- max_val
951
952 if (is.finite(max_val) && is.finite(min_val)) {
953 max_deltas[i] <- max_val - min_val
954 }
955 }
956
957 comparison[[winner_rank_label_col]] <- winner_labels
958 comparison[[winner_rank_value_col]] <- winner_values
959 comparison[[runner_rank_label_col]] <- runner_labels
960 comparison[[runner_rank_value_col]] <- runner_values
961 comparison[[loser_rank_label_col]] <- loser_labels
962 comparison[[loser_rank_value_col]] <- loser_values
963 comparison[[max_delta_rank_col]] <- max_deltas
964 }
965
Marc Kupietz130a2a22025-10-18 16:09:23 +0200966 for (col in score_cols) {
967 pct_cols <- paste0("percentile_rank_", labels, "_", col)
968 existing <- pct_cols %in% names(comparison)
969 if (!any(existing)) {
970 next
971 }
972 pct_cols <- pct_cols[existing]
973 safe_labels <- labels[existing]
974 pct_values <- comparison[, pct_cols, drop = FALSE]
975
976 winner_pct_label_col <- paste0("winner_percentile_rank_", col)
977 winner_pct_value_col <- paste0("winner_percentile_rank_", col, "_value")
978 runner_pct_label_col <- paste0("runner_up_percentile_rank_", col)
979 runner_pct_value_col <- paste0("runner_up_percentile_rank_", col, "_value")
980 loser_pct_label_col <- paste0("loser_percentile_rank_", col)
981 loser_pct_value_col <- paste0("loser_percentile_rank_", col, "_value")
982 max_delta_pct_col <- paste0("max_delta_percentile_rank_", col)
983
984 if (nrow(pct_values) == 0) {
985 comparison[[winner_pct_label_col]] <- character(0)
986 comparison[[winner_pct_value_col]] <- numeric(0)
987 comparison[[runner_pct_label_col]] <- character(0)
988 comparison[[runner_pct_value_col]] <- numeric(0)
989 comparison[[loser_pct_label_col]] <- character(0)
990 comparison[[loser_pct_value_col]] <- numeric(0)
991 comparison[[max_delta_pct_col]] <- numeric(0)
992 next
993 }
994
995 pct_matrix <- as.matrix(pct_values)
996 storage.mode(pct_matrix) <- "numeric"
997
998 n_rows <- nrow(pct_matrix)
999 winner_labels <- rep(NA_character_, n_rows)
1000 winner_values <- rep(NA_real_, n_rows)
1001 runner_labels <- rep(NA_character_, n_rows)
1002 runner_values <- rep(NA_real_, n_rows)
1003 loser_labels <- rep(NA_character_, n_rows)
1004 loser_values <- rep(NA_real_, n_rows)
1005 max_deltas <- rep(NA_real_, n_rows)
1006
1007 if (n_rows > 0) {
1008 for (i in seq_len(n_rows)) {
1009 numeric_row <- as.numeric(pct_matrix[i, ])
1010 if (all(is.na(numeric_row))) {
1011 next
1012 }
1013
1014 if (any(is.na(numeric_row))) {
1015 numeric_row[is.na(numeric_row)] <- 0
1016 }
1017 pct_matrix[i, ] <- numeric_row
1018
1019 max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
1020 max_idx <- which(numeric_row == max_val)
1021 winner_labels[i] <- collapse_label_values(max_idx, safe_labels)
1022 winner_values[i] <- max_val
1023
1024 unique_vals <- sort(unique(numeric_row), decreasing = TRUE)
1025 if (length(unique_vals) >= 2) {
1026 runner_val <- unique_vals[2]
1027 runner_idx <- which(numeric_row == runner_val)
1028 runner_labels[i] <- collapse_label_values(runner_idx, safe_labels)
1029 runner_values[i] <- runner_val
1030 }
1031
1032 min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
1033 min_idx <- which(numeric_row == min_val)
1034 loser_labels[i] <- collapse_label_values(min_idx, safe_labels)
1035 loser_values[i] <- min_val
1036
1037 if (is.finite(max_val) && is.finite(min_val)) {
1038 max_deltas[i] <- max_val - min_val
1039 }
1040 }
1041 }
1042
1043 comparison[, pct_cols] <- pct_matrix
1044 comparison[[winner_pct_label_col]] <- winner_labels
1045 comparison[[winner_pct_value_col]] <- winner_values
1046 comparison[[runner_pct_label_col]] <- runner_labels
1047 comparison[[runner_pct_value_col]] <- runner_values
1048 comparison[[loser_pct_label_col]] <- loser_labels
1049 comparison[[loser_pct_value_col]] <- loser_values
1050 comparison[[max_delta_pct_col]] <- max_deltas
1051 }
1052
Marc Kupietzc4540a22025-10-14 17:39:53 +02001053 dplyr::left_join(result, comparison, by = c("node", "collocate"))
1054}
1055
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001056#' @importFrom magrittr debug_pipe
Marc Kupietz2b17b212023-08-27 17:47:26 +02001057#' @importFrom stringr str_detect
1058#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
1059#'
1060matches2FreqTable <- function(matches,
1061 index = 0,
1062 minOccur = 5,
1063 leftContextSize = 5,
1064 rightContextSize = 5,
1065 ignoreCollocateCase = FALSE,
1066 stopwords = c(),
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001067 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietz2b17b212023-08-27 17:47:26 +02001068 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
1069 verbose = TRUE) {
1070 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
1071 frequency <- NULL
1072
1073 if (nrow(matches) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001074 dplyr::tibble(word = c(), frequency = c())
Marc Kupietz2b17b212023-08-27 17:47:26 +02001075 } else if (index == 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001076 if (!"tokens" %in% colnames(matches) || !is.list(matches$tokens)) {
Marc Kupietz2b17b212023-08-27 17:47:26 +02001077 log_info(verbose, "Outdated KorAP server: Falling back to client side tokenization.\n")
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001078 return(snippet2FreqTable(matches$snippet, minOccur, leftContextSize, rightContextSize,
1079 ignoreCollocateCase = ignoreCollocateCase,
1080 stopwords = stopwords, oldTable = oldTable, verbose = verbose
1081 ))
Marc Kupietz2b17b212023-08-27 17:47:26 +02001082 }
1083 log_info(verbose, paste("Joining", nrow(matches), "kwics\n"))
Marc Kupietza25fbd92025-10-14 17:38:09 +02001084 for (i in seq_len(nrow(matches))) {
Marc Kupietz2b17b212023-08-27 17:47:26 +02001085 oldTable <- matches2FreqTable(
1086 matches,
1087 i,
1088 leftContextSize = leftContextSize,
1089 rightContextSize = rightContextSize,
1090 collocateFilterRegex = collocateFilterRegex,
1091 oldTable = oldTable,
1092 stopwords = stopwords
1093 )
1094 }
1095 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001096 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +01001097 group_by(word) |>
1098 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001099 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +02001100 arrange(desc(frequency))
1101 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001102 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietz2b17b212023-08-27 17:47:26 +02001103
1104 left <- tail(unlist(matches$tokens$left[index]), leftContextSize)
1105
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001106 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +02001107
1108 right <- head(unlist(matches$tokens$right[index]), rightContextSize)
1109
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001110 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +02001111
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001112 if (length(left) + length(right) == 0) {
Marc Kupietz2b17b212023-08-27 17:47:26 +02001113 oldTable
1114 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +01001115 table(c(left, right)) |>
1116 dplyr::as_tibble(.name_repair = "minimal") |>
1117 dplyr::rename(word = 1, frequency = 2) |>
1118 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001119 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +02001120 dplyr::bind_rows(oldTable)
1121 }
1122 }
1123}
1124
1125#' @importFrom magrittr debug_pipe
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001126#' @importFrom stringr str_match str_split str_detect
1127#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
1128#'
1129snippet2FreqTable <- function(snippet,
1130 minOccur = 5,
1131 leftContextSize = 5,
1132 rightContextSize = 5,
1133 ignoreCollocateCase = FALSE,
1134 stopwords = c(),
1135 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001136 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001137 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
1138 verbose = TRUE) {
1139 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
1140 frequency <- NULL
1141
1142 if (length(snippet) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001143 dplyr::tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001144 } else if (length(snippet) > 1) {
Marc Kupietza47d1502023-04-18 15:26:47 +02001145 log_info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001146 for (s in snippet) {
1147 oldTable <- snippet2FreqTable(
1148 s,
1149 leftContextSize = leftContextSize,
1150 rightContextSize = rightContextSize,
Marc Kupietz47d0d2b2021-12-19 16:38:52 +01001151 collocateFilterRegex = collocateFilterRegex,
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001152 oldTable = oldTable,
1153 stopwords = stopwords
1154 )
1155 }
Marc Kupietza47d1502023-04-18 15:26:47 +02001156 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001157 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +01001158 group_by(word) |>
1159 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001160 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001161 arrange(desc(frequency))
1162 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001163 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001164 match <-
1165 str_match(
1166 snippet,
1167 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
1168 )
1169
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001170 left <- if (leftContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001171 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001172 } else {
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001173 ""
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001174 }
1175 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001176
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001177 right <- if (rightContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001178 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001179 } else {
1180 ""
1181 }
1182 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001183
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001184 if (is.na(left[1]) || is.na(right[1]) || length(left) + length(right) == 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001185 oldTable
1186 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +01001187 table(c(left, right)) |>
1188 dplyr::as_tibble(.name_repair = "minimal") |>
1189 dplyr::rename(word = 1, frequency = 2) |>
1190 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001191 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001192 dplyr::bind_rows(oldTable)
1193 }
1194 }
1195}
1196
1197#' Preliminary synsemantic stopwords function
1198#'
1199#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +02001200#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001201#'
1202#' Preliminary synsemantic stopwords function to be used in collocation analysis.
1203#'
1204#' @details
1205#' Currently only suitable for German. See stopwords package for other languages.
1206#'
1207#' @param ... future arguments for language detection
1208#'
1209#' @family collocation analysis functions
1210#' @return Vector of synsemantic stopwords.
1211#' @export
1212synsemanticStopwords <- function(...) {
Marc Kupietzc79155b2025-10-19 13:42:55 +02001213 base <- c(
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001214 "der",
1215 "die",
1216 "und",
1217 "in",
1218 "den",
1219 "von",
1220 "mit",
1221 "das",
1222 "zu",
1223 "im",
1224 "ist",
1225 "auf",
1226 "sich",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001227 "des",
1228 "dem",
1229 "nicht",
1230 "ein",
1231 "eine",
1232 "es",
1233 "auch",
1234 "an",
1235 "als",
1236 "am",
1237 "aus",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001238 "bei",
1239 "er",
1240 "dass",
1241 "sie",
1242 "nach",
1243 "um",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001244 "zum",
1245 "noch",
1246 "war",
1247 "einen",
1248 "einer",
1249 "wie",
1250 "einem",
1251 "vor",
1252 "bis",
1253 "\u00fcber",
1254 "so",
1255 "aber",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001256 "diese",
Marc Kupietzc79155b2025-10-19 13:42:55 +02001257 "oder"
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001258 )
Marc Kupietzc79155b2025-10-19 13:42:55 +02001259
1260 lower <- unique(tolower(base))
1261 capitalized <- paste0(toupper(substr(lower, 1, 1)), substring(lower, 2))
1262
1263 unique(c(lower, capitalized))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001264}
1265
Marc Kupietz5a336b62021-11-27 17:51:35 +01001266
Marc Kupietz76b05592021-12-19 16:26:15 +01001267# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +01001268findExample <-
1269 function(kco,
1270 query,
1271 vc = "",
1272 matchOnly = TRUE) {
1273 out <- character(length = length(query))
1274
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001275 if (length(vc) < length(query)) {
Marc Kupietz5a336b62021-11-27 17:51:35 +01001276 vc <- rep(vc, length(query))
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001277 }
Marc Kupietz5a336b62021-11-27 17:51:35 +01001278
1279 for (i in seq_along(query)) {
1280 q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001281 if (q@totalResults > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001282 q <- fetchNext(q, maxFetch = 50, randomizePageOrder = F)
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001283 example <- as.character((q@collectedMatches)$snippet[1])
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001284 out[i] <- if (matchOnly) {
1285 gsub(".*<mark>(.+)</mark>.*", "\\1", example)
Marc Kupietz5a336b62021-11-27 17:51:35 +01001286 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001287 stringr::str_replace(example, "<[^>]*>", "")
Marc Kupietz5a336b62021-11-27 17:51:35 +01001288 }
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001289 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001290 out[i] <- ""
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001291 }
Marc Kupietz5a336b62021-11-27 17:51:35 +01001292 }
1293 out
1294 }
1295
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001296collocatesQuery <-
1297 function(kco,
1298 query,
1299 vc = "",
1300 minOccur = 5,
1301 leftContextSize = 5,
1302 rightContextSize = 5,
1303 searchHitsSampleLimit = 20000,
1304 ignoreCollocateCase = FALSE,
1305 stopwords = c(),
Marc Kupietzb2862d42025-10-18 10:17:49 +02001306 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001307 ...) {
1308 frequency <- NULL
1309 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001310 if (q@totalResults == 0) {
1311 tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001312 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001313 q <- fetchNext(q, maxFetch = searchHitsSampleLimit, randomizePageOrder = TRUE)
1314 matches2FreqTable(q@collectedMatches,
1315 0,
1316 minOccur = minOccur,
1317 leftContextSize = leftContextSize,
1318 rightContextSize = rightContextSize,
1319 ignoreCollocateCase = ignoreCollocateCase,
1320 stopwords = stopwords,
Marc Kupietzb2862d42025-10-18 10:17:49 +02001321 collocateFilterRegex = collocateFilterRegex,
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001322 ...,
1323 verbose = kco@verbose
1324 ) |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +01001325 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001326 filter(frequency >= minOccur)
1327 }
1328 }