blob: 4344d1ecb4846676407c545332f53d3562caac3e [file] [log] [blame]
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001#' @include logging.R
2setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02003
4#' Collocation analysis
5#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02006#' @family collocation analysis functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +02007#' @aliases collocationAnalysis
8#'
9#' @description
Marc Kupietzdbd431a2021-08-29 12:17:45 +020010#'
11#' Performs a collocation analysis for the given node (or query)
12#' in the given virtual corpus.
13#'
14#' @details
15#' The collocation analysis is currently implemented on the client side, as some of the
16#' functionality is not yet provided by the KorAP backend. Mainly for this reason
17#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
18#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
19#' and look for expression-internal collocates using the focus function (see examples and demo).
20#'
21#' To increase speed at the cost of accuracy and possible false negatives,
22#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
23#'
Marc Kupietze7f0d682025-02-19 10:50:59 +010024#' Note that some outdated non-DeReKo back-ends might not yet support returning tokenized matches (warning issued).
25#' In this case, the client library will fall back to client-side tokenization which might be slightly less accurate.
26#' This might lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
Marc Kupietzdbd431a2021-08-29 12:17:45 +020027#' user interface.
28#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020029#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020030#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietzdbd431a2021-08-29 12:17:45 +020031#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
32#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
33#' @param searchHitsSampleLimit limit the size of the search hits sample
34#' @param stopwords vector of stopwords not to be considered as collocates
Marc Kupietz6bd9cad2024-12-18 15:57:26 +010035#' @param withinSpan KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`.
Marc Kupietzdbd431a2021-08-29 12:17:45 +020036#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
37#' @param seed seed for random page collecting order
Marc Kupietz67edcb52021-09-20 21:54:24 +020038#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
Marc Kupietz7d400e02021-12-19 16:39:36 +010039#' @param maxRecurse apply collocation analysis recursively `maxRecurse` times
40#' @param addExamples If TRUE, examples for instances of collocations will be added in a column `example`. This makes a difference in particular if `node` is given as a lemma query.
Marc Kupietz2b0b0a12025-10-19 14:49:14 +020041#' @param thresholdScore association score function (see \code{\link{association-score-functions}}) to use for computing the threshold that is applied for recursive collocation analysis calls (only applied when \code{maxRecurse > 0})
42#' @param threshold minimum value of `thresholdScore` function call to apply collocation analysis recursively (only applied when \code{maxRecurse > 0})
Marc Kupietz7d400e02021-12-19 16:39:36 +010043#' @param localStopwords vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010044#' @param collocateFilterRegex allow only collocates matching the regular expression
Marc Kupietzde679ea2025-10-19 13:14:51 +020045#' @param queryMissingScores if TRUE, attempt to retrieve corpus-based association scores for vc/collocate combinations that would otherwise be imputed, by re-querying the KorAP backend without applying the collocate frequency threshold
Marc Kupietz4cbb5472025-10-19 12:15:25 +020046#' @param missingScoreQuantile lower quantile (evaluated per association measure) that anchors the adaptive floor used for imputing missing scores between virtual corpora; a robust spread is subtracted from this anchor so the imputed values stay below the weakest observed scores
Marc Kupietze34a8be2025-10-17 20:13:42 +020047#' @param vcLabel optional label override for the current virtual corpus (used internally when named VC collections are expanded)
Marc Kupietz67edcb52021-09-20 21:54:24 +020048#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietzdbd431a2021-08-29 12:17:45 +020049#' @inheritParams collocationScoreQuery,KorAPConnection-method
Marc Kupietz130a2a22025-10-18 16:09:23 +020050#' @return
51#' A tibble where each row represents a candidate collocate for the requested node.
52#' Columns include (depending on the selected association measures):
53#'
54#' \itemize{
55#' \item \code{node}, \code{collocate}, \code{vc}, \code{label}: identifiers for the query node, collocate, virtual corpus, and optional label.
56#' \item Frequency and contingency information such as \code{frequency}, \code{O}, \code{O1}, \code{O2}, \code{E}, \code{leftContextSize}, \code{rightContextSize}, and \code{w}.
57#' \item Association measures (e.g. \code{logDice}, \code{ll}, \code{mi}, ...), one column per requested scorer.
58#' \item Per-labelled association scores produced by multi-VC comparisons using the pattern \code{<measure>_<label>}.
59#' \item Ranks per label/measure with the pattern \code{rank_<label>_<measure>} (1 is best) and the corresponding percentile ranks \code{percentile_rank_<label>_<measure>}.
60#' \item Pairwise contrasts for two-label comparisons, e.g. \code{delta_<measure>}, \code{delta_rank_<measure>}, and \code{delta_percentile_rank_<measure>}.
61#' \item Summary columns describing the strongest labels per measure (\code{winner_*}, \code{runner_up_*}, \code{loser_*}, and \code{max_delta_*}).
62#' \item Optional helper columns such as \code{query}, \code{example}, or \code{url} when example retrieval is requested.
63#' }
Marc Kupietzc4540a22025-10-14 17:39:53 +020064#' @importFrom dplyr arrange desc slice_head bind_rows group_by mutate ungroup left_join select row_number all_of first
Marc Kupietzdbd431a2021-08-29 12:17:45 +020065#' @importFrom purrr pmap
Marc Kupietzc4540a22025-10-14 17:39:53 +020066#' @importFrom tidyr expand_grid pivot_wider
67#' @importFrom rlang sym
Marc Kupietzdbd431a2021-08-29 12:17:45 +020068#'
69#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020070#' \dontrun{
71#'
Marc Kupietz6dfeed92025-06-03 11:58:06 +020072#' # Find top collocates of "Packung" inside and outside the sports domain.
73#' KorAPConnection(verbose = TRUE) |>
74#' collocationAnalysis("Packung",
75#' vc = c("textClass=sport", "textClass!=sport"),
76#' leftContextSize = 1, rightContextSize = 1, topCollocatesLimit = 20
77#' ) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020078#' dplyr::filter(logDice >= 5)
79#' }
80#'
Marc Kupietz6ae76052021-09-21 10:34:00 +020081#' \dontrun{
82#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020083#' # Identify the most prominent light verb construction with "in ... setzen".
84#' # Note that, currently, the use of focus function disallows exactFrequencies.
Marc Kupietz4cd066d2025-02-28 15:48:23 +010085#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020086#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
Marc Kupietz6dfeed92025-06-03 11:58:06 +020087#' leftContextSize = 1, rightContextSize = 0, exactFrequencies = FALSE, topCollocatesLimit = 20
88#' )
Marc Kupietzdbd431a2021-08-29 12:17:45 +020089#' }
90#'
91#' @export
Marc Kupietz6dfeed92025-06-03 11:58:06 +020092setMethod(
93 "collocationAnalysis", "KorAPConnection",
94 function(kco,
95 node,
96 vc = "",
97 lemmatizeNodeQuery = FALSE,
98 minOccur = 5,
99 leftContextSize = 5,
100 rightContextSize = 5,
101 topCollocatesLimit = 200,
102 searchHitsSampleLimit = 20000,
103 ignoreCollocateCase = FALSE,
104 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
105 exactFrequencies = TRUE,
106 stopwords = append(RKorAPClient::synsemanticStopwords(), node),
107 seed = 7,
108 expand = length(vc) != length(node),
109 maxRecurse = 0,
110 addExamples = FALSE,
111 thresholdScore = "logDice",
112 threshold = 2.0,
113 localStopwords = c(),
114 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzde679ea2025-10-19 13:14:51 +0200115 queryMissingScores = FALSE,
Marc Kupietz9894a372025-10-18 14:51:29 +0200116 missingScoreQuantile = 0.05,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200117 vcLabel = NA_character_,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200118 ...) {
Marc Kupietzb2862d42025-10-18 10:17:49 +0200119 word <- frequency <- O <- NULL
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200120
Marc Kupietzb2862d42025-10-18 10:17:49 +0200121 if (!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nzchar(withinSpan))) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200122 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
123 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200124
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200125 warnIfNotAuthorized(kco)
Marc Kupietz581a29b2021-09-04 20:51:04 +0200126
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200127 if (lemmatizeNodeQuery) {
128 node <- lemmatizeWordQuery(node)
129 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200130
Marc Kupietze34a8be2025-10-17 20:13:42 +0200131 vcNames <- names(vc)
Marc Kupietze34a8be2025-10-17 20:13:42 +0200132 if (is.null(vcNames)) {
133 vcNames <- rep(NA_character_, length(vc))
Marc Kupietze34a8be2025-10-17 20:13:42 +0200134 }
135
136 label_lookup <- NULL
Marc Kupietzb2862d42025-10-18 10:17:49 +0200137 if (!is.null(names(vc)) && length(vc) > 0) {
138 raw_names <- names(vc)
139 if (any(!is.na(raw_names) & raw_names != "")) {
140 label_lookup <- stats::setNames(raw_names, vc)
141 }
Marc Kupietze34a8be2025-10-17 20:13:42 +0200142 }
143
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200144 result <- if (length(node) > 1 || length(vc) > 1) {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200145 grid <- if (expand) {
Marc Kupietzb2862d42025-10-18 10:17:49 +0200146 tmp_grid <- tidyr::expand_grid(node = node, idx = seq_along(vc))
147 tmp_grid$vc <- vc[tmp_grid$idx]
148 tmp_grid$vcLabel <- vcNames[tmp_grid$idx]
149 tmp_grid[, c("node", "vc", "vcLabel"), drop = FALSE]
Marc Kupietze34a8be2025-10-17 20:13:42 +0200150 } else {
151 tibble(node = node, vc = vc, vcLabel = vcNames)
152 }
153
154 multi_result <- purrr::pmap(grid, function(node, vc, vcLabel, ...) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200155 collocationAnalysis(kco,
156 node = node,
157 vc = vc,
158 minOccur = minOccur,
159 leftContextSize = leftContextSize,
160 rightContextSize = rightContextSize,
161 topCollocatesLimit = topCollocatesLimit,
162 searchHitsSampleLimit = searchHitsSampleLimit,
163 ignoreCollocateCase = ignoreCollocateCase,
164 withinSpan = withinSpan,
165 exactFrequencies = exactFrequencies,
166 stopwords = stopwords,
167 addExamples = TRUE,
168 localStopwords = localStopwords,
169 seed = seed,
170 expand = expand,
Marc Kupietz9894a372025-10-18 14:51:29 +0200171 missingScoreQuantile = missingScoreQuantile,
Marc Kupietzde679ea2025-10-19 13:14:51 +0200172 queryMissingScores = queryMissingScores,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200173 collocateFilterRegex = collocateFilterRegex,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200174 vcLabel = vcLabel,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200175 ...
176 )
177 }) |>
Marc Kupietze31322e2025-10-17 18:55:36 +0200178 bind_rows()
179
180 if (!"vc" %in% names(multi_result) || nrow(multi_result) == 0) {
181 multi_result
182 } else {
Marc Kupietzde679ea2025-10-19 13:14:51 +0200183 if (queryMissingScores) {
184 multi_result <- backfill_missing_scores(
185 multi_result,
186 grid = grid,
187 kco = kco,
188 ignoreCollocateCase = ignoreCollocateCase,
189 ...
190 )
191 }
192
Marc Kupietze34a8be2025-10-17 20:13:42 +0200193 if (!"label" %in% names(multi_result)) {
194 multi_result$label <- NA_character_
195 }
196
197 if (!is.null(label_lookup)) {
198 override <- unname(label_lookup[multi_result$vc])
199 missing_idx <- is.na(multi_result$label) | multi_result$label == ""
200 if (any(missing_idx)) {
201 multi_result$label[missing_idx] <- override[missing_idx]
202 }
203 }
204
205 missing_idx <- is.na(multi_result$label) | multi_result$label == ""
206 if (any(missing_idx)) {
207 multi_result$label[missing_idx] <- queryStringToLabel(multi_result$vc[missing_idx])
208 }
209
Marc Kupietze31322e2025-10-17 18:55:36 +0200210 multi_result |>
Marc Kupietz9894a372025-10-18 14:51:29 +0200211 add_multi_vc_comparisons(
Marc Kupietz9894a372025-10-18 14:51:29 +0200212 missingScoreQuantile = missingScoreQuantile
213 )
Marc Kupietze31322e2025-10-17 18:55:36 +0200214 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200215 } else {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200216 if ((is.na(vcLabel) || vcLabel == "") && length(vcNames) >= 1) {
217 vcLabel <- vcNames[1]
218 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200219
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200220 set.seed(seed)
221 candidates <- collocatesQuery(
222 kco,
223 node,
224 vc = vc,
225 minOccur = minOccur,
226 leftContextSize = leftContextSize,
227 rightContextSize = rightContextSize,
228 searchHitsSampleLimit = searchHitsSampleLimit,
229 ignoreCollocateCase = ignoreCollocateCase,
230 stopwords = append(stopwords, localStopwords),
Marc Kupietzb2862d42025-10-18 10:17:49 +0200231 collocateFilterRegex = collocateFilterRegex,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200232 ...
233 )
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200234
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200235 if (nrow(candidates) > 0) {
236 candidates <- candidates |>
237 filter(frequency >= minOccur) |>
238 slice_head(n = topCollocatesLimit)
239 collocationScoreQuery(
240 kco,
241 node = node,
242 collocate = candidates$word,
243 vc = vc,
244 leftContextSize = leftContextSize,
245 rightContextSize = rightContextSize,
246 observed = if (exactFrequencies) NA else candidates$frequency,
247 ignoreCollocateCase = ignoreCollocateCase,
248 withinSpan = withinSpan,
249 ...
250 ) |>
251 filter(O >= minOccur) |>
252 dplyr::arrange(dplyr::desc(logDice))
253 } else {
254 tibble()
255 }
256 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200257
258 if (!is.na(vcLabel) && vcLabel != "" && "label" %in% names(result)) {
259 result$label <- rep(vcLabel, nrow(result))
260 }
261
262 threshold_col <- thresholdScore
263 if (maxRecurse > 0 && nrow(result) > 0 && threshold_col %in% names(result)) {
264 threshold_values <- result[[threshold_col]]
265 eligible_idx <- which(!is.na(threshold_values) & threshold_values >= threshold)
266 if (length(eligible_idx) > 0) {
267 recurseWith <- result[eligible_idx, , drop = FALSE]
268 result <- collocationAnalysis(
269 kco,
270 node = paste0("(", buildCollocationQuery(
271 removeWithinSpan(recurseWith$node, withinSpan),
272 recurseWith$collocate,
273 leftContextSize = leftContextSize,
274 rightContextSize = rightContextSize,
275 withinSpan = ""
276 ), ")"),
277 vc = vc,
278 minOccur = minOccur,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200279 leftContextSize = leftContextSize,
280 rightContextSize = rightContextSize,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200281 withinSpan = withinSpan,
282 maxRecurse = maxRecurse - 1,
283 stopwords = stopwords,
284 localStopwords = recurseWith$collocate,
285 exactFrequencies = exactFrequencies,
286 searchHitsSampleLimit = searchHitsSampleLimit,
287 topCollocatesLimit = topCollocatesLimit,
288 addExamples = FALSE,
Marc Kupietz9894a372025-10-18 14:51:29 +0200289 missingScoreQuantile = missingScoreQuantile,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200290 collocateFilterRegex = collocateFilterRegex,
Marc Kupietzde679ea2025-10-19 13:14:51 +0200291 queryMissingScores = queryMissingScores,
Marc Kupietz2b0b0a12025-10-19 14:49:14 +0200292 thresholdScore = thresholdScore,
293 threshold = threshold,
Marc Kupietzb2862d42025-10-18 10:17:49 +0200294 vcLabel = vcLabel
295 ) |>
Marc Kupietz2b0b0a12025-10-19 14:49:14 +0200296 bind_rows(result)
297
298 if (threshold_col %in% names(result)) {
299 threshold_values <- result[[threshold_col]]
300 keep_idx <- is.na(threshold_values) | threshold_values >= threshold
301 result <- result[keep_idx, , drop = FALSE]
302 }
303
304 result <- result |>
Marc Kupietzb2862d42025-10-18 10:17:49 +0200305 filter(O >= minOccur) |>
306 dplyr::arrange(dplyr::desc(logDice))
307 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200308 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200309
310 if (addExamples && nrow(result) > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200311 result$query <- buildCollocationQuery(
312 result$node,
313 result$collocate,
314 leftContextSize = leftContextSize,
315 rightContextSize = rightContextSize,
316 withinSpan = withinSpan
317 )
318 result$example <- findExample(
319 kco,
320 query = result$query,
321 vc = result$vc
322 )
323 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200324
Marc Kupietz0a292632025-10-19 14:04:36 +0200325 if (!is.null(withinSpan) && !is.na(withinSpan) && nzchar(withinSpan) &&
326 nrow(result) > 0 &&
327 "webUIRequestUrl" %in% names(result) &&
328 "query" %in% names(result)) {
329 candidate_rows <- which(!is.na(result$node) &
330 !grepl("focus\\(", result$node, perl = TRUE) &
331 !is.na(result$query) & nzchar(result$query))
332
333 if (length(candidate_rows) > 0) {
334 focused_queries <- vapply(
335 result$query[candidate_rows],
336 inject_focus_into_query,
337 character(1)
338 )
339
340 changed <- focused_queries != result$query[candidate_rows]
341 if (any(changed)) {
342 indices <- candidate_rows[changed]
343 vc_values <- as.character(result$vc)
344 vc_values[is.na(vc_values)] <- ""
345
346 result$webUIRequestUrl[indices] <- mapply(
347 function(new_query, vc_value) {
348 buildWebUIRequestUrlFromString(
349 kco@KorAPUrl,
350 new_query,
351 vc = vc_value,
352 ql = "poliqarp"
353 )
354 },
355 focused_queries[changed],
356 vc_values[indices],
357 USE.NAMES = FALSE
358 )
359 }
360 }
361 }
362
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200363 result
364 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200365)
366
Marc Kupietz76b05592021-12-19 16:26:15 +0100367# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100368removeWithinSpan <- function(query, withinSpan) {
369 if (withinSpan == "") {
370 return(query)
371 }
372 needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200373 res <- gsub(needle, "\\1", query)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100374 needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200375 res <- gsub(needle, "\\1", res)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100376 return(res)
377}
378
Marc Kupietzde679ea2025-10-19 13:14:51 +0200379backfill_missing_scores <- function(result,
380 grid,
381 kco,
382 ignoreCollocateCase,
383 ...) {
384 if (!"vc" %in% names(result) || !"node" %in% names(result) || !"collocate" %in% names(result)) {
385 return(result)
386 }
387
388 if (nrow(result) == 0) {
389 return(result)
390 }
391
392 distinct_pairs <- dplyr::distinct(result, node, collocate)
393 if (nrow(distinct_pairs) == 0) {
394 return(result)
395 }
396
397 collocates_by_node <- split(as.character(distinct_pairs$collocate), distinct_pairs$node)
398 if (length(collocates_by_node) == 0) {
399 return(result)
400 }
401
402 required_combinations <- unique(as.data.frame(grid[, c("node", "vc", "vcLabel")], drop = FALSE))
403 for (i in seq_len(nrow(required_combinations))) {
404 node_value <- required_combinations$node[i]
405 vc_value <- required_combinations$vc[i]
406
407 collocate_pool <- collocates_by_node[[node_value]]
408 if (is.null(collocate_pool) || length(collocate_pool) == 0) {
409 next
410 }
411
412 existing_idx <- result$node == node_value & result$vc == vc_value
413 existing_collocates <- unique(as.character(result$collocate[existing_idx]))
414 missing_collocates <- setdiff(unique(collocate_pool), existing_collocates)
415 missing_collocates <- missing_collocates[!is.na(missing_collocates) & nzchar(missing_collocates)]
416
417 if (length(missing_collocates) == 0) {
418 next
419 }
420
421 context_rows <- result[result$node == node_value & result$vc == vc_value, , drop = FALSE]
422 if (nrow(context_rows) == 0) {
423 context_rows <- result[result$node == node_value, , drop = FALSE]
424 }
425
426 left_size <- context_rows$leftContextSize[!is.na(context_rows$leftContextSize)][1]
427 if (is.na(left_size) || length(left_size) == 0) {
428 left_size <- result$leftContextSize[!is.na(result$leftContextSize)][1]
429 }
430 if (is.na(left_size) || length(left_size) == 0) {
431 left_size <- 5
432 }
433
434 right_size <- context_rows$rightContextSize[!is.na(context_rows$rightContextSize)][1]
435 if (is.na(right_size) || length(right_size) == 0) {
436 right_size <- result$rightContextSize[!is.na(result$rightContextSize)][1]
437 }
438 if (is.na(right_size) || length(right_size) == 0) {
439 right_size <- 5
440 }
441
442 within_span_value <- ""
443 if ("query" %in% names(context_rows)) {
444 query_candidate <- context_rows$query[!is.na(context_rows$query) & nzchar(context_rows$query)][1]
445 if (!is.na(query_candidate) && nzchar(query_candidate)) {
446 match_one <- regexec("^\\(*contains\\(<([^>]+)>,", query_candidate)
447 matches <- regmatches(query_candidate, match_one)
448 if (length(matches) >= 1 && length(matches[[1]]) >= 2) {
449 within_span_value <- matches[[1]][2]
450 }
451 }
452 }
453
454 new_rows <- collocationScoreQuery(
455 kco,
456 node = node_value,
457 collocate = missing_collocates,
458 vc = vc_value,
459 leftContextSize = left_size,
460 rightContextSize = right_size,
461 ignoreCollocateCase = ignoreCollocateCase,
462 withinSpan = within_span_value,
463 ...
464 )
465
466 if (nrow(new_rows) == 0) {
467 next
468 }
469
470 if (!is.null(required_combinations$vcLabel[i]) && !is.na(required_combinations$vcLabel[i]) && required_combinations$vcLabel[i] != "" && "label" %in% names(new_rows)) {
471 new_rows$label <- required_combinations$vcLabel[i]
472 }
473
474 result <- dplyr::bind_rows(result, new_rows)
475 }
476
477 result
478}
479
Marc Kupietz0a292632025-10-19 14:04:36 +0200480inject_focus_into_query <- function(query) {
481 if (is.null(query) || is.na(query)) {
482 return(query)
483 }
484
485 trimmed <- trimws(query)
486 if (!nzchar(trimmed)) {
487 return(query)
488 }
489
490 if (!grepl("^contains\\(<[^>]+>", trimmed, perl = TRUE)) {
491 return(query)
492 }
493
494 if (grepl("focus\\(", trimmed, perl = TRUE)) {
495 return(query)
496 }
497
498 pattern <- "^contains\\(<([^>]+)>\\s*,\\s*\\((.*)\\)\\)\\s*$"
499 matches <- regexec(pattern, trimmed, perl = TRUE)
500 components <- regmatches(trimmed, matches)
501 if (length(components) == 0 || length(components[[1]]) < 3) {
502 return(query)
503 }
504
505 span <- components[[1]][2]
506 inner <- components[[1]][3]
507 parts <- strsplit(inner, "\\|", perl = TRUE)[[1]]
508 parts <- trimws(parts)
509 parts <- parts[nzchar(parts)]
510
511 if (length(parts) == 0) {
512 return(query)
513 }
514
515 focused <- paste0("focus({", parts, "})")
516 combined <- paste(focused, collapse = " | ")
517
518 sprintf("contains(<%s>, (%s))", span, combined)
519}
520
Marc Kupietz77852b22025-10-19 11:35:34 +0200521add_multi_vc_comparisons <- function(result, missingScoreQuantile = 0.05) {
Marc Kupietz28a29842025-10-18 12:25:09 +0200522 label <- node <- collocate <- NULL
Marc Kupietzc4540a22025-10-14 17:39:53 +0200523
524 if (!"label" %in% names(result) || dplyr::n_distinct(result$label) < 2) {
525 return(result)
526 }
527
528 numeric_cols <- names(result)[vapply(result, is.numeric, logical(1))]
529 non_score_cols <- c("N", "O", "O1", "O2", "E", "w", "leftContextSize", "rightContextSize", "frequency")
530 score_cols <- setdiff(numeric_cols, non_score_cols)
531
532 if (length(score_cols) == 0) {
533 return(result)
534 }
535
Marc Kupietz9894a372025-10-18 14:51:29 +0200536 compute_score_floor <- function(values) {
Marc Kupietz4cbb5472025-10-19 12:15:25 +0200537 # Estimate a conservative floor so missing scores can be imputed without favoring any label
Marc Kupietz9894a372025-10-18 14:51:29 +0200538 finite_values <- values[is.finite(values)]
539 if (length(finite_values) == 0) {
540 return(0)
541 }
542
543 prob <- min(max(missingScoreQuantile, 0), 0.5)
Marc Kupietz4cbb5472025-10-19 12:15:25 +0200544 # Use a lower quantile as the anchor to stay near the weakest attested scores
Marc Kupietz9894a372025-10-18 14:51:29 +0200545 q_val <- suppressWarnings(stats::quantile(finite_values,
546 probs = prob,
547 names = FALSE,
548 type = 7
549 ))
550
551 if (!is.finite(q_val)) {
552 q_val <- suppressWarnings(min(finite_values, na.rm = TRUE))
553 }
554
555 min_val <- suppressWarnings(min(finite_values, na.rm = TRUE))
556 if (!is.finite(min_val)) {
557 min_val <- 0
558 }
559
560 spread_candidates <- c(
561 suppressWarnings(stats::IQR(finite_values, na.rm = TRUE, type = 7)),
562 stats::sd(finite_values, na.rm = TRUE),
563 abs(q_val) * 0.1,
564 abs(min_val - q_val)
565 )
566 spread_candidates <- spread_candidates[is.finite(spread_candidates)]
567
568 spread <- 0
569 if (length(spread_candidates) > 0) {
570 spread <- max(spread_candidates)
571 }
572 if (!is.finite(spread) || spread == 0) {
573 spread <- max(abs(q_val), abs(min_val), 1e-06)
574 }
575
Marc Kupietz4cbb5472025-10-19 12:15:25 +0200576 # Step away from the anchor by a robust spread estimate to avoid ties with real scores
Marc Kupietz9894a372025-10-18 14:51:29 +0200577 candidate <- q_val - spread
578 if (!is.finite(candidate)) {
579 candidate <- min_val
580 }
581
582 floor_value <- suppressWarnings(min(c(candidate, min_val), na.rm = TRUE))
583 if (!is.finite(floor_value)) {
584 floor_value <- min_val
585 }
586 if (!is.finite(floor_value)) {
587 floor_value <- 0
588 }
589
590 floor_value
591 }
592
593 score_replacements <- stats::setNames(
594 vapply(score_cols, function(col) {
595 compute_score_floor(result[[col]])
596 }, numeric(1)),
597 score_cols
598 )
599
Marc Kupietzc4540a22025-10-14 17:39:53 +0200600 comparison <- result |>
Marc Kupietz28a29842025-10-18 12:25:09 +0200601 dplyr::select(node, collocate, label, dplyr::all_of(score_cols)) |>
602 tidyr::pivot_wider(
Marc Kupietzc4540a22025-10-14 17:39:53 +0200603 names_from = label,
Marc Kupietz28a29842025-10-18 12:25:09 +0200604 values_from = dplyr::all_of(score_cols),
Marc Kupietzc4540a22025-10-14 17:39:53 +0200605 names_glue = "{.value}_{make.names(label)}",
606 values_fn = dplyr::first
607 )
608
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200609 raw_labels <- unique(result$label)
610 labels <- make.names(raw_labels)
611 label_map <- stats::setNames(raw_labels, labels)
Marc Kupietzc4540a22025-10-14 17:39:53 +0200612
Marc Kupietz28a29842025-10-18 12:25:09 +0200613 rank_data <- result |>
614 dplyr::distinct(node, collocate)
615
616 for (i in seq_along(raw_labels)) {
617 raw_lab <- raw_labels[i]
618 safe_lab <- labels[i]
619 label_df <- result[result$label == raw_lab, c("node", "collocate", score_cols), drop = FALSE]
620 if (nrow(label_df) == 0) {
621 next
622 }
623 label_df <- dplyr::distinct(label_df)
624 rank_tbl <- label_df[, c("node", "collocate"), drop = FALSE]
625 for (col in score_cols) {
626 rank_col_name <- paste0("rank_", safe_lab, "_", col)
Marc Kupietz130a2a22025-10-18 16:09:23 +0200627 percentile_col_name <- paste0("percentile_rank_", safe_lab, "_", col)
Marc Kupietz28a29842025-10-18 12:25:09 +0200628 values <- label_df[[col]]
629 ranks <- rep(NA_real_, length(values))
Marc Kupietz130a2a22025-10-18 16:09:23 +0200630 percentiles <- rep(NA_real_, length(values))
Marc Kupietz28a29842025-10-18 12:25:09 +0200631 valid_idx <- which(!is.na(values))
632 if (length(valid_idx) > 0) {
633 ranks[valid_idx] <- rank(-values[valid_idx], ties.method = "first")
Marc Kupietz130a2a22025-10-18 16:09:23 +0200634 total <- length(valid_idx)
635 percentiles[valid_idx] <- 1 - (ranks[valid_idx] - 1) / total
Marc Kupietz28a29842025-10-18 12:25:09 +0200636 }
637 rank_tbl[[rank_col_name]] <- ranks
Marc Kupietz130a2a22025-10-18 16:09:23 +0200638 rank_tbl[[percentile_col_name]] <- percentiles
Marc Kupietz28a29842025-10-18 12:25:09 +0200639 }
640 rank_data <- dplyr::left_join(rank_data, rank_tbl, by = c("node", "collocate"))
641 }
642
643 comparison <- dplyr::left_join(comparison, rank_data, by = c("node", "collocate"))
644
645 rank_replacements <- numeric(0)
646 rank_column_names <- grep("^rank_", names(comparison), value = TRUE)
647 if (length(rank_column_names) > 0) {
648 rank_replacements <- stats::setNames(
649 vapply(rank_column_names, function(col) {
650 col_values <- comparison[[col]]
651 valid_values <- col_values[!is.na(col_values)]
652 if (length(valid_values) == 0) {
653 nrow(comparison) + 1
654 } else {
655 suppressWarnings(max(valid_values, na.rm = TRUE)) + 1
656 }
657 }, numeric(1)),
658 rank_column_names
659 )
660 }
661
Marc Kupietz130a2a22025-10-18 16:09:23 +0200662 percentile_replacements <- numeric(0)
663 percentile_column_names <- grep("^percentile_rank_", names(comparison), value = TRUE)
664 if (length(percentile_column_names) > 0) {
665 percentile_replacements <- stats::setNames(
666 rep(0, length(percentile_column_names)),
667 percentile_column_names
668 )
669 }
670
Marc Kupietz28a29842025-10-18 12:25:09 +0200671 collapse_label_values <- function(indices, safe_labels_vec) {
672 if (length(indices) == 0) {
673 return(NA_character_)
674 }
675 labs <- label_map[safe_labels_vec[indices]]
676 fallback <- safe_labels_vec[indices]
677 labs[is.na(labs) | labs == ""] <- fallback[is.na(labs) | labs == ""]
678 labs <- labs[!is.na(labs) & labs != ""]
679 if (length(labs) == 0) {
680 return(NA_character_)
681 }
682 paste(unique(labs), collapse = ", ")
683 }
684
Marc Kupietzc4540a22025-10-14 17:39:53 +0200685 if (length(labels) == 2) {
Marc Kupietz9894a372025-10-18 14:51:29 +0200686 fill_scores <- function(x, y, measure_col) {
687 replacement <- score_replacements[[measure_col]]
688 fallback_min <- suppressWarnings(min(c(x, y), na.rm = TRUE))
689 if (!is.finite(fallback_min)) {
690 fallback_min <- 0
Marc Kupietzc4540a22025-10-14 17:39:53 +0200691 }
Marc Kupietz9894a372025-10-18 14:51:29 +0200692 if (!is.null(replacement) && is.finite(replacement)) {
693 replacement <- min(replacement, fallback_min)
694 } else {
695 replacement <- fallback_min
696 }
697 if (!is.finite(replacement)) {
698 replacement <- 0
699 }
700 if (any(is.na(x))) {
701 x[is.na(x)] <- replacement
702 }
703 if (any(is.na(y))) {
704 y[is.na(y)] <- replacement
705 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200706 list(x = x, y = y)
707 }
708
Marc Kupietz130a2a22025-10-18 16:09:23 +0200709 fill_percentiles <- function(x, y, left_pct_col, right_pct_col) {
710 replacement_left <- percentile_replacements[[left_pct_col]]
711 if (is.null(replacement_left) || !is.finite(replacement_left)) {
712 replacement_left <- 0
713 }
714 replacement_right <- percentile_replacements[[right_pct_col]]
715 if (is.null(replacement_right) || !is.finite(replacement_right)) {
716 replacement_right <- 0
717 }
718 if (any(is.na(x))) {
719 x[is.na(x)] <- replacement_left
720 }
721 if (any(is.na(y))) {
722 y[is.na(y)] <- replacement_right
723 }
724 list(x = x, y = y)
725 }
726
Marc Kupietz28a29842025-10-18 12:25:09 +0200727 fill_ranks <- function(x, y, left_rank_col, right_rank_col) {
728 fallback <- nrow(comparison) + 1
729 replacement_left <- rank_replacements[[left_rank_col]]
730 if (is.null(replacement_left) || !is.finite(replacement_left)) {
731 replacement_left <- fallback
Marc Kupietzc4540a22025-10-14 17:39:53 +0200732 }
Marc Kupietz28a29842025-10-18 12:25:09 +0200733 replacement_right <- rank_replacements[[right_rank_col]]
734 if (is.null(replacement_right) || !is.finite(replacement_right)) {
735 replacement_right <- fallback
736 }
737 if (any(is.na(x))) {
738 x[is.na(x)] <- replacement_left
739 }
740 if (any(is.na(y))) {
741 y[is.na(y)] <- replacement_right
742 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200743 list(x = x, y = y)
744 }
745
746 left_label <- labels[1]
747 right_label <- labels[2]
748
749 for (col in score_cols) {
750 left_col <- paste0(col, "_", left_label)
751 right_col <- paste0(col, "_", right_label)
752 if (!all(c(left_col, right_col) %in% names(comparison))) {
753 next
754 }
Marc Kupietz9894a372025-10-18 14:51:29 +0200755 filled <- fill_scores(comparison[[left_col]], comparison[[right_col]], col)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200756 comparison[[left_col]] <- filled$x
757 comparison[[right_col]] <- filled$y
Marc Kupietzc4540a22025-10-14 17:39:53 +0200758 comparison[[paste0("delta_", col)]] <- filled$x - filled$y
Marc Kupietz28a29842025-10-18 12:25:09 +0200759 rank_left <- paste0("rank_", left_label, "_", col)
760 rank_right <- paste0("rank_", right_label, "_", col)
761 if (all(c(rank_left, rank_right) %in% names(comparison))) {
762 filled_rank <- fill_ranks(
763 comparison[[rank_left]],
764 comparison[[rank_right]],
765 rank_left,
766 rank_right
767 )
768 comparison[[paste0("delta_rank_", col)]] <- filled_rank$x - filled_rank$y
769 }
Marc Kupietz130a2a22025-10-18 16:09:23 +0200770 pct_left <- paste0("percentile_rank_", left_label, "_", col)
771 pct_right <- paste0("percentile_rank_", right_label, "_", col)
772 if (all(c(pct_left, pct_right) %in% names(comparison))) {
773 filled_pct <- fill_percentiles(
774 comparison[[pct_left]],
775 comparison[[pct_right]],
776 pct_left,
777 pct_right
778 )
779 comparison[[paste0("delta_percentile_rank_", col)]] <- filled_pct$x - filled_pct$y
780 }
Marc Kupietzc4540a22025-10-14 17:39:53 +0200781 }
782 }
783
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200784 for (col in score_cols) {
785 value_cols <- paste0(col, "_", labels)
786 existing <- value_cols %in% names(comparison)
787 if (!any(existing)) {
788 next
789 }
790 value_cols <- value_cols[existing]
791 safe_labels <- labels[existing]
792
793 score_values <- comparison[, value_cols, drop = FALSE]
794
795 winner_label_col <- paste0("winner_", col)
796 winner_value_col <- paste0("winner_", col, "_value")
797 runner_label_col <- paste0("runner_up_", col)
798 runner_value_col <- paste0("runner_up_", col, "_value")
Marc Kupietzb2862d42025-10-18 10:17:49 +0200799 loser_label_col <- paste0("loser_", col)
800 loser_value_col <- paste0("loser_", col, "_value")
801 max_delta_col <- paste0("max_delta_", col)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200802
803 if (nrow(score_values) == 0) {
804 comparison[[winner_label_col]] <- character(0)
805 comparison[[winner_value_col]] <- numeric(0)
806 comparison[[runner_label_col]] <- character(0)
807 comparison[[runner_value_col]] <- numeric(0)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200808 comparison[[loser_label_col]] <- character(0)
809 comparison[[loser_value_col]] <- numeric(0)
810 comparison[[max_delta_col]] <- numeric(0)
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200811 next
812 }
813
814 score_matrix <- as.matrix(score_values)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200815 storage.mode(score_matrix) <- "numeric"
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200816
Marc Kupietzb2862d42025-10-18 10:17:49 +0200817 n_rows <- nrow(score_matrix)
818 winner_labels <- rep(NA_character_, n_rows)
819 winner_values <- rep(NA_real_, n_rows)
820 runner_labels <- rep(NA_character_, n_rows)
821 runner_values <- rep(NA_real_, n_rows)
822 loser_labels <- rep(NA_character_, n_rows)
823 loser_values <- rep(NA_real_, n_rows)
824 max_deltas <- rep(NA_real_, n_rows)
825
Marc Kupietzb2862d42025-10-18 10:17:49 +0200826 if (n_rows > 0) {
827 for (i in seq_len(n_rows)) {
828 numeric_row <- as.numeric(score_matrix[i, ])
829 if (all(is.na(numeric_row))) {
830 next
831 }
832
Marc Kupietz9894a372025-10-18 14:51:29 +0200833 replacement <- score_replacements[[col]]
834 fallback_min <- suppressWarnings(min(numeric_row, na.rm = TRUE))
835 if (!is.finite(fallback_min)) {
836 fallback_min <- 0
Marc Kupietzb2862d42025-10-18 10:17:49 +0200837 }
Marc Kupietz9894a372025-10-18 14:51:29 +0200838 if (!is.null(replacement) && is.finite(replacement)) {
839 replacement <- min(replacement, fallback_min)
840 } else {
841 replacement <- fallback_min
842 }
843 if (!is.finite(replacement)) {
844 replacement <- 0
845 }
846 if (any(is.na(numeric_row))) {
847 numeric_row[is.na(numeric_row)] <- replacement
848 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200849 score_matrix[i, ] <- numeric_row
850
851 max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
852 max_idx <- which(numeric_row == max_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200853 winner_labels[i] <- collapse_label_values(max_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200854 winner_values[i] <- max_val
855
856 unique_vals <- sort(unique(numeric_row), decreasing = TRUE)
857 if (length(unique_vals) >= 2) {
858 runner_val <- unique_vals[2]
859 runner_idx <- which(numeric_row == runner_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200860 runner_labels[i] <- collapse_label_values(runner_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200861 runner_values[i] <- runner_val
862 }
863
864 min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
865 min_idx <- which(numeric_row == min_val)
Marc Kupietz28a29842025-10-18 12:25:09 +0200866 loser_labels[i] <- collapse_label_values(min_idx, safe_labels)
Marc Kupietzb2862d42025-10-18 10:17:49 +0200867 loser_values[i] <- min_val
868
869 if (is.finite(max_val) && is.finite(min_val)) {
870 max_deltas[i] <- max_val - min_val
871 }
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200872 }
Marc Kupietzb2862d42025-10-18 10:17:49 +0200873 }
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200874
Marc Kupietzb2862d42025-10-18 10:17:49 +0200875 comparison[, value_cols] <- score_matrix
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200876 comparison[[winner_label_col]] <- winner_labels
877 comparison[[winner_value_col]] <- winner_values
878 comparison[[runner_label_col]] <- runner_labels
879 comparison[[runner_value_col]] <- runner_values
Marc Kupietzb2862d42025-10-18 10:17:49 +0200880 comparison[[loser_label_col]] <- loser_labels
881 comparison[[loser_value_col]] <- loser_values
882 comparison[[max_delta_col]] <- max_deltas
Marc Kupietz5e35d7a2025-10-17 21:21:22 +0200883 }
884
Marc Kupietz28a29842025-10-18 12:25:09 +0200885 for (col in score_cols) {
886 rank_cols <- paste0("rank_", labels, "_", col)
887 existing <- rank_cols %in% names(comparison)
888 if (!any(existing)) {
889 next
890 }
891 rank_cols <- rank_cols[existing]
892 safe_labels <- labels[existing]
893 rank_values <- comparison[, rank_cols, drop = FALSE]
894
895 winner_rank_label_col <- paste0("winner_rank_", col)
896 winner_rank_value_col <- paste0("winner_rank_", col, "_value")
897 runner_rank_label_col <- paste0("runner_up_rank_", col)
898 runner_rank_value_col <- paste0("runner_up_rank_", col, "_value")
899 loser_rank_label_col <- paste0("loser_rank_", col)
900 loser_rank_value_col <- paste0("loser_rank_", col, "_value")
901 max_delta_rank_col <- paste0("max_delta_rank_", col)
902
903 if (nrow(rank_values) == 0) {
904 comparison[[winner_rank_label_col]] <- character(0)
905 comparison[[winner_rank_value_col]] <- numeric(0)
906 comparison[[runner_rank_label_col]] <- character(0)
907 comparison[[runner_rank_value_col]] <- numeric(0)
908 comparison[[loser_rank_label_col]] <- character(0)
909 comparison[[loser_rank_value_col]] <- numeric(0)
910 comparison[[max_delta_rank_col]] <- numeric(0)
911 next
912 }
913
914 rank_matrix <- as.matrix(rank_values)
915 storage.mode(rank_matrix) <- "numeric"
916
917 n_rows <- nrow(rank_matrix)
918 winner_labels <- rep(NA_character_, n_rows)
919 winner_values <- rep(NA_real_, n_rows)
920 runner_labels <- rep(NA_character_, n_rows)
921 runner_values <- rep(NA_real_, n_rows)
922 loser_labels <- rep(NA_character_, n_rows)
923 loser_values <- rep(NA_real_, n_rows)
924 max_deltas <- rep(NA_real_, n_rows)
925
926 for (i in seq_len(n_rows)) {
927 numeric_row <- as.numeric(rank_matrix[i, ])
928 if (all(is.na(numeric_row))) {
929 next
930 }
931
932 if (length(rank_cols) > 0) {
933 replacement_vec <- rank_replacements[rank_cols]
934 replacement_vec[is.na(replacement_vec)] <- nrow(comparison) + 1
935 missing_idx <- which(is.na(numeric_row))
936 if (length(missing_idx) > 0) {
937 numeric_row[missing_idx] <- replacement_vec[missing_idx]
938 }
939 }
940
941 valid_idx <- seq_along(numeric_row)
942 valid_values <- numeric_row[valid_idx]
943 min_val <- suppressWarnings(min(valid_values, na.rm = TRUE))
944 min_positions <- valid_idx[which(valid_values == min_val)]
945 winner_labels[i] <- collapse_label_values(min_positions, safe_labels)
946 winner_values[i] <- min_val
947
948 ordered_vals <- sort(unique(valid_values), decreasing = FALSE)
949 if (length(ordered_vals) >= 2) {
950 runner_val <- ordered_vals[2]
951 runner_positions <- valid_idx[which(valid_values == runner_val)]
952 runner_labels[i] <- collapse_label_values(runner_positions, safe_labels)
953 runner_values[i] <- runner_val
954 }
955
956 max_val <- suppressWarnings(max(valid_values, na.rm = TRUE))
957 max_positions <- valid_idx[which(valid_values == max_val)]
958 loser_labels[i] <- collapse_label_values(max_positions, safe_labels)
959 loser_values[i] <- max_val
960
961 if (is.finite(max_val) && is.finite(min_val)) {
962 max_deltas[i] <- max_val - min_val
963 }
964 }
965
966 comparison[[winner_rank_label_col]] <- winner_labels
967 comparison[[winner_rank_value_col]] <- winner_values
968 comparison[[runner_rank_label_col]] <- runner_labels
969 comparison[[runner_rank_value_col]] <- runner_values
970 comparison[[loser_rank_label_col]] <- loser_labels
971 comparison[[loser_rank_value_col]] <- loser_values
972 comparison[[max_delta_rank_col]] <- max_deltas
973 }
974
Marc Kupietz130a2a22025-10-18 16:09:23 +0200975 for (col in score_cols) {
976 pct_cols <- paste0("percentile_rank_", labels, "_", col)
977 existing <- pct_cols %in% names(comparison)
978 if (!any(existing)) {
979 next
980 }
981 pct_cols <- pct_cols[existing]
982 safe_labels <- labels[existing]
983 pct_values <- comparison[, pct_cols, drop = FALSE]
984
985 winner_pct_label_col <- paste0("winner_percentile_rank_", col)
986 winner_pct_value_col <- paste0("winner_percentile_rank_", col, "_value")
987 runner_pct_label_col <- paste0("runner_up_percentile_rank_", col)
988 runner_pct_value_col <- paste0("runner_up_percentile_rank_", col, "_value")
989 loser_pct_label_col <- paste0("loser_percentile_rank_", col)
990 loser_pct_value_col <- paste0("loser_percentile_rank_", col, "_value")
991 max_delta_pct_col <- paste0("max_delta_percentile_rank_", col)
992
993 if (nrow(pct_values) == 0) {
994 comparison[[winner_pct_label_col]] <- character(0)
995 comparison[[winner_pct_value_col]] <- numeric(0)
996 comparison[[runner_pct_label_col]] <- character(0)
997 comparison[[runner_pct_value_col]] <- numeric(0)
998 comparison[[loser_pct_label_col]] <- character(0)
999 comparison[[loser_pct_value_col]] <- numeric(0)
1000 comparison[[max_delta_pct_col]] <- numeric(0)
1001 next
1002 }
1003
1004 pct_matrix <- as.matrix(pct_values)
1005 storage.mode(pct_matrix) <- "numeric"
1006
1007 n_rows <- nrow(pct_matrix)
1008 winner_labels <- rep(NA_character_, n_rows)
1009 winner_values <- rep(NA_real_, n_rows)
1010 runner_labels <- rep(NA_character_, n_rows)
1011 runner_values <- rep(NA_real_, n_rows)
1012 loser_labels <- rep(NA_character_, n_rows)
1013 loser_values <- rep(NA_real_, n_rows)
1014 max_deltas <- rep(NA_real_, n_rows)
1015
1016 if (n_rows > 0) {
1017 for (i in seq_len(n_rows)) {
1018 numeric_row <- as.numeric(pct_matrix[i, ])
1019 if (all(is.na(numeric_row))) {
1020 next
1021 }
1022
1023 if (any(is.na(numeric_row))) {
1024 numeric_row[is.na(numeric_row)] <- 0
1025 }
1026 pct_matrix[i, ] <- numeric_row
1027
1028 max_val <- suppressWarnings(max(numeric_row, na.rm = TRUE))
1029 max_idx <- which(numeric_row == max_val)
1030 winner_labels[i] <- collapse_label_values(max_idx, safe_labels)
1031 winner_values[i] <- max_val
1032
1033 unique_vals <- sort(unique(numeric_row), decreasing = TRUE)
1034 if (length(unique_vals) >= 2) {
1035 runner_val <- unique_vals[2]
1036 runner_idx <- which(numeric_row == runner_val)
1037 runner_labels[i] <- collapse_label_values(runner_idx, safe_labels)
1038 runner_values[i] <- runner_val
1039 }
1040
1041 min_val <- suppressWarnings(min(numeric_row, na.rm = TRUE))
1042 min_idx <- which(numeric_row == min_val)
1043 loser_labels[i] <- collapse_label_values(min_idx, safe_labels)
1044 loser_values[i] <- min_val
1045
1046 if (is.finite(max_val) && is.finite(min_val)) {
1047 max_deltas[i] <- max_val - min_val
1048 }
1049 }
1050 }
1051
1052 comparison[, pct_cols] <- pct_matrix
1053 comparison[[winner_pct_label_col]] <- winner_labels
1054 comparison[[winner_pct_value_col]] <- winner_values
1055 comparison[[runner_pct_label_col]] <- runner_labels
1056 comparison[[runner_pct_value_col]] <- runner_values
1057 comparison[[loser_pct_label_col]] <- loser_labels
1058 comparison[[loser_pct_value_col]] <- loser_values
1059 comparison[[max_delta_pct_col]] <- max_deltas
1060 }
1061
Marc Kupietzc4540a22025-10-14 17:39:53 +02001062 dplyr::left_join(result, comparison, by = c("node", "collocate"))
1063}
1064
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001065#' @importFrom magrittr debug_pipe
Marc Kupietz2b17b212023-08-27 17:47:26 +02001066#' @importFrom stringr str_detect
1067#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
1068#'
1069matches2FreqTable <- function(matches,
1070 index = 0,
1071 minOccur = 5,
1072 leftContextSize = 5,
1073 rightContextSize = 5,
1074 ignoreCollocateCase = FALSE,
1075 stopwords = c(),
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001076 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietz2b17b212023-08-27 17:47:26 +02001077 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
1078 verbose = TRUE) {
1079 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
1080 frequency <- NULL
1081
1082 if (nrow(matches) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001083 dplyr::tibble(word = c(), frequency = c())
Marc Kupietz2b17b212023-08-27 17:47:26 +02001084 } else if (index == 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001085 if (!"tokens" %in% colnames(matches) || !is.list(matches$tokens)) {
Marc Kupietz2b17b212023-08-27 17:47:26 +02001086 log_info(verbose, "Outdated KorAP server: Falling back to client side tokenization.\n")
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001087 return(snippet2FreqTable(matches$snippet, minOccur, leftContextSize, rightContextSize,
1088 ignoreCollocateCase = ignoreCollocateCase,
1089 stopwords = stopwords, oldTable = oldTable, verbose = verbose
1090 ))
Marc Kupietz2b17b212023-08-27 17:47:26 +02001091 }
1092 log_info(verbose, paste("Joining", nrow(matches), "kwics\n"))
Marc Kupietza25fbd92025-10-14 17:38:09 +02001093 for (i in seq_len(nrow(matches))) {
Marc Kupietz2b17b212023-08-27 17:47:26 +02001094 oldTable <- matches2FreqTable(
1095 matches,
1096 i,
1097 leftContextSize = leftContextSize,
1098 rightContextSize = rightContextSize,
1099 collocateFilterRegex = collocateFilterRegex,
1100 oldTable = oldTable,
1101 stopwords = stopwords
1102 )
1103 }
1104 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001105 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +01001106 group_by(word) |>
1107 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001108 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +02001109 arrange(desc(frequency))
1110 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001111 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietz2b17b212023-08-27 17:47:26 +02001112
1113 left <- tail(unlist(matches$tokens$left[index]), leftContextSize)
1114
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001115 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +02001116
1117 right <- head(unlist(matches$tokens$right[index]), rightContextSize)
1118
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001119 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +02001120
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001121 if (length(left) + length(right) == 0) {
Marc Kupietz2b17b212023-08-27 17:47:26 +02001122 oldTable
1123 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +01001124 table(c(left, right)) |>
1125 dplyr::as_tibble(.name_repair = "minimal") |>
1126 dplyr::rename(word = 1, frequency = 2) |>
1127 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001128 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +02001129 dplyr::bind_rows(oldTable)
1130 }
1131 }
1132}
1133
1134#' @importFrom magrittr debug_pipe
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001135#' @importFrom stringr str_match str_split str_detect
1136#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
1137#'
1138snippet2FreqTable <- function(snippet,
1139 minOccur = 5,
1140 leftContextSize = 5,
1141 rightContextSize = 5,
1142 ignoreCollocateCase = FALSE,
1143 stopwords = c(),
1144 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001145 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001146 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
1147 verbose = TRUE) {
1148 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
1149 frequency <- NULL
1150
1151 if (length(snippet) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001152 dplyr::tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001153 } else if (length(snippet) > 1) {
Marc Kupietza47d1502023-04-18 15:26:47 +02001154 log_info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001155 for (s in snippet) {
1156 oldTable <- snippet2FreqTable(
1157 s,
1158 leftContextSize = leftContextSize,
1159 rightContextSize = rightContextSize,
Marc Kupietz47d0d2b2021-12-19 16:38:52 +01001160 collocateFilterRegex = collocateFilterRegex,
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001161 oldTable = oldTable,
1162 stopwords = stopwords
1163 )
1164 }
Marc Kupietza47d1502023-04-18 15:26:47 +02001165 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001166 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +01001167 group_by(word) |>
1168 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001169 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001170 arrange(desc(frequency))
1171 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001172 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001173 match <-
1174 str_match(
1175 snippet,
1176 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
1177 )
1178
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001179 left <- if (leftContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001180 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001181 } else {
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001182 ""
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001183 }
1184 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001185
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001186 right <- if (rightContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001187 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001188 } else {
1189 ""
1190 }
1191 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001192
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001193 if (is.na(left[1]) || is.na(right[1]) || length(left) + length(right) == 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001194 oldTable
1195 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +01001196 table(c(left, right)) |>
1197 dplyr::as_tibble(.name_repair = "minimal") |>
1198 dplyr::rename(word = 1, frequency = 2) |>
1199 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001200 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001201 dplyr::bind_rows(oldTable)
1202 }
1203 }
1204}
1205
1206#' Preliminary synsemantic stopwords function
1207#'
1208#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +02001209#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001210#'
1211#' Preliminary synsemantic stopwords function to be used in collocation analysis.
1212#'
1213#' @details
1214#' Currently only suitable for German. See stopwords package for other languages.
1215#'
1216#' @param ... future arguments for language detection
1217#'
1218#' @family collocation analysis functions
1219#' @return Vector of synsemantic stopwords.
1220#' @export
1221synsemanticStopwords <- function(...) {
Marc Kupietzc79155b2025-10-19 13:42:55 +02001222 base <- c(
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001223 "der",
1224 "die",
1225 "und",
1226 "in",
1227 "den",
1228 "von",
1229 "mit",
1230 "das",
1231 "zu",
1232 "im",
1233 "ist",
1234 "auf",
1235 "sich",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001236 "des",
1237 "dem",
1238 "nicht",
1239 "ein",
1240 "eine",
1241 "es",
1242 "auch",
1243 "an",
1244 "als",
1245 "am",
1246 "aus",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001247 "bei",
1248 "er",
1249 "dass",
1250 "sie",
1251 "nach",
1252 "um",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001253 "zum",
1254 "noch",
1255 "war",
1256 "einen",
1257 "einer",
1258 "wie",
1259 "einem",
1260 "vor",
1261 "bis",
1262 "\u00fcber",
1263 "so",
1264 "aber",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001265 "diese",
Marc Kupietzc79155b2025-10-19 13:42:55 +02001266 "oder"
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001267 )
Marc Kupietzc79155b2025-10-19 13:42:55 +02001268
1269 lower <- unique(tolower(base))
1270 capitalized <- paste0(toupper(substr(lower, 1, 1)), substring(lower, 2))
1271
1272 unique(c(lower, capitalized))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001273}
1274
Marc Kupietz5a336b62021-11-27 17:51:35 +01001275
Marc Kupietz76b05592021-12-19 16:26:15 +01001276# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +01001277findExample <-
1278 function(kco,
1279 query,
1280 vc = "",
1281 matchOnly = TRUE) {
1282 out <- character(length = length(query))
1283
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001284 if (length(vc) < length(query)) {
Marc Kupietz5a336b62021-11-27 17:51:35 +01001285 vc <- rep(vc, length(query))
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001286 }
Marc Kupietz5a336b62021-11-27 17:51:35 +01001287
1288 for (i in seq_along(query)) {
1289 q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001290 if (q@totalResults > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001291 q <- fetchNext(q, maxFetch = 50, randomizePageOrder = F)
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001292 example <- as.character((q@collectedMatches)$snippet[1])
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001293 out[i] <- if (matchOnly) {
1294 gsub(".*<mark>(.+)</mark>.*", "\\1", example)
Marc Kupietz5a336b62021-11-27 17:51:35 +01001295 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001296 stringr::str_replace(example, "<[^>]*>", "")
Marc Kupietz5a336b62021-11-27 17:51:35 +01001297 }
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001298 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001299 out[i] <- ""
Marc Kupietzb811ffb2021-12-07 10:34:10 +01001300 }
Marc Kupietz5a336b62021-11-27 17:51:35 +01001301 }
1302 out
1303 }
1304
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001305collocatesQuery <-
1306 function(kco,
1307 query,
1308 vc = "",
1309 minOccur = 5,
1310 leftContextSize = 5,
1311 rightContextSize = 5,
1312 searchHitsSampleLimit = 20000,
1313 ignoreCollocateCase = FALSE,
1314 stopwords = c(),
Marc Kupietzb2862d42025-10-18 10:17:49 +02001315 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001316 ...) {
1317 frequency <- NULL
1318 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001319 if (q@totalResults == 0) {
1320 tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001321 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001322 q <- fetchNext(q, maxFetch = searchHitsSampleLimit, randomizePageOrder = TRUE)
1323 matches2FreqTable(q@collectedMatches,
1324 0,
1325 minOccur = minOccur,
1326 leftContextSize = leftContextSize,
1327 rightContextSize = rightContextSize,
1328 ignoreCollocateCase = ignoreCollocateCase,
1329 stopwords = stopwords,
Marc Kupietzb2862d42025-10-18 10:17:49 +02001330 collocateFilterRegex = collocateFilterRegex,
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001331 ...,
1332 verbose = kco@verbose
1333 ) |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +01001334 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001335 filter(frequency >= minOccur)
1336 }
1337 }