blob: 4f58396e3cf99a7d9a2516415261d208eb677e50 [file] [log] [blame]
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001#' @include logging.R
2setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02003
4#' Collocation analysis
5#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02006#' @family collocation analysis functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +02007#' @aliases collocationAnalysis
8#'
9#' @description
Marc Kupietzdbd431a2021-08-29 12:17:45 +020010#'
11#' Performs a collocation analysis for the given node (or query)
12#' in the given virtual corpus.
13#'
14#' @details
15#' The collocation analysis is currently implemented on the client side, as some of the
16#' functionality is not yet provided by the KorAP backend. Mainly for this reason
17#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
18#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
19#' and look for expression-internal collocates using the focus function (see examples and demo).
20#'
21#' To increase speed at the cost of accuracy and possible false negatives,
22#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
23#'
Marc Kupietze7f0d682025-02-19 10:50:59 +010024#' Note that some outdated non-DeReKo back-ends might not yet support returning tokenized matches (warning issued).
25#' In this case, the client library will fall back to client-side tokenization which might be slightly less accurate.
26#' This might lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
Marc Kupietzdbd431a2021-08-29 12:17:45 +020027#' user interface.
28#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020029#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020030#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietzdbd431a2021-08-29 12:17:45 +020031#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
32#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
33#' @param searchHitsSampleLimit limit the size of the search hits sample
34#' @param stopwords vector of stopwords not to be considered as collocates
Marc Kupietz6bd9cad2024-12-18 15:57:26 +010035#' @param withinSpan KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`.
Marc Kupietzdbd431a2021-08-29 12:17:45 +020036#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
37#' @param seed seed for random page collecting order
Marc Kupietz67edcb52021-09-20 21:54:24 +020038#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
Marc Kupietz7d400e02021-12-19 16:39:36 +010039#' @param maxRecurse apply collocation analysis recursively `maxRecurse` times
40#' @param addExamples If TRUE, examples for instances of collocations will be added in a column `example`. This makes a difference in particular if `node` is given as a lemma query.
41#' @param thresholdScore association score function (see \code{\link{association-score-functions}}) to use for computing the threshold that is applied for recursive collocation analysis calls
42#' @param threshold minimum value of `thresholdScore` function call to apply collocation analysis recursively
43#' @param localStopwords vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010044#' @param collocateFilterRegex allow only collocates matching the regular expression
Marc Kupietzc4540a22025-10-14 17:39:53 +020045#' @param multiVcMissingScoreFactor factor that is multiplied with the minimum observed score when imputing missing scores for delta computations between virtual corpora
Marc Kupietze34a8be2025-10-17 20:13:42 +020046#' @param vcLabel optional label override for the current virtual corpus (used internally when named VC collections are expanded)
Marc Kupietz67edcb52021-09-20 21:54:24 +020047#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietzdbd431a2021-08-29 12:17:45 +020048#' @inheritParams collocationScoreQuery,KorAPConnection-method
49#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
50#'
Marc Kupietzc4540a22025-10-14 17:39:53 +020051#' @importFrom dplyr arrange desc slice_head bind_rows group_by mutate ungroup left_join select row_number all_of first
Marc Kupietzdbd431a2021-08-29 12:17:45 +020052#' @importFrom purrr pmap
Marc Kupietzc4540a22025-10-14 17:39:53 +020053#' @importFrom tidyr expand_grid pivot_wider
54#' @importFrom rlang sym
Marc Kupietzdbd431a2021-08-29 12:17:45 +020055#'
56#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020057#' \dontrun{
58#'
Marc Kupietz6dfeed92025-06-03 11:58:06 +020059#' # Find top collocates of "Packung" inside and outside the sports domain.
60#' KorAPConnection(verbose = TRUE) |>
61#' collocationAnalysis("Packung",
62#' vc = c("textClass=sport", "textClass!=sport"),
63#' leftContextSize = 1, rightContextSize = 1, topCollocatesLimit = 20
64#' ) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020065#' dplyr::filter(logDice >= 5)
66#' }
67#'
Marc Kupietz6ae76052021-09-21 10:34:00 +020068#' \dontrun{
69#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020070#' # Identify the most prominent light verb construction with "in ... setzen".
71#' # Note that, currently, the use of focus function disallows exactFrequencies.
Marc Kupietz4cd066d2025-02-28 15:48:23 +010072#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020073#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
Marc Kupietz6dfeed92025-06-03 11:58:06 +020074#' leftContextSize = 1, rightContextSize = 0, exactFrequencies = FALSE, topCollocatesLimit = 20
75#' )
Marc Kupietzdbd431a2021-08-29 12:17:45 +020076#' }
77#'
78#' @export
Marc Kupietz6dfeed92025-06-03 11:58:06 +020079setMethod(
80 "collocationAnalysis", "KorAPConnection",
81 function(kco,
82 node,
83 vc = "",
84 lemmatizeNodeQuery = FALSE,
85 minOccur = 5,
86 leftContextSize = 5,
87 rightContextSize = 5,
88 topCollocatesLimit = 200,
89 searchHitsSampleLimit = 20000,
90 ignoreCollocateCase = FALSE,
91 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
92 exactFrequencies = TRUE,
93 stopwords = append(RKorAPClient::synsemanticStopwords(), node),
94 seed = 7,
95 expand = length(vc) != length(node),
96 maxRecurse = 0,
97 addExamples = FALSE,
98 thresholdScore = "logDice",
99 threshold = 2.0,
100 localStopwords = c(),
101 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzc4540a22025-10-14 17:39:53 +0200102 multiVcMissingScoreFactor = 0.9,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200103 vcLabel = NA_character_,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200104 ...) {
105 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
Marc Kupietze34a8be2025-10-17 20:13:42 +0200106 word <- frequency <- O <- NULL
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200107
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200108 if (!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan) > 0)) {
109 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
110 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200111
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200112 warnIfNotAuthorized(kco)
Marc Kupietz581a29b2021-09-04 20:51:04 +0200113
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200114 if (lemmatizeNodeQuery) {
115 node <- lemmatizeWordQuery(node)
116 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200117
Marc Kupietze34a8be2025-10-17 20:13:42 +0200118 vcNames <- names(vc)
119 vc <- unname(unlist(vc, use.names = FALSE))
120 if (is.null(vcNames)) {
121 vcNames <- rep(NA_character_, length(vc))
122 } else {
123 vcNames[vcNames == ""] <- NA_character_
124 if (length(vcNames) < length(vc)) {
125 vcNames <- rep(vcNames, length.out = length(vc))
126 }
127 }
128
129 label_lookup <- NULL
130 if (length(vc) > 0 && any(!is.na(vcNames))) {
131 valid_lookup <- !is.na(vcNames)
132 label_lookup <- vcNames[valid_lookup]
133 names(label_lookup) <- vc[valid_lookup]
134 }
135
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200136 result <- if (length(node) > 1 || length(vc) > 1) {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200137 grid <- if (expand) {
138 tmp_grid <- expand_grid(node = node, idx = seq_along(vc))
139 tmp_grid$vc <- vc[tmp_grid$idx]
140 tmp_grid$vcLabel <- vcNames[tmp_grid$idx]
141 tmp_grid[, setdiff(names(tmp_grid), "idx"), drop = FALSE]
142 } else {
143 tibble(node = node, vc = vc, vcLabel = vcNames)
144 }
145
146 multi_result <- purrr::pmap(grid, function(node, vc, vcLabel, ...) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200147 collocationAnalysis(kco,
148 node = node,
149 vc = vc,
150 minOccur = minOccur,
151 leftContextSize = leftContextSize,
152 rightContextSize = rightContextSize,
153 topCollocatesLimit = topCollocatesLimit,
154 searchHitsSampleLimit = searchHitsSampleLimit,
155 ignoreCollocateCase = ignoreCollocateCase,
156 withinSpan = withinSpan,
157 exactFrequencies = exactFrequencies,
158 stopwords = stopwords,
159 addExamples = TRUE,
160 localStopwords = localStopwords,
161 seed = seed,
162 expand = expand,
Marc Kupietzc4540a22025-10-14 17:39:53 +0200163 multiVcMissingScoreFactor = multiVcMissingScoreFactor,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200164 vcLabel = vcLabel,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200165 ...
166 )
167 }) |>
Marc Kupietze31322e2025-10-17 18:55:36 +0200168 bind_rows()
169
170 if (!"vc" %in% names(multi_result) || nrow(multi_result) == 0) {
171 multi_result
172 } else {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200173 if (!"label" %in% names(multi_result)) {
174 multi_result$label <- NA_character_
175 }
176
177 if (!is.null(label_lookup)) {
178 override <- unname(label_lookup[multi_result$vc])
179 missing_idx <- is.na(multi_result$label) | multi_result$label == ""
180 if (any(missing_idx)) {
181 multi_result$label[missing_idx] <- override[missing_idx]
182 }
183 }
184
185 missing_idx <- is.na(multi_result$label) | multi_result$label == ""
186 if (any(missing_idx)) {
187 multi_result$label[missing_idx] <- queryStringToLabel(multi_result$vc[missing_idx])
188 }
189
Marc Kupietze31322e2025-10-17 18:55:36 +0200190 multi_result |>
Marc Kupietze31322e2025-10-17 18:55:36 +0200191 add_multi_vc_comparisons(thresholdScore = thresholdScore, missingScoreFactor = multiVcMissingScoreFactor)
192 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200193 } else {
Marc Kupietze34a8be2025-10-17 20:13:42 +0200194 if ((is.na(vcLabel) || vcLabel == "") && length(vcNames) >= 1) {
195 vcLabel <- vcNames[1]
196 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200197 set.seed(seed)
198 candidates <- collocatesQuery(
199 kco,
200 node,
201 vc = vc,
202 minOccur = minOccur,
203 leftContextSize = leftContextSize,
204 rightContextSize = rightContextSize,
205 searchHitsSampleLimit = searchHitsSampleLimit,
206 ignoreCollocateCase = ignoreCollocateCase,
207 stopwords = append(stopwords, localStopwords),
208 ...
209 )
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200210
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200211 if (nrow(candidates) > 0) {
212 candidates <- candidates |>
213 filter(frequency >= minOccur) |>
214 slice_head(n = topCollocatesLimit)
215 collocationScoreQuery(
216 kco,
217 node = node,
218 collocate = candidates$word,
219 vc = vc,
220 leftContextSize = leftContextSize,
221 rightContextSize = rightContextSize,
222 observed = if (exactFrequencies) NA else candidates$frequency,
223 ignoreCollocateCase = ignoreCollocateCase,
224 withinSpan = withinSpan,
225 ...
226 ) |>
227 filter(O >= minOccur) |>
228 dplyr::arrange(dplyr::desc(logDice))
229 } else {
230 tibble()
231 }
232 }
Marc Kupietze34a8be2025-10-17 20:13:42 +0200233 if (!is.na(vcLabel) && vcLabel != "" && "label" %in% names(result)) {
234 result$label <- rep(vcLabel, nrow(result))
235 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200236 if (maxRecurse > 0 & length(result) > 0 && any(!!thresholdScore >= threshold)) {
237 recurseWith <- result |>
238 filter(!!as.name(thresholdScore) >= threshold)
239 result <- collocationAnalysis(
240 kco,
241 node = paste0("(", buildCollocationQuery(
242 removeWithinSpan(recurseWith$node, withinSpan),
243 recurseWith$collocate,
244 leftContextSize = leftContextSize,
245 rightContextSize = rightContextSize,
246 withinSpan = ""
247 ), ")"),
248 vc = vc,
249 minOccur = minOccur,
250 leftContextSize = leftContextSize,
251 rightContextSize = rightContextSize,
252 withinSpan = withinSpan,
253 maxRecurse = maxRecurse - 1,
254 stopwords = stopwords,
255 localStopwords = recurseWith$collocate,
256 exactFrequencies = exactFrequencies,
257 searchHitsSampleLimit = searchHitsSampleLimit,
258 topCollocatesLimit = topCollocatesLimit,
Marc Kupietzc4540a22025-10-14 17:39:53 +0200259 addExamples = FALSE,
Marc Kupietze34a8be2025-10-17 20:13:42 +0200260 multiVcMissingScoreFactor = multiVcMissingScoreFactor,
261 vcLabel = vcLabel
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200262 ) |>
263 bind_rows(result) |>
264 filter(logDice >= 2) |>
Marc Kupietze9e18bd2025-06-04 17:15:02 +0200265 filter(O >= minOccur) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200266 dplyr::arrange(dplyr::desc(logDice))
267 }
268 if (addExamples && length(result) > 0) {
269 result$query <- buildCollocationQuery(
270 result$node,
271 result$collocate,
272 leftContextSize = leftContextSize,
273 rightContextSize = rightContextSize,
274 withinSpan = withinSpan
275 )
276 result$example <- findExample(
277 kco,
278 query = result$query,
279 vc = result$vc
280 )
281 }
282 result
283 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200284)
285
Marc Kupietz76b05592021-12-19 16:26:15 +0100286# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100287removeWithinSpan <- function(query, withinSpan) {
288 if (withinSpan == "") {
289 return(query)
290 }
291 needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200292 res <- gsub(needle, "\\1", query)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100293 needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200294 res <- gsub(needle, "\\1", res)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100295 return(res)
296}
297
Marc Kupietzc4540a22025-10-14 17:39:53 +0200298add_multi_vc_comparisons <- function(result, thresholdScore, missingScoreFactor) {
299 label <- node <- collocate <- rankWithinLabel <- NULL
300
301 if (!"label" %in% names(result) || dplyr::n_distinct(result$label) < 2) {
302 return(result)
303 }
304
305 numeric_cols <- names(result)[vapply(result, is.numeric, logical(1))]
306 non_score_cols <- c("N", "O", "O1", "O2", "E", "w", "leftContextSize", "rightContextSize", "frequency")
307 score_cols <- setdiff(numeric_cols, non_score_cols)
308
309 if (length(score_cols) == 0) {
310 return(result)
311 }
312
313 ranking_col <- thresholdScore
314 if (is.null(ranking_col) || is.na(ranking_col) || !ranking_col %in% score_cols) {
315 ranking_col <- if ("logDice" %in% score_cols) "logDice" else score_cols[1]
316 }
317
318 ranking_sym <- rlang::sym(ranking_col)
319
320 result <- result |>
321 dplyr::group_by(label) |>
322 dplyr::mutate(rankWithinLabel = dplyr::row_number(dplyr::desc(!!ranking_sym))) |>
323 dplyr::ungroup()
324
325 comparison <- result |>
326 dplyr::select(node, collocate, label, rankWithinLabel, dplyr::all_of(score_cols)) |>
327 pivot_wider(
328 names_from = label,
329 values_from = c(rankWithinLabel, dplyr::all_of(score_cols)),
330 names_glue = "{.value}_{make.names(label)}",
331 values_fn = dplyr::first
332 )
333
334 labels <- make.names(unique(result$label))
335
336 if (length(labels) == 2) {
337 fill_scores <- function(x, y) {
338 min_val <- suppressWarnings(min(c(x, y), na.rm = TRUE))
339 if (!is.finite(min_val)) {
340 min_val <- 0
341 }
342 x[is.na(x)] <- missingScoreFactor * min_val
343 y[is.na(y)] <- missingScoreFactor * min_val
344 list(x = x, y = y)
345 }
346
347 fill_ranks <- function(x, y) {
348 max_val <- suppressWarnings(max(c(x, y), na.rm = TRUE))
349 if (!is.finite(max_val)) {
350 max_val <- 0
351 }
352 x[is.na(x)] <- max_val + 1
353 y[is.na(y)] <- max_val + 1
354 list(x = x, y = y)
355 }
356
357 left_label <- labels[1]
358 right_label <- labels[2]
359
360 for (col in score_cols) {
361 left_col <- paste0(col, "_", left_label)
362 right_col <- paste0(col, "_", right_label)
363 if (!all(c(left_col, right_col) %in% names(comparison))) {
364 next
365 }
366 filled <- fill_scores(comparison[[left_col]], comparison[[right_col]])
367 comparison[[paste0("delta_", col)]] <- filled$x - filled$y
368 }
369
370 left_rank <- paste0("rankWithinLabel_", left_label)
371 right_rank <- paste0("rankWithinLabel_", right_label)
372 if (all(c(left_rank, right_rank) %in% names(comparison))) {
373 filled_rank <- fill_ranks(comparison[[left_rank]], comparison[[right_rank]])
374 comparison[["delta_rankWithinLabel"]] <- filled_rank$x - filled_rank$y
375 }
376 }
377
378 dplyr::left_join(result, comparison, by = c("node", "collocate"))
379}
380
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200381#' @importFrom magrittr debug_pipe
Marc Kupietz2b17b212023-08-27 17:47:26 +0200382#' @importFrom stringr str_detect
383#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
384#'
385matches2FreqTable <- function(matches,
386 index = 0,
387 minOccur = 5,
388 leftContextSize = 5,
389 rightContextSize = 5,
390 ignoreCollocateCase = FALSE,
391 stopwords = c(),
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200392 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietz2b17b212023-08-27 17:47:26 +0200393 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
394 verbose = TRUE) {
395 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
396 frequency <- NULL
397
398 if (nrow(matches) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200399 dplyr::tibble(word = c(), frequency = c())
Marc Kupietz2b17b212023-08-27 17:47:26 +0200400 } else if (index == 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200401 if (!"tokens" %in% colnames(matches) || !is.list(matches$tokens)) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200402 log_info(verbose, "Outdated KorAP server: Falling back to client side tokenization.\n")
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200403 return(snippet2FreqTable(matches$snippet, minOccur, leftContextSize, rightContextSize,
404 ignoreCollocateCase = ignoreCollocateCase,
405 stopwords = stopwords, oldTable = oldTable, verbose = verbose
406 ))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200407 }
408 log_info(verbose, paste("Joining", nrow(matches), "kwics\n"))
Marc Kupietza25fbd92025-10-14 17:38:09 +0200409 for (i in seq_len(nrow(matches))) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200410 oldTable <- matches2FreqTable(
411 matches,
412 i,
413 leftContextSize = leftContextSize,
414 rightContextSize = rightContextSize,
415 collocateFilterRegex = collocateFilterRegex,
416 oldTable = oldTable,
417 stopwords = stopwords
418 )
419 }
420 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200421 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100422 group_by(word) |>
423 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200424 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +0200425 arrange(desc(frequency))
426 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200427 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietz2b17b212023-08-27 17:47:26 +0200428
429 left <- tail(unlist(matches$tokens$left[index]), leftContextSize)
430
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200431 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200432
433 right <- head(unlist(matches$tokens$right[index]), rightContextSize)
434
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200435 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200436
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200437 if (length(left) + length(right) == 0) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200438 oldTable
439 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100440 table(c(left, right)) |>
441 dplyr::as_tibble(.name_repair = "minimal") |>
442 dplyr::rename(word = 1, frequency = 2) |>
443 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200444 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +0200445 dplyr::bind_rows(oldTable)
446 }
447 }
448}
449
450#' @importFrom magrittr debug_pipe
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200451#' @importFrom stringr str_match str_split str_detect
452#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
453#'
454snippet2FreqTable <- function(snippet,
455 minOccur = 5,
456 leftContextSize = 5,
457 rightContextSize = 5,
458 ignoreCollocateCase = FALSE,
459 stopwords = c(),
460 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200461 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200462 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
463 verbose = TRUE) {
464 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
465 frequency <- NULL
466
467 if (length(snippet) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200468 dplyr::tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200469 } else if (length(snippet) > 1) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200470 log_info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200471 for (s in snippet) {
472 oldTable <- snippet2FreqTable(
473 s,
474 leftContextSize = leftContextSize,
475 rightContextSize = rightContextSize,
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100476 collocateFilterRegex = collocateFilterRegex,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200477 oldTable = oldTable,
478 stopwords = stopwords
479 )
480 }
Marc Kupietza47d1502023-04-18 15:26:47 +0200481 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200482 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100483 group_by(word) |>
484 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200485 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200486 arrange(desc(frequency))
487 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200488 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200489 match <-
490 str_match(
491 snippet,
492 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
493 )
494
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200495 left <- if (leftContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200496 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200497 } else {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200498 ""
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200499 }
500 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200501
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200502 right <- if (rightContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200503 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200504 } else {
505 ""
506 }
507 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200508
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200509 if (is.na(left[1]) || is.na(right[1]) || length(left) + length(right) == 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200510 oldTable
511 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100512 table(c(left, right)) |>
513 dplyr::as_tibble(.name_repair = "minimal") |>
514 dplyr::rename(word = 1, frequency = 2) |>
515 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200516 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200517 dplyr::bind_rows(oldTable)
518 }
519 }
520}
521
522#' Preliminary synsemantic stopwords function
523#'
524#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +0200525#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200526#'
527#' Preliminary synsemantic stopwords function to be used in collocation analysis.
528#'
529#' @details
530#' Currently only suitable for German. See stopwords package for other languages.
531#'
532#' @param ... future arguments for language detection
533#'
534#' @family collocation analysis functions
535#' @return Vector of synsemantic stopwords.
536#' @export
537synsemanticStopwords <- function(...) {
538 res <- c(
539 "der",
540 "die",
541 "und",
542 "in",
543 "den",
544 "von",
545 "mit",
546 "das",
547 "zu",
548 "im",
549 "ist",
550 "auf",
551 "sich",
552 "Die",
553 "des",
554 "dem",
555 "nicht",
556 "ein",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100557 "Ein",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200558 "eine",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100559 "Eine",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200560 "es",
561 "auch",
562 "an",
563 "als",
564 "am",
565 "aus",
566 "Der",
567 "bei",
568 "er",
569 "dass",
570 "sie",
571 "nach",
572 "um",
573 "Das",
574 "zum",
575 "noch",
576 "war",
577 "einen",
578 "einer",
579 "wie",
580 "einem",
581 "vor",
582 "bis",
583 "\u00fcber",
584 "so",
585 "aber",
586 "Eine",
587 "diese",
588 "Diese",
589 "oder"
590 )
591 return(res)
592}
593
Marc Kupietz5a336b62021-11-27 17:51:35 +0100594
Marc Kupietz76b05592021-12-19 16:26:15 +0100595# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100596findExample <-
597 function(kco,
598 query,
599 vc = "",
600 matchOnly = TRUE) {
601 out <- character(length = length(query))
602
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200603 if (length(vc) < length(query)) {
Marc Kupietz5a336b62021-11-27 17:51:35 +0100604 vc <- rep(vc, length(query))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200605 }
Marc Kupietz5a336b62021-11-27 17:51:35 +0100606
607 for (i in seq_along(query)) {
608 q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100609 if (q@totalResults > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200610 q <- fetchNext(q, maxFetch = 50, randomizePageOrder = F)
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100611 example <- as.character((q@collectedMatches)$snippet[1])
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200612 out[i] <- if (matchOnly) {
613 gsub(".*<mark>(.+)</mark>.*", "\\1", example)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100614 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200615 stringr::str_replace(example, "<[^>]*>", "")
Marc Kupietz5a336b62021-11-27 17:51:35 +0100616 }
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100617 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200618 out[i] <- ""
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100619 }
Marc Kupietz5a336b62021-11-27 17:51:35 +0100620 }
621 out
622 }
623
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200624collocatesQuery <-
625 function(kco,
626 query,
627 vc = "",
628 minOccur = 5,
629 leftContextSize = 5,
630 rightContextSize = 5,
631 searchHitsSampleLimit = 20000,
632 ignoreCollocateCase = FALSE,
633 stopwords = c(),
634 ...) {
635 frequency <- NULL
636 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200637 if (q@totalResults == 0) {
638 tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200639 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200640 q <- fetchNext(q, maxFetch = searchHitsSampleLimit, randomizePageOrder = TRUE)
641 matches2FreqTable(q@collectedMatches,
642 0,
643 minOccur = minOccur,
644 leftContextSize = leftContextSize,
645 rightContextSize = rightContextSize,
646 ignoreCollocateCase = ignoreCollocateCase,
647 stopwords = stopwords,
648 ...,
649 verbose = kco@verbose
650 ) |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100651 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200652 filter(frequency >= minOccur)
653 }
654 }