blob: 86cfb23e9d464620d77c1353402b9565fdba1764 [file] [log] [blame]
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001#' @include logging.R
2setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02003
4#' Collocation analysis
5#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02006#' @family collocation analysis functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +02007#' @aliases collocationAnalysis
8#'
9#' @description
Marc Kupietzdbd431a2021-08-29 12:17:45 +020010#'
11#' Performs a collocation analysis for the given node (or query)
12#' in the given virtual corpus.
13#'
14#' @details
15#' The collocation analysis is currently implemented on the client side, as some of the
16#' functionality is not yet provided by the KorAP backend. Mainly for this reason
17#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
18#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
19#' and look for expression-internal collocates using the focus function (see examples and demo).
20#'
21#' To increase speed at the cost of accuracy and possible false negatives,
22#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
23#'
Marc Kupietze7f0d682025-02-19 10:50:59 +010024#' Note that some outdated non-DeReKo back-ends might not yet support returning tokenized matches (warning issued).
25#' In this case, the client library will fall back to client-side tokenization which might be slightly less accurate.
26#' This might lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
Marc Kupietzdbd431a2021-08-29 12:17:45 +020027#' user interface.
28#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020029#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020030#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietzdbd431a2021-08-29 12:17:45 +020031#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
32#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
33#' @param searchHitsSampleLimit limit the size of the search hits sample
34#' @param stopwords vector of stopwords not to be considered as collocates
Marc Kupietz6bd9cad2024-12-18 15:57:26 +010035#' @param withinSpan KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`.
Marc Kupietzdbd431a2021-08-29 12:17:45 +020036#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
37#' @param seed seed for random page collecting order
Marc Kupietz67edcb52021-09-20 21:54:24 +020038#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
Marc Kupietz7d400e02021-12-19 16:39:36 +010039#' @param maxRecurse apply collocation analysis recursively `maxRecurse` times
40#' @param addExamples If TRUE, examples for instances of collocations will be added in a column `example`. This makes a difference in particular if `node` is given as a lemma query.
41#' @param thresholdScore association score function (see \code{\link{association-score-functions}}) to use for computing the threshold that is applied for recursive collocation analysis calls
42#' @param threshold minimum value of `thresholdScore` function call to apply collocation analysis recursively
43#' @param localStopwords vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010044#' @param collocateFilterRegex allow only collocates matching the regular expression
Marc Kupietzc4540a22025-10-14 17:39:53 +020045#' @param multiVcMissingScoreFactor factor that is multiplied with the minimum observed score when imputing missing scores for delta computations between virtual corpora
Marc Kupietz67edcb52021-09-20 21:54:24 +020046#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietzdbd431a2021-08-29 12:17:45 +020047#' @inheritParams collocationScoreQuery,KorAPConnection-method
48#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
49#'
Marc Kupietzc4540a22025-10-14 17:39:53 +020050#' @importFrom dplyr arrange desc slice_head bind_rows group_by mutate ungroup left_join select row_number all_of first
Marc Kupietzdbd431a2021-08-29 12:17:45 +020051#' @importFrom purrr pmap
Marc Kupietzc4540a22025-10-14 17:39:53 +020052#' @importFrom tidyr expand_grid pivot_wider
53#' @importFrom rlang sym
Marc Kupietzdbd431a2021-08-29 12:17:45 +020054#'
55#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020056#' \dontrun{
57#'
Marc Kupietz6dfeed92025-06-03 11:58:06 +020058#' # Find top collocates of "Packung" inside and outside the sports domain.
59#' KorAPConnection(verbose = TRUE) |>
60#' collocationAnalysis("Packung",
61#' vc = c("textClass=sport", "textClass!=sport"),
62#' leftContextSize = 1, rightContextSize = 1, topCollocatesLimit = 20
63#' ) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020064#' dplyr::filter(logDice >= 5)
65#' }
66#'
Marc Kupietz6ae76052021-09-21 10:34:00 +020067#' \dontrun{
68#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020069#' # Identify the most prominent light verb construction with "in ... setzen".
70#' # Note that, currently, the use of focus function disallows exactFrequencies.
Marc Kupietz4cd066d2025-02-28 15:48:23 +010071#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020072#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
Marc Kupietz6dfeed92025-06-03 11:58:06 +020073#' leftContextSize = 1, rightContextSize = 0, exactFrequencies = FALSE, topCollocatesLimit = 20
74#' )
Marc Kupietzdbd431a2021-08-29 12:17:45 +020075#' }
76#'
77#' @export
Marc Kupietz6dfeed92025-06-03 11:58:06 +020078setMethod(
79 "collocationAnalysis", "KorAPConnection",
80 function(kco,
81 node,
82 vc = "",
83 lemmatizeNodeQuery = FALSE,
84 minOccur = 5,
85 leftContextSize = 5,
86 rightContextSize = 5,
87 topCollocatesLimit = 200,
88 searchHitsSampleLimit = 20000,
89 ignoreCollocateCase = FALSE,
90 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
91 exactFrequencies = TRUE,
92 stopwords = append(RKorAPClient::synsemanticStopwords(), node),
93 seed = 7,
94 expand = length(vc) != length(node),
95 maxRecurse = 0,
96 addExamples = FALSE,
97 thresholdScore = "logDice",
98 threshold = 2.0,
99 localStopwords = c(),
100 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzc4540a22025-10-14 17:39:53 +0200101 multiVcMissingScoreFactor = 0.9,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200102 ...) {
103 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
104 word <- frequency <- O <- NULL
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200105
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200106 if (!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan) > 0)) {
107 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
108 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200109
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200110 warnIfNotAuthorized(kco)
Marc Kupietz581a29b2021-09-04 20:51:04 +0200111
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200112 if (lemmatizeNodeQuery) {
113 node <- lemmatizeWordQuery(node)
114 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200115
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200116 result <- if (length(node) > 1 || length(vc) > 1) {
117 grid <- if (expand) expand_grid(node = node, vc = vc) else tibble(node = node, vc = vc)
Marc Kupietze31322e2025-10-17 18:55:36 +0200118 multi_result <- purrr::pmap(grid, function(node, vc, ...) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200119 collocationAnalysis(kco,
120 node = node,
121 vc = vc,
122 minOccur = minOccur,
123 leftContextSize = leftContextSize,
124 rightContextSize = rightContextSize,
125 topCollocatesLimit = topCollocatesLimit,
126 searchHitsSampleLimit = searchHitsSampleLimit,
127 ignoreCollocateCase = ignoreCollocateCase,
128 withinSpan = withinSpan,
129 exactFrequencies = exactFrequencies,
130 stopwords = stopwords,
131 addExamples = TRUE,
132 localStopwords = localStopwords,
133 seed = seed,
134 expand = expand,
Marc Kupietzc4540a22025-10-14 17:39:53 +0200135 multiVcMissingScoreFactor = multiVcMissingScoreFactor,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200136 ...
137 )
138 }) |>
Marc Kupietze31322e2025-10-17 18:55:36 +0200139 bind_rows()
140
141 if (!"vc" %in% names(multi_result) || nrow(multi_result) == 0) {
142 multi_result
143 } else {
144 multi_result |>
145 mutate(label = queryStringToLabel(.data$vc)) |>
146 add_multi_vc_comparisons(thresholdScore = thresholdScore, missingScoreFactor = multiVcMissingScoreFactor)
147 }
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200148 } else {
149 set.seed(seed)
150 candidates <- collocatesQuery(
151 kco,
152 node,
153 vc = vc,
154 minOccur = minOccur,
155 leftContextSize = leftContextSize,
156 rightContextSize = rightContextSize,
157 searchHitsSampleLimit = searchHitsSampleLimit,
158 ignoreCollocateCase = ignoreCollocateCase,
159 stopwords = append(stopwords, localStopwords),
160 ...
161 )
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200162
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200163 if (nrow(candidates) > 0) {
164 candidates <- candidates |>
165 filter(frequency >= minOccur) |>
166 slice_head(n = topCollocatesLimit)
167 collocationScoreQuery(
168 kco,
169 node = node,
170 collocate = candidates$word,
171 vc = vc,
172 leftContextSize = leftContextSize,
173 rightContextSize = rightContextSize,
174 observed = if (exactFrequencies) NA else candidates$frequency,
175 ignoreCollocateCase = ignoreCollocateCase,
176 withinSpan = withinSpan,
177 ...
178 ) |>
179 filter(O >= minOccur) |>
180 dplyr::arrange(dplyr::desc(logDice))
181 } else {
182 tibble()
183 }
184 }
185 if (maxRecurse > 0 & length(result) > 0 && any(!!thresholdScore >= threshold)) {
186 recurseWith <- result |>
187 filter(!!as.name(thresholdScore) >= threshold)
188 result <- collocationAnalysis(
189 kco,
190 node = paste0("(", buildCollocationQuery(
191 removeWithinSpan(recurseWith$node, withinSpan),
192 recurseWith$collocate,
193 leftContextSize = leftContextSize,
194 rightContextSize = rightContextSize,
195 withinSpan = ""
196 ), ")"),
197 vc = vc,
198 minOccur = minOccur,
199 leftContextSize = leftContextSize,
200 rightContextSize = rightContextSize,
201 withinSpan = withinSpan,
202 maxRecurse = maxRecurse - 1,
203 stopwords = stopwords,
204 localStopwords = recurseWith$collocate,
205 exactFrequencies = exactFrequencies,
206 searchHitsSampleLimit = searchHitsSampleLimit,
207 topCollocatesLimit = topCollocatesLimit,
Marc Kupietzc4540a22025-10-14 17:39:53 +0200208 addExamples = FALSE,
209 multiVcMissingScoreFactor = multiVcMissingScoreFactor
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200210 ) |>
211 bind_rows(result) |>
212 filter(logDice >= 2) |>
Marc Kupietze9e18bd2025-06-04 17:15:02 +0200213 filter(O >= minOccur) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200214 dplyr::arrange(dplyr::desc(logDice))
215 }
216 if (addExamples && length(result) > 0) {
217 result$query <- buildCollocationQuery(
218 result$node,
219 result$collocate,
220 leftContextSize = leftContextSize,
221 rightContextSize = rightContextSize,
222 withinSpan = withinSpan
223 )
224 result$example <- findExample(
225 kco,
226 query = result$query,
227 vc = result$vc
228 )
229 }
230 result
231 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200232)
233
Marc Kupietz76b05592021-12-19 16:26:15 +0100234# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100235removeWithinSpan <- function(query, withinSpan) {
236 if (withinSpan == "") {
237 return(query)
238 }
239 needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200240 res <- gsub(needle, "\\1", query)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100241 needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200242 res <- gsub(needle, "\\1", res)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100243 return(res)
244}
245
Marc Kupietzc4540a22025-10-14 17:39:53 +0200246add_multi_vc_comparisons <- function(result, thresholdScore, missingScoreFactor) {
247 label <- node <- collocate <- rankWithinLabel <- NULL
248
249 if (!"label" %in% names(result) || dplyr::n_distinct(result$label) < 2) {
250 return(result)
251 }
252
253 numeric_cols <- names(result)[vapply(result, is.numeric, logical(1))]
254 non_score_cols <- c("N", "O", "O1", "O2", "E", "w", "leftContextSize", "rightContextSize", "frequency")
255 score_cols <- setdiff(numeric_cols, non_score_cols)
256
257 if (length(score_cols) == 0) {
258 return(result)
259 }
260
261 ranking_col <- thresholdScore
262 if (is.null(ranking_col) || is.na(ranking_col) || !ranking_col %in% score_cols) {
263 ranking_col <- if ("logDice" %in% score_cols) "logDice" else score_cols[1]
264 }
265
266 ranking_sym <- rlang::sym(ranking_col)
267
268 result <- result |>
269 dplyr::group_by(label) |>
270 dplyr::mutate(rankWithinLabel = dplyr::row_number(dplyr::desc(!!ranking_sym))) |>
271 dplyr::ungroup()
272
273 comparison <- result |>
274 dplyr::select(node, collocate, label, rankWithinLabel, dplyr::all_of(score_cols)) |>
275 pivot_wider(
276 names_from = label,
277 values_from = c(rankWithinLabel, dplyr::all_of(score_cols)),
278 names_glue = "{.value}_{make.names(label)}",
279 values_fn = dplyr::first
280 )
281
282 labels <- make.names(unique(result$label))
283
284 if (length(labels) == 2) {
285 fill_scores <- function(x, y) {
286 min_val <- suppressWarnings(min(c(x, y), na.rm = TRUE))
287 if (!is.finite(min_val)) {
288 min_val <- 0
289 }
290 x[is.na(x)] <- missingScoreFactor * min_val
291 y[is.na(y)] <- missingScoreFactor * min_val
292 list(x = x, y = y)
293 }
294
295 fill_ranks <- function(x, y) {
296 max_val <- suppressWarnings(max(c(x, y), na.rm = TRUE))
297 if (!is.finite(max_val)) {
298 max_val <- 0
299 }
300 x[is.na(x)] <- max_val + 1
301 y[is.na(y)] <- max_val + 1
302 list(x = x, y = y)
303 }
304
305 left_label <- labels[1]
306 right_label <- labels[2]
307
308 for (col in score_cols) {
309 left_col <- paste0(col, "_", left_label)
310 right_col <- paste0(col, "_", right_label)
311 if (!all(c(left_col, right_col) %in% names(comparison))) {
312 next
313 }
314 filled <- fill_scores(comparison[[left_col]], comparison[[right_col]])
315 comparison[[paste0("delta_", col)]] <- filled$x - filled$y
316 }
317
318 left_rank <- paste0("rankWithinLabel_", left_label)
319 right_rank <- paste0("rankWithinLabel_", right_label)
320 if (all(c(left_rank, right_rank) %in% names(comparison))) {
321 filled_rank <- fill_ranks(comparison[[left_rank]], comparison[[right_rank]])
322 comparison[["delta_rankWithinLabel"]] <- filled_rank$x - filled_rank$y
323 }
324 }
325
326 dplyr::left_join(result, comparison, by = c("node", "collocate"))
327}
328
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200329#' @importFrom magrittr debug_pipe
Marc Kupietz2b17b212023-08-27 17:47:26 +0200330#' @importFrom stringr str_detect
331#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
332#'
333matches2FreqTable <- function(matches,
334 index = 0,
335 minOccur = 5,
336 leftContextSize = 5,
337 rightContextSize = 5,
338 ignoreCollocateCase = FALSE,
339 stopwords = c(),
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200340 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietz2b17b212023-08-27 17:47:26 +0200341 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
342 verbose = TRUE) {
343 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
344 frequency <- NULL
345
346 if (nrow(matches) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200347 dplyr::tibble(word = c(), frequency = c())
Marc Kupietz2b17b212023-08-27 17:47:26 +0200348 } else if (index == 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200349 if (!"tokens" %in% colnames(matches) || !is.list(matches$tokens)) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200350 log_info(verbose, "Outdated KorAP server: Falling back to client side tokenization.\n")
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200351 return(snippet2FreqTable(matches$snippet, minOccur, leftContextSize, rightContextSize,
352 ignoreCollocateCase = ignoreCollocateCase,
353 stopwords = stopwords, oldTable = oldTable, verbose = verbose
354 ))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200355 }
356 log_info(verbose, paste("Joining", nrow(matches), "kwics\n"))
Marc Kupietza25fbd92025-10-14 17:38:09 +0200357 for (i in seq_len(nrow(matches))) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200358 oldTable <- matches2FreqTable(
359 matches,
360 i,
361 leftContextSize = leftContextSize,
362 rightContextSize = rightContextSize,
363 collocateFilterRegex = collocateFilterRegex,
364 oldTable = oldTable,
365 stopwords = stopwords
366 )
367 }
368 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200369 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100370 group_by(word) |>
371 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200372 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +0200373 arrange(desc(frequency))
374 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200375 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietz2b17b212023-08-27 17:47:26 +0200376
377 left <- tail(unlist(matches$tokens$left[index]), leftContextSize)
378
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200379 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200380
381 right <- head(unlist(matches$tokens$right[index]), rightContextSize)
382
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200383 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200384
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200385 if (length(left) + length(right) == 0) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200386 oldTable
387 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100388 table(c(left, right)) |>
389 dplyr::as_tibble(.name_repair = "minimal") |>
390 dplyr::rename(word = 1, frequency = 2) |>
391 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200392 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +0200393 dplyr::bind_rows(oldTable)
394 }
395 }
396}
397
398#' @importFrom magrittr debug_pipe
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200399#' @importFrom stringr str_match str_split str_detect
400#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
401#'
402snippet2FreqTable <- function(snippet,
403 minOccur = 5,
404 leftContextSize = 5,
405 rightContextSize = 5,
406 ignoreCollocateCase = FALSE,
407 stopwords = c(),
408 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200409 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200410 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
411 verbose = TRUE) {
412 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
413 frequency <- NULL
414
415 if (length(snippet) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200416 dplyr::tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200417 } else if (length(snippet) > 1) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200418 log_info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200419 for (s in snippet) {
420 oldTable <- snippet2FreqTable(
421 s,
422 leftContextSize = leftContextSize,
423 rightContextSize = rightContextSize,
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100424 collocateFilterRegex = collocateFilterRegex,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200425 oldTable = oldTable,
426 stopwords = stopwords
427 )
428 }
Marc Kupietza47d1502023-04-18 15:26:47 +0200429 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200430 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100431 group_by(word) |>
432 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200433 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200434 arrange(desc(frequency))
435 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200436 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200437 match <-
438 str_match(
439 snippet,
440 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
441 )
442
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200443 left <- if (leftContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200444 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200445 } else {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200446 ""
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200447 }
448 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200449
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200450 right <- if (rightContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200451 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200452 } else {
453 ""
454 }
455 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200456
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200457 if (is.na(left[1]) || is.na(right[1]) || length(left) + length(right) == 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200458 oldTable
459 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100460 table(c(left, right)) |>
461 dplyr::as_tibble(.name_repair = "minimal") |>
462 dplyr::rename(word = 1, frequency = 2) |>
463 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200464 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200465 dplyr::bind_rows(oldTable)
466 }
467 }
468}
469
470#' Preliminary synsemantic stopwords function
471#'
472#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +0200473#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200474#'
475#' Preliminary synsemantic stopwords function to be used in collocation analysis.
476#'
477#' @details
478#' Currently only suitable for German. See stopwords package for other languages.
479#'
480#' @param ... future arguments for language detection
481#'
482#' @family collocation analysis functions
483#' @return Vector of synsemantic stopwords.
484#' @export
485synsemanticStopwords <- function(...) {
486 res <- c(
487 "der",
488 "die",
489 "und",
490 "in",
491 "den",
492 "von",
493 "mit",
494 "das",
495 "zu",
496 "im",
497 "ist",
498 "auf",
499 "sich",
500 "Die",
501 "des",
502 "dem",
503 "nicht",
504 "ein",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100505 "Ein",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200506 "eine",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100507 "Eine",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200508 "es",
509 "auch",
510 "an",
511 "als",
512 "am",
513 "aus",
514 "Der",
515 "bei",
516 "er",
517 "dass",
518 "sie",
519 "nach",
520 "um",
521 "Das",
522 "zum",
523 "noch",
524 "war",
525 "einen",
526 "einer",
527 "wie",
528 "einem",
529 "vor",
530 "bis",
531 "\u00fcber",
532 "so",
533 "aber",
534 "Eine",
535 "diese",
536 "Diese",
537 "oder"
538 )
539 return(res)
540}
541
Marc Kupietz5a336b62021-11-27 17:51:35 +0100542
Marc Kupietz76b05592021-12-19 16:26:15 +0100543# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100544findExample <-
545 function(kco,
546 query,
547 vc = "",
548 matchOnly = TRUE) {
549 out <- character(length = length(query))
550
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200551 if (length(vc) < length(query)) {
Marc Kupietz5a336b62021-11-27 17:51:35 +0100552 vc <- rep(vc, length(query))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200553 }
Marc Kupietz5a336b62021-11-27 17:51:35 +0100554
555 for (i in seq_along(query)) {
556 q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100557 if (q@totalResults > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200558 q <- fetchNext(q, maxFetch = 50, randomizePageOrder = F)
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100559 example <- as.character((q@collectedMatches)$snippet[1])
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200560 out[i] <- if (matchOnly) {
561 gsub(".*<mark>(.+)</mark>.*", "\\1", example)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100562 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200563 stringr::str_replace(example, "<[^>]*>", "")
Marc Kupietz5a336b62021-11-27 17:51:35 +0100564 }
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100565 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200566 out[i] <- ""
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100567 }
Marc Kupietz5a336b62021-11-27 17:51:35 +0100568 }
569 out
570 }
571
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200572collocatesQuery <-
573 function(kco,
574 query,
575 vc = "",
576 minOccur = 5,
577 leftContextSize = 5,
578 rightContextSize = 5,
579 searchHitsSampleLimit = 20000,
580 ignoreCollocateCase = FALSE,
581 stopwords = c(),
582 ...) {
583 frequency <- NULL
584 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200585 if (q@totalResults == 0) {
586 tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200587 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200588 q <- fetchNext(q, maxFetch = searchHitsSampleLimit, randomizePageOrder = TRUE)
589 matches2FreqTable(q@collectedMatches,
590 0,
591 minOccur = minOccur,
592 leftContextSize = leftContextSize,
593 rightContextSize = rightContextSize,
594 ignoreCollocateCase = ignoreCollocateCase,
595 stopwords = stopwords,
596 ...,
597 verbose = kco@verbose
598 ) |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100599 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200600 filter(frequency >= minOccur)
601 }
602 }