blob: bbd017dfc9d6c62d0e97f35cb44bda44c21e3bb7 [file] [log] [blame]
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001#' @include logging.R
2setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02003
4#' Collocation analysis
5#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02006#' @family collocation analysis functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +02007#' @aliases collocationAnalysis
8#'
9#' @description
Marc Kupietzdbd431a2021-08-29 12:17:45 +020010#'
11#' Performs a collocation analysis for the given node (or query)
12#' in the given virtual corpus.
13#'
14#' @details
15#' The collocation analysis is currently implemented on the client side, as some of the
16#' functionality is not yet provided by the KorAP backend. Mainly for this reason
17#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
18#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
19#' and look for expression-internal collocates using the focus function (see examples and demo).
20#'
21#' To increase speed at the cost of accuracy and possible false negatives,
22#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
23#'
Marc Kupietze7f0d682025-02-19 10:50:59 +010024#' Note that some outdated non-DeReKo back-ends might not yet support returning tokenized matches (warning issued).
25#' In this case, the client library will fall back to client-side tokenization which might be slightly less accurate.
26#' This might lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
Marc Kupietzdbd431a2021-08-29 12:17:45 +020027#' user interface.
28#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020029#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020030#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietzdbd431a2021-08-29 12:17:45 +020031#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
32#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
33#' @param searchHitsSampleLimit limit the size of the search hits sample
34#' @param stopwords vector of stopwords not to be considered as collocates
Marc Kupietz6bd9cad2024-12-18 15:57:26 +010035#' @param withinSpan KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`.
Marc Kupietzdbd431a2021-08-29 12:17:45 +020036#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
37#' @param seed seed for random page collecting order
Marc Kupietz67edcb52021-09-20 21:54:24 +020038#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
Marc Kupietz7d400e02021-12-19 16:39:36 +010039#' @param maxRecurse apply collocation analysis recursively `maxRecurse` times
40#' @param addExamples If TRUE, examples for instances of collocations will be added in a column `example`. This makes a difference in particular if `node` is given as a lemma query.
41#' @param thresholdScore association score function (see \code{\link{association-score-functions}}) to use for computing the threshold that is applied for recursive collocation analysis calls
42#' @param threshold minimum value of `thresholdScore` function call to apply collocation analysis recursively
43#' @param localStopwords vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010044#' @param collocateFilterRegex allow only collocates matching the regular expression
Marc Kupietzc4540a22025-10-14 17:39:53 +020045#' @param multiVcMissingScoreFactor factor that is multiplied with the minimum observed score when imputing missing scores for delta computations between virtual corpora
Marc Kupietz67edcb52021-09-20 21:54:24 +020046#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietzdbd431a2021-08-29 12:17:45 +020047#' @inheritParams collocationScoreQuery,KorAPConnection-method
48#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
49#'
Marc Kupietzc4540a22025-10-14 17:39:53 +020050#' @importFrom dplyr arrange desc slice_head bind_rows group_by mutate ungroup left_join select row_number all_of first
Marc Kupietzdbd431a2021-08-29 12:17:45 +020051#' @importFrom purrr pmap
Marc Kupietzc4540a22025-10-14 17:39:53 +020052#' @importFrom tidyr expand_grid pivot_wider
53#' @importFrom rlang sym
Marc Kupietzdbd431a2021-08-29 12:17:45 +020054#'
55#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020056#' \dontrun{
57#'
Marc Kupietz6dfeed92025-06-03 11:58:06 +020058#' # Find top collocates of "Packung" inside and outside the sports domain.
59#' KorAPConnection(verbose = TRUE) |>
60#' collocationAnalysis("Packung",
61#' vc = c("textClass=sport", "textClass!=sport"),
62#' leftContextSize = 1, rightContextSize = 1, topCollocatesLimit = 20
63#' ) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020064#' dplyr::filter(logDice >= 5)
65#' }
66#'
Marc Kupietz6ae76052021-09-21 10:34:00 +020067#' \dontrun{
68#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020069#' # Identify the most prominent light verb construction with "in ... setzen".
70#' # Note that, currently, the use of focus function disallows exactFrequencies.
Marc Kupietz4cd066d2025-02-28 15:48:23 +010071#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020072#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
Marc Kupietz6dfeed92025-06-03 11:58:06 +020073#' leftContextSize = 1, rightContextSize = 0, exactFrequencies = FALSE, topCollocatesLimit = 20
74#' )
Marc Kupietzdbd431a2021-08-29 12:17:45 +020075#' }
76#'
77#' @export
Marc Kupietz6dfeed92025-06-03 11:58:06 +020078setMethod(
79 "collocationAnalysis", "KorAPConnection",
80 function(kco,
81 node,
82 vc = "",
83 lemmatizeNodeQuery = FALSE,
84 minOccur = 5,
85 leftContextSize = 5,
86 rightContextSize = 5,
87 topCollocatesLimit = 200,
88 searchHitsSampleLimit = 20000,
89 ignoreCollocateCase = FALSE,
90 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
91 exactFrequencies = TRUE,
92 stopwords = append(RKorAPClient::synsemanticStopwords(), node),
93 seed = 7,
94 expand = length(vc) != length(node),
95 maxRecurse = 0,
96 addExamples = FALSE,
97 thresholdScore = "logDice",
98 threshold = 2.0,
99 localStopwords = c(),
100 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzc4540a22025-10-14 17:39:53 +0200101 multiVcMissingScoreFactor = 0.9,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200102 ...) {
103 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
104 word <- frequency <- O <- NULL
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200105
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200106 if (!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan) > 0)) {
107 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
108 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200109
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200110 warnIfNotAuthorized(kco)
Marc Kupietz581a29b2021-09-04 20:51:04 +0200111
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200112 if (lemmatizeNodeQuery) {
113 node <- lemmatizeWordQuery(node)
114 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200115
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200116 result <- if (length(node) > 1 || length(vc) > 1) {
117 grid <- if (expand) expand_grid(node = node, vc = vc) else tibble(node = node, vc = vc)
118 purrr::pmap(grid, function(node, vc, ...) {
119 collocationAnalysis(kco,
120 node = node,
121 vc = vc,
122 minOccur = minOccur,
123 leftContextSize = leftContextSize,
124 rightContextSize = rightContextSize,
125 topCollocatesLimit = topCollocatesLimit,
126 searchHitsSampleLimit = searchHitsSampleLimit,
127 ignoreCollocateCase = ignoreCollocateCase,
128 withinSpan = withinSpan,
129 exactFrequencies = exactFrequencies,
130 stopwords = stopwords,
131 addExamples = TRUE,
132 localStopwords = localStopwords,
133 seed = seed,
134 expand = expand,
Marc Kupietzc4540a22025-10-14 17:39:53 +0200135 multiVcMissingScoreFactor = multiVcMissingScoreFactor,
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200136 ...
137 )
138 }) |>
Marc Kupietz4a2fee02025-10-13 13:27:09 +0200139 bind_rows() |>
Marc Kupietzc4540a22025-10-14 17:39:53 +0200140 mutate(label = queryStringToLabel(vc)) |>
141 add_multi_vc_comparisons(thresholdScore = thresholdScore, missingScoreFactor = multiVcMissingScoreFactor)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200142 } else {
143 set.seed(seed)
144 candidates <- collocatesQuery(
145 kco,
146 node,
147 vc = vc,
148 minOccur = minOccur,
149 leftContextSize = leftContextSize,
150 rightContextSize = rightContextSize,
151 searchHitsSampleLimit = searchHitsSampleLimit,
152 ignoreCollocateCase = ignoreCollocateCase,
153 stopwords = append(stopwords, localStopwords),
154 ...
155 )
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200156
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200157 if (nrow(candidates) > 0) {
158 candidates <- candidates |>
159 filter(frequency >= minOccur) |>
160 slice_head(n = topCollocatesLimit)
161 collocationScoreQuery(
162 kco,
163 node = node,
164 collocate = candidates$word,
165 vc = vc,
166 leftContextSize = leftContextSize,
167 rightContextSize = rightContextSize,
168 observed = if (exactFrequencies) NA else candidates$frequency,
169 ignoreCollocateCase = ignoreCollocateCase,
170 withinSpan = withinSpan,
171 ...
172 ) |>
173 filter(O >= minOccur) |>
174 dplyr::arrange(dplyr::desc(logDice))
175 } else {
176 tibble()
177 }
178 }
179 if (maxRecurse > 0 & length(result) > 0 && any(!!thresholdScore >= threshold)) {
180 recurseWith <- result |>
181 filter(!!as.name(thresholdScore) >= threshold)
182 result <- collocationAnalysis(
183 kco,
184 node = paste0("(", buildCollocationQuery(
185 removeWithinSpan(recurseWith$node, withinSpan),
186 recurseWith$collocate,
187 leftContextSize = leftContextSize,
188 rightContextSize = rightContextSize,
189 withinSpan = ""
190 ), ")"),
191 vc = vc,
192 minOccur = minOccur,
193 leftContextSize = leftContextSize,
194 rightContextSize = rightContextSize,
195 withinSpan = withinSpan,
196 maxRecurse = maxRecurse - 1,
197 stopwords = stopwords,
198 localStopwords = recurseWith$collocate,
199 exactFrequencies = exactFrequencies,
200 searchHitsSampleLimit = searchHitsSampleLimit,
201 topCollocatesLimit = topCollocatesLimit,
Marc Kupietzc4540a22025-10-14 17:39:53 +0200202 addExamples = FALSE,
203 multiVcMissingScoreFactor = multiVcMissingScoreFactor
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200204 ) |>
205 bind_rows(result) |>
206 filter(logDice >= 2) |>
Marc Kupietze9e18bd2025-06-04 17:15:02 +0200207 filter(O >= minOccur) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200208 dplyr::arrange(dplyr::desc(logDice))
209 }
210 if (addExamples && length(result) > 0) {
211 result$query <- buildCollocationQuery(
212 result$node,
213 result$collocate,
214 leftContextSize = leftContextSize,
215 rightContextSize = rightContextSize,
216 withinSpan = withinSpan
217 )
218 result$example <- findExample(
219 kco,
220 query = result$query,
221 vc = result$vc
222 )
223 }
224 result
225 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200226)
227
Marc Kupietz76b05592021-12-19 16:26:15 +0100228# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100229removeWithinSpan <- function(query, withinSpan) {
230 if (withinSpan == "") {
231 return(query)
232 }
233 needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200234 res <- gsub(needle, "\\1", query)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100235 needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200236 res <- gsub(needle, "\\1", res)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100237 return(res)
238}
239
Marc Kupietzc4540a22025-10-14 17:39:53 +0200240add_multi_vc_comparisons <- function(result, thresholdScore, missingScoreFactor) {
241 label <- node <- collocate <- rankWithinLabel <- NULL
242
243 if (!"label" %in% names(result) || dplyr::n_distinct(result$label) < 2) {
244 return(result)
245 }
246
247 numeric_cols <- names(result)[vapply(result, is.numeric, logical(1))]
248 non_score_cols <- c("N", "O", "O1", "O2", "E", "w", "leftContextSize", "rightContextSize", "frequency")
249 score_cols <- setdiff(numeric_cols, non_score_cols)
250
251 if (length(score_cols) == 0) {
252 return(result)
253 }
254
255 ranking_col <- thresholdScore
256 if (is.null(ranking_col) || is.na(ranking_col) || !ranking_col %in% score_cols) {
257 ranking_col <- if ("logDice" %in% score_cols) "logDice" else score_cols[1]
258 }
259
260 ranking_sym <- rlang::sym(ranking_col)
261
262 result <- result |>
263 dplyr::group_by(label) |>
264 dplyr::mutate(rankWithinLabel = dplyr::row_number(dplyr::desc(!!ranking_sym))) |>
265 dplyr::ungroup()
266
267 comparison <- result |>
268 dplyr::select(node, collocate, label, rankWithinLabel, dplyr::all_of(score_cols)) |>
269 pivot_wider(
270 names_from = label,
271 values_from = c(rankWithinLabel, dplyr::all_of(score_cols)),
272 names_glue = "{.value}_{make.names(label)}",
273 values_fn = dplyr::first
274 )
275
276 labels <- make.names(unique(result$label))
277
278 if (length(labels) == 2) {
279 fill_scores <- function(x, y) {
280 min_val <- suppressWarnings(min(c(x, y), na.rm = TRUE))
281 if (!is.finite(min_val)) {
282 min_val <- 0
283 }
284 x[is.na(x)] <- missingScoreFactor * min_val
285 y[is.na(y)] <- missingScoreFactor * min_val
286 list(x = x, y = y)
287 }
288
289 fill_ranks <- function(x, y) {
290 max_val <- suppressWarnings(max(c(x, y), na.rm = TRUE))
291 if (!is.finite(max_val)) {
292 max_val <- 0
293 }
294 x[is.na(x)] <- max_val + 1
295 y[is.na(y)] <- max_val + 1
296 list(x = x, y = y)
297 }
298
299 left_label <- labels[1]
300 right_label <- labels[2]
301
302 for (col in score_cols) {
303 left_col <- paste0(col, "_", left_label)
304 right_col <- paste0(col, "_", right_label)
305 if (!all(c(left_col, right_col) %in% names(comparison))) {
306 next
307 }
308 filled <- fill_scores(comparison[[left_col]], comparison[[right_col]])
309 comparison[[paste0("delta_", col)]] <- filled$x - filled$y
310 }
311
312 left_rank <- paste0("rankWithinLabel_", left_label)
313 right_rank <- paste0("rankWithinLabel_", right_label)
314 if (all(c(left_rank, right_rank) %in% names(comparison))) {
315 filled_rank <- fill_ranks(comparison[[left_rank]], comparison[[right_rank]])
316 comparison[["delta_rankWithinLabel"]] <- filled_rank$x - filled_rank$y
317 }
318 }
319
320 dplyr::left_join(result, comparison, by = c("node", "collocate"))
321}
322
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200323#' @importFrom magrittr debug_pipe
Marc Kupietz2b17b212023-08-27 17:47:26 +0200324#' @importFrom stringr str_detect
325#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
326#'
327matches2FreqTable <- function(matches,
328 index = 0,
329 minOccur = 5,
330 leftContextSize = 5,
331 rightContextSize = 5,
332 ignoreCollocateCase = FALSE,
333 stopwords = c(),
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200334 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietz2b17b212023-08-27 17:47:26 +0200335 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
336 verbose = TRUE) {
337 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
338 frequency <- NULL
339
340 if (nrow(matches) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200341 dplyr::tibble(word = c(), frequency = c())
Marc Kupietz2b17b212023-08-27 17:47:26 +0200342 } else if (index == 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200343 if (!"tokens" %in% colnames(matches) || !is.list(matches$tokens)) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200344 log_info(verbose, "Outdated KorAP server: Falling back to client side tokenization.\n")
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200345 return(snippet2FreqTable(matches$snippet, minOccur, leftContextSize, rightContextSize,
346 ignoreCollocateCase = ignoreCollocateCase,
347 stopwords = stopwords, oldTable = oldTable, verbose = verbose
348 ))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200349 }
350 log_info(verbose, paste("Joining", nrow(matches), "kwics\n"))
Marc Kupietza25fbd92025-10-14 17:38:09 +0200351 for (i in seq_len(nrow(matches))) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200352 oldTable <- matches2FreqTable(
353 matches,
354 i,
355 leftContextSize = leftContextSize,
356 rightContextSize = rightContextSize,
357 collocateFilterRegex = collocateFilterRegex,
358 oldTable = oldTable,
359 stopwords = stopwords
360 )
361 }
362 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200363 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100364 group_by(word) |>
365 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200366 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +0200367 arrange(desc(frequency))
368 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200369 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietz2b17b212023-08-27 17:47:26 +0200370
371 left <- tail(unlist(matches$tokens$left[index]), leftContextSize)
372
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200373 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200374
375 right <- head(unlist(matches$tokens$right[index]), rightContextSize)
376
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200377 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200378
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200379 if (length(left) + length(right) == 0) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200380 oldTable
381 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100382 table(c(left, right)) |>
383 dplyr::as_tibble(.name_repair = "minimal") |>
384 dplyr::rename(word = 1, frequency = 2) |>
385 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200386 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +0200387 dplyr::bind_rows(oldTable)
388 }
389 }
390}
391
392#' @importFrom magrittr debug_pipe
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200393#' @importFrom stringr str_match str_split str_detect
394#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
395#'
396snippet2FreqTable <- function(snippet,
397 minOccur = 5,
398 leftContextSize = 5,
399 rightContextSize = 5,
400 ignoreCollocateCase = FALSE,
401 stopwords = c(),
402 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200403 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200404 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
405 verbose = TRUE) {
406 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
407 frequency <- NULL
408
409 if (length(snippet) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200410 dplyr::tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200411 } else if (length(snippet) > 1) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200412 log_info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200413 for (s in snippet) {
414 oldTable <- snippet2FreqTable(
415 s,
416 leftContextSize = leftContextSize,
417 rightContextSize = rightContextSize,
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100418 collocateFilterRegex = collocateFilterRegex,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200419 oldTable = oldTable,
420 stopwords = stopwords
421 )
422 }
Marc Kupietza47d1502023-04-18 15:26:47 +0200423 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200424 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100425 group_by(word) |>
426 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200427 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200428 arrange(desc(frequency))
429 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200430 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200431 match <-
432 str_match(
433 snippet,
434 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
435 )
436
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200437 left <- if (leftContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200438 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200439 } else {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200440 ""
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200441 }
442 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200443
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200444 right <- if (rightContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200445 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200446 } else {
447 ""
448 }
449 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200450
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200451 if (is.na(left[1]) || is.na(right[1]) || length(left) + length(right) == 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200452 oldTable
453 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100454 table(c(left, right)) |>
455 dplyr::as_tibble(.name_repair = "minimal") |>
456 dplyr::rename(word = 1, frequency = 2) |>
457 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200458 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200459 dplyr::bind_rows(oldTable)
460 }
461 }
462}
463
464#' Preliminary synsemantic stopwords function
465#'
466#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +0200467#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200468#'
469#' Preliminary synsemantic stopwords function to be used in collocation analysis.
470#'
471#' @details
472#' Currently only suitable for German. See stopwords package for other languages.
473#'
474#' @param ... future arguments for language detection
475#'
476#' @family collocation analysis functions
477#' @return Vector of synsemantic stopwords.
478#' @export
479synsemanticStopwords <- function(...) {
480 res <- c(
481 "der",
482 "die",
483 "und",
484 "in",
485 "den",
486 "von",
487 "mit",
488 "das",
489 "zu",
490 "im",
491 "ist",
492 "auf",
493 "sich",
494 "Die",
495 "des",
496 "dem",
497 "nicht",
498 "ein",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100499 "Ein",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200500 "eine",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100501 "Eine",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200502 "es",
503 "auch",
504 "an",
505 "als",
506 "am",
507 "aus",
508 "Der",
509 "bei",
510 "er",
511 "dass",
512 "sie",
513 "nach",
514 "um",
515 "Das",
516 "zum",
517 "noch",
518 "war",
519 "einen",
520 "einer",
521 "wie",
522 "einem",
523 "vor",
524 "bis",
525 "\u00fcber",
526 "so",
527 "aber",
528 "Eine",
529 "diese",
530 "Diese",
531 "oder"
532 )
533 return(res)
534}
535
Marc Kupietz5a336b62021-11-27 17:51:35 +0100536
Marc Kupietz76b05592021-12-19 16:26:15 +0100537# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100538findExample <-
539 function(kco,
540 query,
541 vc = "",
542 matchOnly = TRUE) {
543 out <- character(length = length(query))
544
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200545 if (length(vc) < length(query)) {
Marc Kupietz5a336b62021-11-27 17:51:35 +0100546 vc <- rep(vc, length(query))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200547 }
Marc Kupietz5a336b62021-11-27 17:51:35 +0100548
549 for (i in seq_along(query)) {
550 q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100551 if (q@totalResults > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200552 q <- fetchNext(q, maxFetch = 50, randomizePageOrder = F)
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100553 example <- as.character((q@collectedMatches)$snippet[1])
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200554 out[i] <- if (matchOnly) {
555 gsub(".*<mark>(.+)</mark>.*", "\\1", example)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100556 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200557 stringr::str_replace(example, "<[^>]*>", "")
Marc Kupietz5a336b62021-11-27 17:51:35 +0100558 }
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100559 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200560 out[i] <- ""
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100561 }
Marc Kupietz5a336b62021-11-27 17:51:35 +0100562 }
563 out
564 }
565
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200566collocatesQuery <-
567 function(kco,
568 query,
569 vc = "",
570 minOccur = 5,
571 leftContextSize = 5,
572 rightContextSize = 5,
573 searchHitsSampleLimit = 20000,
574 ignoreCollocateCase = FALSE,
575 stopwords = c(),
576 ...) {
577 frequency <- NULL
578 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200579 if (q@totalResults == 0) {
580 tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200581 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200582 q <- fetchNext(q, maxFetch = searchHitsSampleLimit, randomizePageOrder = TRUE)
583 matches2FreqTable(q@collectedMatches,
584 0,
585 minOccur = minOccur,
586 leftContextSize = leftContextSize,
587 rightContextSize = rightContextSize,
588 ignoreCollocateCase = ignoreCollocateCase,
589 stopwords = stopwords,
590 ...,
591 verbose = kco@verbose
592 ) |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100593 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200594 filter(frequency >= minOccur)
595 }
596 }