blob: 16987c60fd44063ecaff07ae93d7d2c48d6d3553 [file] [log] [blame]
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001#' @include logging.R
2setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02003
4#' Collocation analysis
5#'
6#' @aliases collocationAnalysis
7#'
8#' @description
Marc Kupietzdbd431a2021-08-29 12:17:45 +02009#'
10#' Performs a collocation analysis for the given node (or query)
11#' in the given virtual corpus.
12#'
13#' @details
14#' The collocation analysis is currently implemented on the client side, as some of the
15#' functionality is not yet provided by the KorAP backend. Mainly for this reason
16#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
17#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
18#' and look for expression-internal collocates using the focus function (see examples and demo).
19#'
20#' To increase speed at the cost of accuracy and possible false negatives,
21#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
22#'
Marc Kupietze7f0d682025-02-19 10:50:59 +010023#' Note that some outdated non-DeReKo back-ends might not yet support returning tokenized matches (warning issued).
24#' In this case, the client library will fall back to client-side tokenization which might be slightly less accurate.
25#' This might lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
Marc Kupietzdbd431a2021-08-29 12:17:45 +020026#' user interface.
27#'
28#' @family collocation analysis functions
29#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020030#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietzdbd431a2021-08-29 12:17:45 +020031#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
32#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
33#' @param searchHitsSampleLimit limit the size of the search hits sample
34#' @param stopwords vector of stopwords not to be considered as collocates
Marc Kupietz6bd9cad2024-12-18 15:57:26 +010035#' @param withinSpan KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`.
Marc Kupietzdbd431a2021-08-29 12:17:45 +020036#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
37#' @param seed seed for random page collecting order
Marc Kupietz67edcb52021-09-20 21:54:24 +020038#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
Marc Kupietz7d400e02021-12-19 16:39:36 +010039#' @param maxRecurse apply collocation analysis recursively `maxRecurse` times
40#' @param addExamples If TRUE, examples for instances of collocations will be added in a column `example`. This makes a difference in particular if `node` is given as a lemma query.
41#' @param thresholdScore association score function (see \code{\link{association-score-functions}}) to use for computing the threshold that is applied for recursive collocation analysis calls
42#' @param threshold minimum value of `thresholdScore` function call to apply collocation analysis recursively
43#' @param localStopwords vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010044#' @param collocateFilterRegex allow only collocates matching the regular expression
Marc Kupietz67edcb52021-09-20 21:54:24 +020045#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietzdbd431a2021-08-29 12:17:45 +020046#' @inheritParams collocationScoreQuery,KorAPConnection-method
47#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
48#'
Marc Kupietz2b17b212023-08-27 17:47:26 +020049#' @importFrom dplyr arrange desc slice_head bind_rows
Marc Kupietzdbd431a2021-08-29 12:17:45 +020050#' @importFrom purrr pmap
51#' @importFrom tidyr expand_grid
52#'
53#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020054#' \dontrun{
55#'
Marc Kupietz6dfeed92025-06-03 11:58:06 +020056#' # Find top collocates of "Packung" inside and outside the sports domain.
57#' KorAPConnection(verbose = TRUE) |>
58#' collocationAnalysis("Packung",
59#' vc = c("textClass=sport", "textClass!=sport"),
60#' leftContextSize = 1, rightContextSize = 1, topCollocatesLimit = 20
61#' ) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020062#' dplyr::filter(logDice >= 5)
63#' }
64#'
Marc Kupietz6ae76052021-09-21 10:34:00 +020065#' \dontrun{
66#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020067#' # Identify the most prominent light verb construction with "in ... setzen".
68#' # Note that, currently, the use of focus function disallows exactFrequencies.
Marc Kupietz4cd066d2025-02-28 15:48:23 +010069#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020070#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
Marc Kupietz6dfeed92025-06-03 11:58:06 +020071#' leftContextSize = 1, rightContextSize = 0, exactFrequencies = FALSE, topCollocatesLimit = 20
72#' )
Marc Kupietzdbd431a2021-08-29 12:17:45 +020073#' }
74#'
75#' @export
Marc Kupietz6dfeed92025-06-03 11:58:06 +020076setMethod(
77 "collocationAnalysis", "KorAPConnection",
78 function(kco,
79 node,
80 vc = "",
81 lemmatizeNodeQuery = FALSE,
82 minOccur = 5,
83 leftContextSize = 5,
84 rightContextSize = 5,
85 topCollocatesLimit = 200,
86 searchHitsSampleLimit = 20000,
87 ignoreCollocateCase = FALSE,
88 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
89 exactFrequencies = TRUE,
90 stopwords = append(RKorAPClient::synsemanticStopwords(), node),
91 seed = 7,
92 expand = length(vc) != length(node),
93 maxRecurse = 0,
94 addExamples = FALSE,
95 thresholdScore = "logDice",
96 threshold = 2.0,
97 localStopwords = c(),
98 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
99 ...) {
100 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
101 word <- frequency <- O <- NULL
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200102
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200103 if (!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan) > 0)) {
104 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
105 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200106
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200107 warnIfNotAuthorized(kco)
Marc Kupietz581a29b2021-09-04 20:51:04 +0200108
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200109 if (lemmatizeNodeQuery) {
110 node <- lemmatizeWordQuery(node)
111 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200112
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200113 result <- if (length(node) > 1 || length(vc) > 1) {
114 grid <- if (expand) expand_grid(node = node, vc = vc) else tibble(node = node, vc = vc)
115 purrr::pmap(grid, function(node, vc, ...) {
116 collocationAnalysis(kco,
117 node = node,
118 vc = vc,
119 minOccur = minOccur,
120 leftContextSize = leftContextSize,
121 rightContextSize = rightContextSize,
122 topCollocatesLimit = topCollocatesLimit,
123 searchHitsSampleLimit = searchHitsSampleLimit,
124 ignoreCollocateCase = ignoreCollocateCase,
125 withinSpan = withinSpan,
126 exactFrequencies = exactFrequencies,
127 stopwords = stopwords,
128 addExamples = TRUE,
129 localStopwords = localStopwords,
130 seed = seed,
131 expand = expand,
132 ...
133 )
134 }) |>
135 bind_rows()
136 } else {
137 set.seed(seed)
138 candidates <- collocatesQuery(
139 kco,
140 node,
141 vc = vc,
142 minOccur = minOccur,
143 leftContextSize = leftContextSize,
144 rightContextSize = rightContextSize,
145 searchHitsSampleLimit = searchHitsSampleLimit,
146 ignoreCollocateCase = ignoreCollocateCase,
147 stopwords = append(stopwords, localStopwords),
148 ...
149 )
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200150
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200151 if (nrow(candidates) > 0) {
152 candidates <- candidates |>
153 filter(frequency >= minOccur) |>
154 slice_head(n = topCollocatesLimit)
155 collocationScoreQuery(
156 kco,
157 node = node,
158 collocate = candidates$word,
159 vc = vc,
160 leftContextSize = leftContextSize,
161 rightContextSize = rightContextSize,
162 observed = if (exactFrequencies) NA else candidates$frequency,
163 ignoreCollocateCase = ignoreCollocateCase,
164 withinSpan = withinSpan,
165 ...
166 ) |>
167 filter(O >= minOccur) |>
168 dplyr::arrange(dplyr::desc(logDice))
169 } else {
170 tibble()
171 }
172 }
173 if (maxRecurse > 0 & length(result) > 0 && any(!!thresholdScore >= threshold)) {
174 recurseWith <- result |>
175 filter(!!as.name(thresholdScore) >= threshold)
176 result <- collocationAnalysis(
177 kco,
178 node = paste0("(", buildCollocationQuery(
179 removeWithinSpan(recurseWith$node, withinSpan),
180 recurseWith$collocate,
181 leftContextSize = leftContextSize,
182 rightContextSize = rightContextSize,
183 withinSpan = ""
184 ), ")"),
185 vc = vc,
186 minOccur = minOccur,
187 leftContextSize = leftContextSize,
188 rightContextSize = rightContextSize,
189 withinSpan = withinSpan,
190 maxRecurse = maxRecurse - 1,
191 stopwords = stopwords,
192 localStopwords = recurseWith$collocate,
193 exactFrequencies = exactFrequencies,
194 searchHitsSampleLimit = searchHitsSampleLimit,
195 topCollocatesLimit = topCollocatesLimit,
196 addExamples = FALSE
197 ) |>
198 bind_rows(result) |>
199 filter(logDice >= 2) |>
200 filter(.$O >= minOccur) |>
201 dplyr::arrange(dplyr::desc(logDice))
202 }
203 if (addExamples && length(result) > 0) {
204 result$query <- buildCollocationQuery(
205 result$node,
206 result$collocate,
207 leftContextSize = leftContextSize,
208 rightContextSize = rightContextSize,
209 withinSpan = withinSpan
210 )
211 result$example <- findExample(
212 kco,
213 query = result$query,
214 vc = result$vc
215 )
216 }
217 result
218 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200219)
220
Marc Kupietz76b05592021-12-19 16:26:15 +0100221# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100222removeWithinSpan <- function(query, withinSpan) {
223 if (withinSpan == "") {
224 return(query)
225 }
226 needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200227 res <- gsub(needle, "\\1", query)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100228 needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200229 res <- gsub(needle, "\\1", res)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100230 return(res)
231}
232
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200233#' @importFrom magrittr debug_pipe
Marc Kupietz2b17b212023-08-27 17:47:26 +0200234#' @importFrom stringr str_detect
235#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
236#'
237matches2FreqTable <- function(matches,
238 index = 0,
239 minOccur = 5,
240 leftContextSize = 5,
241 rightContextSize = 5,
242 ignoreCollocateCase = FALSE,
243 stopwords = c(),
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200244 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietz2b17b212023-08-27 17:47:26 +0200245 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
246 verbose = TRUE) {
247 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
248 frequency <- NULL
249
250 if (nrow(matches) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200251 dplyr::tibble(word = c(), frequency = c())
Marc Kupietz2b17b212023-08-27 17:47:26 +0200252 } else if (index == 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200253 if (!"tokens" %in% colnames(matches) || !is.list(matches$tokens)) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200254 log_info(verbose, "Outdated KorAP server: Falling back to client side tokenization.\n")
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200255 return(snippet2FreqTable(matches$snippet, minOccur, leftContextSize, rightContextSize,
256 ignoreCollocateCase = ignoreCollocateCase,
257 stopwords = stopwords, oldTable = oldTable, verbose = verbose
258 ))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200259 }
260 log_info(verbose, paste("Joining", nrow(matches), "kwics\n"))
261 for (i in 1:nrow(matches)) {
262 oldTable <- matches2FreqTable(
263 matches,
264 i,
265 leftContextSize = leftContextSize,
266 rightContextSize = rightContextSize,
267 collocateFilterRegex = collocateFilterRegex,
268 oldTable = oldTable,
269 stopwords = stopwords
270 )
271 }
272 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200273 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100274 group_by(word) |>
275 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200276 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +0200277 arrange(desc(frequency))
278 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200279 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietz2b17b212023-08-27 17:47:26 +0200280
281 left <- tail(unlist(matches$tokens$left[index]), leftContextSize)
282
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200283 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200284
285 right <- head(unlist(matches$tokens$right[index]), rightContextSize)
286
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200287 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200288
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200289 if (length(left) + length(right) == 0) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200290 oldTable
291 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100292 table(c(left, right)) |>
293 dplyr::as_tibble(.name_repair = "minimal") |>
294 dplyr::rename(word = 1, frequency = 2) |>
295 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200296 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +0200297 dplyr::bind_rows(oldTable)
298 }
299 }
300}
301
302#' @importFrom magrittr debug_pipe
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200303#' @importFrom stringr str_match str_split str_detect
304#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
305#'
306snippet2FreqTable <- function(snippet,
307 minOccur = 5,
308 leftContextSize = 5,
309 rightContextSize = 5,
310 ignoreCollocateCase = FALSE,
311 stopwords = c(),
312 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200313 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200314 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
315 verbose = TRUE) {
316 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
317 frequency <- NULL
318
319 if (length(snippet) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200320 dplyr::tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200321 } else if (length(snippet) > 1) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200322 log_info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200323 for (s in snippet) {
324 oldTable <- snippet2FreqTable(
325 s,
326 leftContextSize = leftContextSize,
327 rightContextSize = rightContextSize,
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100328 collocateFilterRegex = collocateFilterRegex,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200329 oldTable = oldTable,
330 stopwords = stopwords
331 )
332 }
Marc Kupietza47d1502023-04-18 15:26:47 +0200333 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200334 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100335 group_by(word) |>
336 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200337 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200338 arrange(desc(frequency))
339 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200340 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200341 match <-
342 str_match(
343 snippet,
344 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
345 )
346
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200347 left <- if (leftContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200348 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200349 } else {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200350 ""
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200351 }
352 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200353
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200354 right <- if (rightContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200355 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200356 } else {
357 ""
358 }
359 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200360
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200361 if (is.na(left[1]) || is.na(right[1]) || length(left) + length(right) == 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200362 oldTable
363 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100364 table(c(left, right)) |>
365 dplyr::as_tibble(.name_repair = "minimal") |>
366 dplyr::rename(word = 1, frequency = 2) |>
367 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200368 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200369 dplyr::bind_rows(oldTable)
370 }
371 }
372}
373
374#' Preliminary synsemantic stopwords function
375#'
376#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +0200377#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200378#'
379#' Preliminary synsemantic stopwords function to be used in collocation analysis.
380#'
381#' @details
382#' Currently only suitable for German. See stopwords package for other languages.
383#'
384#' @param ... future arguments for language detection
385#'
386#' @family collocation analysis functions
387#' @return Vector of synsemantic stopwords.
388#' @export
389synsemanticStopwords <- function(...) {
390 res <- c(
391 "der",
392 "die",
393 "und",
394 "in",
395 "den",
396 "von",
397 "mit",
398 "das",
399 "zu",
400 "im",
401 "ist",
402 "auf",
403 "sich",
404 "Die",
405 "des",
406 "dem",
407 "nicht",
408 "ein",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100409 "Ein",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200410 "eine",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100411 "Eine",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200412 "es",
413 "auch",
414 "an",
415 "als",
416 "am",
417 "aus",
418 "Der",
419 "bei",
420 "er",
421 "dass",
422 "sie",
423 "nach",
424 "um",
425 "Das",
426 "zum",
427 "noch",
428 "war",
429 "einen",
430 "einer",
431 "wie",
432 "einem",
433 "vor",
434 "bis",
435 "\u00fcber",
436 "so",
437 "aber",
438 "Eine",
439 "diese",
440 "Diese",
441 "oder"
442 )
443 return(res)
444}
445
Marc Kupietz5a336b62021-11-27 17:51:35 +0100446
Marc Kupietz76b05592021-12-19 16:26:15 +0100447# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100448findExample <-
449 function(kco,
450 query,
451 vc = "",
452 matchOnly = TRUE) {
453 out <- character(length = length(query))
454
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200455 if (length(vc) < length(query)) {
Marc Kupietz5a336b62021-11-27 17:51:35 +0100456 vc <- rep(vc, length(query))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200457 }
Marc Kupietz5a336b62021-11-27 17:51:35 +0100458
459 for (i in seq_along(query)) {
460 q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100461 if (q@totalResults > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200462 q <- fetchNext(q, maxFetch = 50, randomizePageOrder = F)
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100463 example <- as.character((q@collectedMatches)$snippet[1])
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200464 out[i] <- if (matchOnly) {
465 gsub(".*<mark>(.+)</mark>.*", "\\1", example)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100466 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200467 stringr::str_replace(example, "<[^>]*>", "")
Marc Kupietz5a336b62021-11-27 17:51:35 +0100468 }
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100469 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200470 out[i] <- ""
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100471 }
Marc Kupietz5a336b62021-11-27 17:51:35 +0100472 }
473 out
474 }
475
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200476collocatesQuery <-
477 function(kco,
478 query,
479 vc = "",
480 minOccur = 5,
481 leftContextSize = 5,
482 rightContextSize = 5,
483 searchHitsSampleLimit = 20000,
484 ignoreCollocateCase = FALSE,
485 stopwords = c(),
486 ...) {
487 frequency <- NULL
488 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200489 if (q@totalResults == 0) {
490 tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200491 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200492 q <- fetchNext(q, maxFetch = searchHitsSampleLimit, randomizePageOrder = TRUE)
493 matches2FreqTable(q@collectedMatches,
494 0,
495 minOccur = minOccur,
496 leftContextSize = leftContextSize,
497 rightContextSize = rightContextSize,
498 ignoreCollocateCase = ignoreCollocateCase,
499 stopwords = stopwords,
500 ...,
501 verbose = kco@verbose
502 ) |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100503 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200504 filter(frequency >= minOccur)
505 }
506 }