blob: 0bcd691791b4128ebb917cd2549fb76636ca3b50 [file] [log] [blame]
Marc Kupietz6dfeed92025-06-03 11:58:06 +02001#' @include logging.R
2setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +02003
4#' Collocation analysis
5#'
Marc Kupietza8c40f42025-06-24 15:49:52 +02006#' @family collocation analysis functions
Marc Kupietzdbd431a2021-08-29 12:17:45 +02007#' @aliases collocationAnalysis
8#'
9#' @description
Marc Kupietzdbd431a2021-08-29 12:17:45 +020010#'
11#' Performs a collocation analysis for the given node (or query)
12#' in the given virtual corpus.
13#'
14#' @details
15#' The collocation analysis is currently implemented on the client side, as some of the
16#' functionality is not yet provided by the KorAP backend. Mainly for this reason
17#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
18#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
19#' and look for expression-internal collocates using the focus function (see examples and demo).
20#'
21#' To increase speed at the cost of accuracy and possible false negatives,
22#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
23#'
Marc Kupietze7f0d682025-02-19 10:50:59 +010024#' Note that some outdated non-DeReKo back-ends might not yet support returning tokenized matches (warning issued).
25#' In this case, the client library will fall back to client-side tokenization which might be slightly less accurate.
26#' This might lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
Marc Kupietzdbd431a2021-08-29 12:17:45 +020027#' user interface.
28#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020029#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020030#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietzdbd431a2021-08-29 12:17:45 +020031#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
32#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
33#' @param searchHitsSampleLimit limit the size of the search hits sample
34#' @param stopwords vector of stopwords not to be considered as collocates
Marc Kupietz6bd9cad2024-12-18 15:57:26 +010035#' @param withinSpan KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`.
Marc Kupietzdbd431a2021-08-29 12:17:45 +020036#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
37#' @param seed seed for random page collecting order
Marc Kupietz67edcb52021-09-20 21:54:24 +020038#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
Marc Kupietz7d400e02021-12-19 16:39:36 +010039#' @param maxRecurse apply collocation analysis recursively `maxRecurse` times
40#' @param addExamples If TRUE, examples for instances of collocations will be added in a column `example`. This makes a difference in particular if `node` is given as a lemma query.
41#' @param thresholdScore association score function (see \code{\link{association-score-functions}}) to use for computing the threshold that is applied for recursive collocation analysis calls
42#' @param threshold minimum value of `thresholdScore` function call to apply collocation analysis recursively
43#' @param localStopwords vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010044#' @param collocateFilterRegex allow only collocates matching the regular expression
Marc Kupietz67edcb52021-09-20 21:54:24 +020045#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietzdbd431a2021-08-29 12:17:45 +020046#' @inheritParams collocationScoreQuery,KorAPConnection-method
47#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
48#'
Marc Kupietz2b17b212023-08-27 17:47:26 +020049#' @importFrom dplyr arrange desc slice_head bind_rows
Marc Kupietzdbd431a2021-08-29 12:17:45 +020050#' @importFrom purrr pmap
51#' @importFrom tidyr expand_grid
52#'
53#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020054#' \dontrun{
55#'
Marc Kupietz6dfeed92025-06-03 11:58:06 +020056#' # Find top collocates of "Packung" inside and outside the sports domain.
57#' KorAPConnection(verbose = TRUE) |>
58#' collocationAnalysis("Packung",
59#' vc = c("textClass=sport", "textClass!=sport"),
60#' leftContextSize = 1, rightContextSize = 1, topCollocatesLimit = 20
61#' ) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020062#' dplyr::filter(logDice >= 5)
63#' }
64#'
Marc Kupietz6ae76052021-09-21 10:34:00 +020065#' \dontrun{
66#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020067#' # Identify the most prominent light verb construction with "in ... setzen".
68#' # Note that, currently, the use of focus function disallows exactFrequencies.
Marc Kupietz4cd066d2025-02-28 15:48:23 +010069#' KorAPConnection(verbose = TRUE) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +020070#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
Marc Kupietz6dfeed92025-06-03 11:58:06 +020071#' leftContextSize = 1, rightContextSize = 0, exactFrequencies = FALSE, topCollocatesLimit = 20
72#' )
Marc Kupietzdbd431a2021-08-29 12:17:45 +020073#' }
74#'
75#' @export
Marc Kupietz6dfeed92025-06-03 11:58:06 +020076setMethod(
77 "collocationAnalysis", "KorAPConnection",
78 function(kco,
79 node,
80 vc = "",
81 lemmatizeNodeQuery = FALSE,
82 minOccur = 5,
83 leftContextSize = 5,
84 rightContextSize = 5,
85 topCollocatesLimit = 200,
86 searchHitsSampleLimit = 20000,
87 ignoreCollocateCase = FALSE,
88 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
89 exactFrequencies = TRUE,
90 stopwords = append(RKorAPClient::synsemanticStopwords(), node),
91 seed = 7,
92 expand = length(vc) != length(node),
93 maxRecurse = 0,
94 addExamples = FALSE,
95 thresholdScore = "logDice",
96 threshold = 2.0,
97 localStopwords = c(),
98 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
99 ...) {
100 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
101 word <- frequency <- O <- NULL
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200102
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200103 if (!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan) > 0)) {
104 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
105 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200106
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200107 warnIfNotAuthorized(kco)
Marc Kupietz581a29b2021-09-04 20:51:04 +0200108
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200109 if (lemmatizeNodeQuery) {
110 node <- lemmatizeWordQuery(node)
111 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200112
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200113 result <- if (length(node) > 1 || length(vc) > 1) {
114 grid <- if (expand) expand_grid(node = node, vc = vc) else tibble(node = node, vc = vc)
115 purrr::pmap(grid, function(node, vc, ...) {
116 collocationAnalysis(kco,
117 node = node,
118 vc = vc,
119 minOccur = minOccur,
120 leftContextSize = leftContextSize,
121 rightContextSize = rightContextSize,
122 topCollocatesLimit = topCollocatesLimit,
123 searchHitsSampleLimit = searchHitsSampleLimit,
124 ignoreCollocateCase = ignoreCollocateCase,
125 withinSpan = withinSpan,
126 exactFrequencies = exactFrequencies,
127 stopwords = stopwords,
128 addExamples = TRUE,
129 localStopwords = localStopwords,
130 seed = seed,
131 expand = expand,
132 ...
133 )
134 }) |>
Marc Kupietz4a2fee02025-10-13 13:27:09 +0200135 bind_rows() |>
136 mutate(label = queryStringToLabel(vc))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200137 } else {
138 set.seed(seed)
139 candidates <- collocatesQuery(
140 kco,
141 node,
142 vc = vc,
143 minOccur = minOccur,
144 leftContextSize = leftContextSize,
145 rightContextSize = rightContextSize,
146 searchHitsSampleLimit = searchHitsSampleLimit,
147 ignoreCollocateCase = ignoreCollocateCase,
148 stopwords = append(stopwords, localStopwords),
149 ...
150 )
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200151
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200152 if (nrow(candidates) > 0) {
153 candidates <- candidates |>
154 filter(frequency >= minOccur) |>
155 slice_head(n = topCollocatesLimit)
156 collocationScoreQuery(
157 kco,
158 node = node,
159 collocate = candidates$word,
160 vc = vc,
161 leftContextSize = leftContextSize,
162 rightContextSize = rightContextSize,
163 observed = if (exactFrequencies) NA else candidates$frequency,
164 ignoreCollocateCase = ignoreCollocateCase,
165 withinSpan = withinSpan,
166 ...
167 ) |>
168 filter(O >= minOccur) |>
169 dplyr::arrange(dplyr::desc(logDice))
170 } else {
171 tibble()
172 }
173 }
174 if (maxRecurse > 0 & length(result) > 0 && any(!!thresholdScore >= threshold)) {
175 recurseWith <- result |>
176 filter(!!as.name(thresholdScore) >= threshold)
177 result <- collocationAnalysis(
178 kco,
179 node = paste0("(", buildCollocationQuery(
180 removeWithinSpan(recurseWith$node, withinSpan),
181 recurseWith$collocate,
182 leftContextSize = leftContextSize,
183 rightContextSize = rightContextSize,
184 withinSpan = ""
185 ), ")"),
186 vc = vc,
187 minOccur = minOccur,
188 leftContextSize = leftContextSize,
189 rightContextSize = rightContextSize,
190 withinSpan = withinSpan,
191 maxRecurse = maxRecurse - 1,
192 stopwords = stopwords,
193 localStopwords = recurseWith$collocate,
194 exactFrequencies = exactFrequencies,
195 searchHitsSampleLimit = searchHitsSampleLimit,
196 topCollocatesLimit = topCollocatesLimit,
197 addExamples = FALSE
198 ) |>
199 bind_rows(result) |>
200 filter(logDice >= 2) |>
Marc Kupietze9e18bd2025-06-04 17:15:02 +0200201 filter(O >= minOccur) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200202 dplyr::arrange(dplyr::desc(logDice))
203 }
204 if (addExamples && length(result) > 0) {
205 result$query <- buildCollocationQuery(
206 result$node,
207 result$collocate,
208 leftContextSize = leftContextSize,
209 rightContextSize = rightContextSize,
210 withinSpan = withinSpan
211 )
212 result$example <- findExample(
213 kco,
214 query = result$query,
215 vc = result$vc
216 )
217 }
218 result
219 }
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200220)
221
Marc Kupietz76b05592021-12-19 16:26:15 +0100222# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100223removeWithinSpan <- function(query, withinSpan) {
224 if (withinSpan == "") {
225 return(query)
226 }
227 needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200228 res <- gsub(needle, "\\1", query)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100229 needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200230 res <- gsub(needle, "\\1", res)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100231 return(res)
232}
233
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200234#' @importFrom magrittr debug_pipe
Marc Kupietz2b17b212023-08-27 17:47:26 +0200235#' @importFrom stringr str_detect
236#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
237#'
238matches2FreqTable <- function(matches,
239 index = 0,
240 minOccur = 5,
241 leftContextSize = 5,
242 rightContextSize = 5,
243 ignoreCollocateCase = FALSE,
244 stopwords = c(),
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200245 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietz2b17b212023-08-27 17:47:26 +0200246 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
247 verbose = TRUE) {
248 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
249 frequency <- NULL
250
251 if (nrow(matches) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200252 dplyr::tibble(word = c(), frequency = c())
Marc Kupietz2b17b212023-08-27 17:47:26 +0200253 } else if (index == 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200254 if (!"tokens" %in% colnames(matches) || !is.list(matches$tokens)) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200255 log_info(verbose, "Outdated KorAP server: Falling back to client side tokenization.\n")
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200256 return(snippet2FreqTable(matches$snippet, minOccur, leftContextSize, rightContextSize,
257 ignoreCollocateCase = ignoreCollocateCase,
258 stopwords = stopwords, oldTable = oldTable, verbose = verbose
259 ))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200260 }
261 log_info(verbose, paste("Joining", nrow(matches), "kwics\n"))
Marc Kupietza25fbd92025-10-14 17:38:09 +0200262 for (i in seq_len(nrow(matches))) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200263 oldTable <- matches2FreqTable(
264 matches,
265 i,
266 leftContextSize = leftContextSize,
267 rightContextSize = rightContextSize,
268 collocateFilterRegex = collocateFilterRegex,
269 oldTable = oldTable,
270 stopwords = stopwords
271 )
272 }
273 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200274 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100275 group_by(word) |>
276 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200277 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +0200278 arrange(desc(frequency))
279 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200280 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietz2b17b212023-08-27 17:47:26 +0200281
282 left <- tail(unlist(matches$tokens$left[index]), leftContextSize)
283
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200284 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200285
286 right <- head(unlist(matches$tokens$right[index]), rightContextSize)
287
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200288 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietz2b17b212023-08-27 17:47:26 +0200289
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200290 if (length(left) + length(right) == 0) {
Marc Kupietz2b17b212023-08-27 17:47:26 +0200291 oldTable
292 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100293 table(c(left, right)) |>
294 dplyr::as_tibble(.name_repair = "minimal") |>
295 dplyr::rename(word = 1, frequency = 2) |>
296 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200297 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietz2b17b212023-08-27 17:47:26 +0200298 dplyr::bind_rows(oldTable)
299 }
300 }
301}
302
303#' @importFrom magrittr debug_pipe
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200304#' @importFrom stringr str_match str_split str_detect
305#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
306#'
307snippet2FreqTable <- function(snippet,
308 minOccur = 5,
309 leftContextSize = 5,
310 rightContextSize = 5,
311 ignoreCollocateCase = FALSE,
312 stopwords = c(),
313 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200314 collocateFilterRegex = "^[:alnum:]+-?[:alnum:]*$",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200315 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
316 verbose = TRUE) {
317 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
318 frequency <- NULL
319
320 if (length(snippet) < 1) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200321 dplyr::tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200322 } else if (length(snippet) > 1) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200323 log_info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200324 for (s in snippet) {
325 oldTable <- snippet2FreqTable(
326 s,
327 leftContextSize = leftContextSize,
328 rightContextSize = rightContextSize,
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100329 collocateFilterRegex = collocateFilterRegex,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200330 oldTable = oldTable,
331 stopwords = stopwords
332 )
333 }
Marc Kupietza47d1502023-04-18 15:26:47 +0200334 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200335 oldTable |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100336 group_by(word) |>
337 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200338 summarise(frequency = sum(frequency), .groups = "drop") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200339 arrange(desc(frequency))
340 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200341 stopwordsTable <- dplyr::tibble(word = stopwords)
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200342 match <-
343 str_match(
344 snippet,
345 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
346 )
347
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200348 left <- if (leftContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200349 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200350 } else {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200351 ""
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200352 }
353 # cat(paste("left:", left, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200354
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200355 right <- if (rightContextSize > 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200356 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200357 } else {
358 ""
359 }
360 # cat(paste("right:", right, "\n", collapse=" "))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200361
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200362 if (is.na(left[1]) || is.na(right[1]) || length(left) + length(right) == 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200363 oldTable
364 } else {
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100365 table(c(left, right)) |>
366 dplyr::as_tibble(.name_repair = "minimal") |>
367 dplyr::rename(word = 1, frequency = 2) |>
368 dplyr::filter(str_detect(word, collocateFilterRegex)) |>
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200369 dplyr::anti_join(stopwordsTable, by = "word") |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200370 dplyr::bind_rows(oldTable)
371 }
372 }
373}
374
375#' Preliminary synsemantic stopwords function
376#'
377#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +0200378#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200379#'
380#' Preliminary synsemantic stopwords function to be used in collocation analysis.
381#'
382#' @details
383#' Currently only suitable for German. See stopwords package for other languages.
384#'
385#' @param ... future arguments for language detection
386#'
387#' @family collocation analysis functions
388#' @return Vector of synsemantic stopwords.
389#' @export
390synsemanticStopwords <- function(...) {
391 res <- c(
392 "der",
393 "die",
394 "und",
395 "in",
396 "den",
397 "von",
398 "mit",
399 "das",
400 "zu",
401 "im",
402 "ist",
403 "auf",
404 "sich",
405 "Die",
406 "des",
407 "dem",
408 "nicht",
409 "ein",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100410 "Ein",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200411 "eine",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100412 "Eine",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200413 "es",
414 "auch",
415 "an",
416 "als",
417 "am",
418 "aus",
419 "Der",
420 "bei",
421 "er",
422 "dass",
423 "sie",
424 "nach",
425 "um",
426 "Das",
427 "zum",
428 "noch",
429 "war",
430 "einen",
431 "einer",
432 "wie",
433 "einem",
434 "vor",
435 "bis",
436 "\u00fcber",
437 "so",
438 "aber",
439 "Eine",
440 "diese",
441 "Diese",
442 "oder"
443 )
444 return(res)
445}
446
Marc Kupietz5a336b62021-11-27 17:51:35 +0100447
Marc Kupietz76b05592021-12-19 16:26:15 +0100448# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100449findExample <-
450 function(kco,
451 query,
452 vc = "",
453 matchOnly = TRUE) {
454 out <- character(length = length(query))
455
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200456 if (length(vc) < length(query)) {
Marc Kupietz5a336b62021-11-27 17:51:35 +0100457 vc <- rep(vc, length(query))
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200458 }
Marc Kupietz5a336b62021-11-27 17:51:35 +0100459
460 for (i in seq_along(query)) {
461 q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100462 if (q@totalResults > 0) {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200463 q <- fetchNext(q, maxFetch = 50, randomizePageOrder = F)
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100464 example <- as.character((q@collectedMatches)$snippet[1])
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200465 out[i] <- if (matchOnly) {
466 gsub(".*<mark>(.+)</mark>.*", "\\1", example)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100467 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200468 stringr::str_replace(example, "<[^>]*>", "")
Marc Kupietz5a336b62021-11-27 17:51:35 +0100469 }
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100470 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200471 out[i] <- ""
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100472 }
Marc Kupietz5a336b62021-11-27 17:51:35 +0100473 }
474 out
475 }
476
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200477collocatesQuery <-
478 function(kco,
479 query,
480 vc = "",
481 minOccur = 5,
482 leftContextSize = 5,
483 rightContextSize = 5,
484 searchHitsSampleLimit = 20000,
485 ignoreCollocateCase = FALSE,
486 stopwords = c(),
487 ...) {
488 frequency <- NULL
489 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200490 if (q@totalResults == 0) {
491 tibble(word = c(), frequency = c())
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200492 } else {
Marc Kupietz6dfeed92025-06-03 11:58:06 +0200493 q <- fetchNext(q, maxFetch = searchHitsSampleLimit, randomizePageOrder = TRUE)
494 matches2FreqTable(q@collectedMatches,
495 0,
496 minOccur = minOccur,
497 leftContextSize = leftContextSize,
498 rightContextSize = rightContextSize,
499 ignoreCollocateCase = ignoreCollocateCase,
500 stopwords = stopwords,
501 ...,
502 verbose = kco@verbose
503 ) |>
Marc Kupietz4cd066d2025-02-28 15:48:23 +0100504 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) |>
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200505 filter(frequency >= minOccur)
506 }
507 }