blob: b79a2b37b017a4c72741e752ff3ea0521fb16f85 [file] [log] [blame]
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis") )
2
3#' Collocation analysis
4#'
5#' @aliases collocationAnalysis
6#'
7#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +02008#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +02009#'
10#' Performs a collocation analysis for the given node (or query)
11#' in the given virtual corpus.
12#'
13#' @details
14#' The collocation analysis is currently implemented on the client side, as some of the
15#' functionality is not yet provided by the KorAP backend. Mainly for this reason
16#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
17#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
18#' and look for expression-internal collocates using the focus function (see examples and demo).
19#'
20#' To increase speed at the cost of accuracy and possible false negatives,
21#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
22#'
23#' Note that currently not the tokenization provided by the backend, i.e. the corpus itself, is used, but a tinkered one.
24#' This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
25#' user interface.
26#'
27#' @family collocation analysis functions
28#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020029#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietzdbd431a2021-08-29 12:17:45 +020030#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
31#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
32#' @param searchHitsSampleLimit limit the size of the search hits sample
33#' @param stopwords vector of stopwords not to be considered as collocates
Marc Kupietz6bd9cad2024-12-18 15:57:26 +010034#' @param withinSpan KorAP span specification (see <https://korap.ids-mannheim.de/doc/ql/poliqarp-plus?embedded=true#spans>) for collocations to be searched within. Defaults to `base/s=s`.
Marc Kupietzdbd431a2021-08-29 12:17:45 +020035#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
36#' @param seed seed for random page collecting order
Marc Kupietz67edcb52021-09-20 21:54:24 +020037#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
Marc Kupietz7d400e02021-12-19 16:39:36 +010038#' @param maxRecurse apply collocation analysis recursively `maxRecurse` times
39#' @param addExamples If TRUE, examples for instances of collocations will be added in a column `example`. This makes a difference in particular if `node` is given as a lemma query.
40#' @param thresholdScore association score function (see \code{\link{association-score-functions}}) to use for computing the threshold that is applied for recursive collocation analysis calls
41#' @param threshold minimum value of `thresholdScore` function call to apply collocation analysis recursively
42#' @param localStopwords vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010043#' @param collocateFilterRegex allow only collocates matching the regular expression
Marc Kupietz67edcb52021-09-20 21:54:24 +020044#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietzdbd431a2021-08-29 12:17:45 +020045#' @inheritParams collocationScoreQuery,KorAPConnection-method
46#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
47#'
Marc Kupietz2b17b212023-08-27 17:47:26 +020048#' @importFrom dplyr arrange desc slice_head bind_rows
Marc Kupietzdbd431a2021-08-29 12:17:45 +020049#' @importFrom purrr pmap
50#' @importFrom tidyr expand_grid
51#'
52#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020053#' \dontrun{
54#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020055#' # Find top collocates of "Packung" inside and outside the sports domain.
56#' new("KorAPConnection", verbose = TRUE) %>%
57#' collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"),
58#' leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) %>%
59#' dplyr::filter(logDice >= 5)
60#' }
61#'
Marc Kupietz6ae76052021-09-21 10:34:00 +020062#' \dontrun{
63#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020064#' # Identify the most prominent light verb construction with "in ... setzen".
65#' # Note that, currently, the use of focus function disallows exactFrequencies.
66#' new("KorAPConnection", verbose = TRUE) %>%
67#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
68#' leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20)
69#' }
70#'
71#' @export
72setMethod("collocationAnalysis", "KorAPConnection",
73 function(kco,
74 node,
75 vc = "",
76 lemmatizeNodeQuery = FALSE,
77 minOccur = 5,
78 leftContextSize = 5,
79 rightContextSize = 5,
80 topCollocatesLimit = 200,
81 searchHitsSampleLimit = 20000,
82 ignoreCollocateCase = FALSE,
83 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
84 exactFrequencies = TRUE,
Marc Kupietz6505ccf2021-11-27 17:46:25 +010085 stopwords = append(RKorAPClient::synsemanticStopwords(), node),
Marc Kupietzdbd431a2021-08-29 12:17:45 +020086 seed = 7,
87 expand = length(vc) != length(node),
Marc Kupietz5a336b62021-11-27 17:51:35 +010088 maxRecurse = 0,
Marc Kupietzdadfd912021-12-22 12:48:20 +010089 addExamples = FALSE,
Marc Kupietz419f21f2021-12-07 10:27:30 +010090 thresholdScore = "logDice",
91 threshold = 2.0,
Marc Kupietz5a336b62021-11-27 17:51:35 +010092 localStopwords = c(),
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010093 collocateFilterRegex = '^[:alnum:]+-?[:alnum:]*$',
Marc Kupietzdbd431a2021-08-29 12:17:45 +020094 ...) {
95 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
96 word <- frequency <- NULL
97
98 if(!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan)>0 )) {
99 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
100 }
101
Marc Kupietz581a29b2021-09-04 20:51:04 +0200102 warnIfNoAccessToken(kco)
103
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200104 if (lemmatizeNodeQuery) {
105 node <- lemmatizeWordQuery(node)
106 }
107
Marc Kupietz5a336b62021-11-27 17:51:35 +0100108 result <- if (length(node) > 1 || length(vc) > 1) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200109 grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc)
110 purrr::pmap(grid, function(node, vc, ...)
111 collocationAnalysis(kco,
112 node =node,
113 vc = vc,
114 minOccur = minOccur,
115 leftContextSize = leftContextSize,
116 rightContextSize = rightContextSize,
117 topCollocatesLimit = topCollocatesLimit,
118 searchHitsSampleLimit = searchHitsSampleLimit,
119 ignoreCollocateCase = ignoreCollocateCase,
120 withinSpan = withinSpan,
121 exactFrequencies = exactFrequencies,
122 stopwords = stopwords,
Marc Kupietz5a336b62021-11-27 17:51:35 +0100123 addExamples = TRUE,
124 localStopwords = localStopwords,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200125 seed = seed,
126 expand = expand,
127 ...) ) %>%
128 bind_rows()
129 } else {
130 set.seed(seed)
131 candidates <- collocatesQuery(
132 kco,
133 node,
134 vc = vc,
135 minOccur = minOccur,
136 leftContextSize = leftContextSize,
137 rightContextSize = rightContextSize,
138 searchHitsSampleLimit = searchHitsSampleLimit,
139 ignoreCollocateCase = ignoreCollocateCase,
Marc Kupietz5a336b62021-11-27 17:51:35 +0100140 stopwords = append(stopwords, localStopwords),
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200141 ...
142 )
143
144 if (nrow(candidates) > 0) {
145 candidates <- candidates %>%
146 filter(frequency >= minOccur) %>%
Marc Kupietz23004c62022-09-06 10:55:28 +0200147 slice_head(n=topCollocatesLimit)
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200148 collocationScoreQuery(
149 kco,
150 node = node,
151 collocate = candidates$word,
152 vc = vc,
153 leftContextSize = leftContextSize,
154 rightContextSize = rightContextSize,
155 observed = if (exactFrequencies) NA else candidates$frequency,
156 ignoreCollocateCase = ignoreCollocateCase,
157 withinSpan = withinSpan,
158 ...
159 ) %>%
160 filter(.$O >= minOccur) %>%
161 dplyr::arrange(dplyr::desc(logDice))
162 } else {
163 tibble()
164 }
165 }
Marc Kupietz1fe94102024-04-29 17:23:45 +0200166 if (maxRecurse > 0 & length(result) > 0 && any(!!thresholdScore >= threshold)) {
Marc Kupietz5a336b62021-11-27 17:51:35 +0100167 recurseWith <- result %>%
Marc Kupietz419f21f2021-12-07 10:27:30 +0100168 filter(!!as.name(thresholdScore) >= threshold)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100169 result <- collocationAnalysis(
170 kco,
171 node = paste0("(", buildCollocationQuery(
172 removeWithinSpan(recurseWith$node, withinSpan),
173 recurseWith$collocate,
174 leftContextSize = leftContextSize,
175 rightContextSize = rightContextSize,
176 withinSpan = ""
177 ), ")"),
178 vc = vc,
179 minOccur = minOccur,
180 leftContextSize = leftContextSize,
181 rightContextSize = rightContextSize,
182 withinSpan = withinSpan,
183 maxRecurse = maxRecurse - 1,
184 stopwords = stopwords,
185 localStopwords = recurseWith$collocate,
186 exactFrequencies = exactFrequencies,
187 searchHitsSampleLimit = searchHitsSampleLimit,
188 topCollocatesLimit = topCollocatesLimit,
189 addExamples = FALSE
190 ) %>%
191 bind_rows(result) %>%
192 filter(logDice >= 2) %>%
193 filter(.$O >= minOccur) %>%
194 dplyr::arrange(dplyr::desc(logDice))
195 }
196 if (addExamples && length(result) > 0) {
Marc Kupietz1678c3a2021-12-07 10:24:49 +0100197 result$query <-buildCollocationQuery(
Marc Kupietz5a336b62021-11-27 17:51:35 +0100198 result$node,
199 result$collocate,
200 leftContextSize = leftContextSize,
201 rightContextSize = rightContextSize,
Marc Kupietz90189652023-04-18 08:01:37 +0200202 withinSpan = withinSpan
Marc Kupietz5a336b62021-11-27 17:51:35 +0100203 )
204 result$example <- findExample(
205 kco,
Marc Kupietz1678c3a2021-12-07 10:24:49 +0100206 query = result$query,
Marc Kupietz5a336b62021-11-27 17:51:35 +0100207 vc = result$vc
208 )
209 }
210 result
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200211 }
212)
213
Marc Kupietz76b05592021-12-19 16:26:15 +0100214# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100215removeWithinSpan <- function(query, withinSpan) {
216 if (withinSpan == "") {
217 return(query)
218 }
219 needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
220 res <- gsub(needle, '\\1', query)
221 needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
222 res <- gsub(needle, '\\1', res)
223 return(res)
224}
225
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200226#' @importFrom magrittr debug_pipe
Marc Kupietz2b17b212023-08-27 17:47:26 +0200227#' @importFrom stringr str_detect
228#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
229#'
230matches2FreqTable <- function(matches,
231 index = 0,
232 minOccur = 5,
233 leftContextSize = 5,
234 rightContextSize = 5,
235 ignoreCollocateCase = FALSE,
236 stopwords = c(),
237 collocateFilterRegex = '^[:alnum:]+-?[:alnum:]*$',
238 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
239 verbose = TRUE) {
240 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
241 frequency <- NULL
242
243 if (nrow(matches) < 1) {
244 dplyr::tibble(word=c(), frequency=c())
245 } else if (index == 0) {
246 if (! "tokens" %in% colnames(matches) || ! is.list(matches$tokens)) {
247 log_info(verbose, "Outdated KorAP server: Falling back to client side tokenization.\n")
248 return(snippet2FreqTable(matches$snippet, minOccur, leftContextSize, rightContextSize, ignoreCollocateCase = ignoreCollocateCase,
249 stopwords = stopwords, oldTable = oldTable, verbose = verbose))
250
251 }
252 log_info(verbose, paste("Joining", nrow(matches), "kwics\n"))
253 for (i in 1:nrow(matches)) {
254 oldTable <- matches2FreqTable(
255 matches,
256 i,
257 leftContextSize = leftContextSize,
258 rightContextSize = rightContextSize,
259 collocateFilterRegex = collocateFilterRegex,
260 oldTable = oldTable,
261 stopwords = stopwords
262 )
263 }
264 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
265 oldTable %>%
266 group_by(word) %>%
267 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>%
268 summarise(frequency=sum(frequency), .groups = "drop") %>%
269 arrange(desc(frequency))
270 } else {
271 stopwordsTable <- dplyr::tibble(word=stopwords)
272
273 left <- tail(unlist(matches$tokens$left[index]), leftContextSize)
274
275# cat(paste("left:", left, "\n", collapse=" "))
276
277 right <- head(unlist(matches$tokens$right[index]), rightContextSize)
278
279# cat(paste("right:", right, "\n", collapse=" "))
280
281 if(length(left) + length(right) == 0) {
282 oldTable
283 } else {
284 table(c(left, right)) %>%
285 dplyr::as_tibble(.name_repair = "minimal") %>%
286 dplyr::rename(word = 1, frequency = 2) %>%
287 dplyr::filter(str_detect(word, collocateFilterRegex)) %>%
288 dplyr::anti_join(stopwordsTable, by="word") %>%
289 dplyr::bind_rows(oldTable)
290 }
291 }
292}
293
294#' @importFrom magrittr debug_pipe
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200295#' @importFrom stringr str_match str_split str_detect
296#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
297#'
298snippet2FreqTable <- function(snippet,
299 minOccur = 5,
300 leftContextSize = 5,
301 rightContextSize = 5,
302 ignoreCollocateCase = FALSE,
303 stopwords = c(),
304 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100305 collocateFilterRegex = '^[:alnum:]+-?[:alnum:]*$',
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200306 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
307 verbose = TRUE) {
308 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
309 frequency <- NULL
310
311 if (length(snippet) < 1) {
312 dplyr::tibble(word=c(), frequency=c())
313 } else if (length(snippet) > 1) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200314 log_info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200315 for (s in snippet) {
316 oldTable <- snippet2FreqTable(
317 s,
318 leftContextSize = leftContextSize,
319 rightContextSize = rightContextSize,
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100320 collocateFilterRegex = collocateFilterRegex,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200321 oldTable = oldTable,
322 stopwords = stopwords
323 )
324 }
Marc Kupietza47d1502023-04-18 15:26:47 +0200325 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200326 oldTable %>%
327 group_by(word) %>%
328 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>%
329 summarise(frequency=sum(frequency), .groups = "drop") %>%
330 arrange(desc(frequency))
331 } else {
332 stopwordsTable <- dplyr::tibble(word=stopwords)
333 match <-
334 str_match(
335 snippet,
336 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
337 )
338
339 left <- if(leftContextSize > 0)
340 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
341 else
342 ""
343# cat(paste("left:", left, "\n", collapse=" "))
344
345 right <- if(rightContextSize > 0)
346 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
347 else
348 ""
349# cat(paste("right:", right, "\n", collapse=" "))
350
Marc Kupietz21134402023-05-09 17:57:23 +0200351 if(is.na(left[1]) || is.na(right[1]) || length(left) + length(right) == 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200352 oldTable
353 } else {
354 table(c(left, right)) %>%
355 dplyr::as_tibble(.name_repair = "minimal") %>%
356 dplyr::rename(word = 1, frequency = 2) %>%
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100357 dplyr::filter(str_detect(word, collocateFilterRegex)) %>%
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200358 dplyr::anti_join(stopwordsTable, by="word") %>%
359 dplyr::bind_rows(oldTable)
360 }
361 }
362}
363
364#' Preliminary synsemantic stopwords function
365#'
366#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +0200367#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200368#'
369#' Preliminary synsemantic stopwords function to be used in collocation analysis.
370#'
371#' @details
372#' Currently only suitable for German. See stopwords package for other languages.
373#'
374#' @param ... future arguments for language detection
375#'
376#' @family collocation analysis functions
377#' @return Vector of synsemantic stopwords.
378#' @export
379synsemanticStopwords <- function(...) {
380 res <- c(
381 "der",
382 "die",
383 "und",
384 "in",
385 "den",
386 "von",
387 "mit",
388 "das",
389 "zu",
390 "im",
391 "ist",
392 "auf",
393 "sich",
394 "Die",
395 "des",
396 "dem",
397 "nicht",
398 "ein",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100399 "Ein",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200400 "eine",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100401 "Eine",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200402 "es",
403 "auch",
404 "an",
405 "als",
406 "am",
407 "aus",
408 "Der",
409 "bei",
410 "er",
411 "dass",
412 "sie",
413 "nach",
414 "um",
415 "Das",
416 "zum",
417 "noch",
418 "war",
419 "einen",
420 "einer",
421 "wie",
422 "einem",
423 "vor",
424 "bis",
425 "\u00fcber",
426 "so",
427 "aber",
428 "Eine",
429 "diese",
430 "Diese",
431 "oder"
432 )
433 return(res)
434}
435
Marc Kupietz5a336b62021-11-27 17:51:35 +0100436
Marc Kupietz76b05592021-12-19 16:26:15 +0100437# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100438findExample <-
439 function(kco,
440 query,
441 vc = "",
442 matchOnly = TRUE) {
443 out <- character(length = length(query))
444
445 if (length(vc) < length(query))
446 vc <- rep(vc, length(query))
447
448 for (i in seq_along(query)) {
449 q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100450 if (q@totalResults > 0) {
451 q <- fetchNext(q, maxFetch=50, randomizePageOrder=F)
452 example <- as.character((q@collectedMatches)$snippet[1])
453 out[i] <- if(matchOnly) {
Marc Kupietz5a336b62021-11-27 17:51:35 +0100454 gsub('.*<mark>(.+)</mark>.*', '\\1', example)
455 } else {
456 stringr::str_replace(example, '<[^>]*>', '')
457 }
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100458 } else {
459 out[i] = ""
460 }
Marc Kupietz5a336b62021-11-27 17:51:35 +0100461 }
462 out
463 }
464
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200465collocatesQuery <-
466 function(kco,
467 query,
468 vc = "",
469 minOccur = 5,
470 leftContextSize = 5,
471 rightContextSize = 5,
472 searchHitsSampleLimit = 20000,
473 ignoreCollocateCase = FALSE,
474 stopwords = c(),
475 ...) {
476 frequency <- NULL
477 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
478 if(q@totalResults == 0) {
479 tibble(word=c(), frequency=c())
480 } else {
481 q <- fetchNext(q, maxFetch=searchHitsSampleLimit, randomizePageOrder=TRUE)
Marc Kupietz2b17b212023-08-27 17:47:26 +0200482 matches2FreqTable (q@collectedMatches,
483 0,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200484 minOccur = minOccur,
485 leftContextSize = leftContextSize,
486 rightContextSize = rightContextSize,
487 ignoreCollocateCase = ignoreCollocateCase,
488 stopwords = stopwords,
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100489 ...,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200490 verbose = kco@verbose) %>%
491 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) %>%
492 filter(frequency >= minOccur)
493 }
494 }