blob: 64e0c836060c1faf31db8aaa6d8dc98e9f8e6555 [file] [log] [blame]
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis") )
2
3#' Collocation analysis
4#'
5#' @aliases collocationAnalysis
6#'
7#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +02008#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +02009#'
10#' Performs a collocation analysis for the given node (or query)
11#' in the given virtual corpus.
12#'
13#' @details
14#' The collocation analysis is currently implemented on the client side, as some of the
15#' functionality is not yet provided by the KorAP backend. Mainly for this reason
16#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
17#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
18#' and look for expression-internal collocates using the focus function (see examples and demo).
19#'
20#' To increase speed at the cost of accuracy and possible false negatives,
21#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
22#'
23#' Note that currently not the tokenization provided by the backend, i.e. the corpus itself, is used, but a tinkered one.
24#' This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
25#' user interface.
26#'
27#' @family collocation analysis functions
28#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020029#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietzdbd431a2021-08-29 12:17:45 +020030#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
31#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
32#' @param searchHitsSampleLimit limit the size of the search hits sample
33#' @param stopwords vector of stopwords not to be considered as collocates
34#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
35#' @param seed seed for random page collecting order
Marc Kupietz67edcb52021-09-20 21:54:24 +020036#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
Marc Kupietz7d400e02021-12-19 16:39:36 +010037#' @param maxRecurse apply collocation analysis recursively `maxRecurse` times
38#' @param addExamples If TRUE, examples for instances of collocations will be added in a column `example`. This makes a difference in particular if `node` is given as a lemma query.
39#' @param thresholdScore association score function (see \code{\link{association-score-functions}}) to use for computing the threshold that is applied for recursive collocation analysis calls
40#' @param threshold minimum value of `thresholdScore` function call to apply collocation analysis recursively
41#' @param localStopwords vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010042#' @param collocateFilterRegex allow only collocates matching the regular expression
Marc Kupietz67edcb52021-09-20 21:54:24 +020043#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietzdbd431a2021-08-29 12:17:45 +020044#' @inheritParams collocationScoreQuery,KorAPConnection-method
45#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
46#'
Marc Kupietz2b17b212023-08-27 17:47:26 +020047#' @importFrom dplyr arrange desc slice_head bind_rows
Marc Kupietzdbd431a2021-08-29 12:17:45 +020048#' @importFrom purrr pmap
49#' @importFrom tidyr expand_grid
50#'
51#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020052#' \dontrun{
53#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020054#' # Find top collocates of "Packung" inside and outside the sports domain.
55#' new("KorAPConnection", verbose = TRUE) %>%
56#' collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"),
57#' leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) %>%
58#' dplyr::filter(logDice >= 5)
59#' }
60#'
Marc Kupietz6ae76052021-09-21 10:34:00 +020061#' \dontrun{
62#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020063#' # Identify the most prominent light verb construction with "in ... setzen".
64#' # Note that, currently, the use of focus function disallows exactFrequencies.
65#' new("KorAPConnection", verbose = TRUE) %>%
66#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
67#' leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20)
68#' }
69#'
70#' @export
71setMethod("collocationAnalysis", "KorAPConnection",
72 function(kco,
73 node,
74 vc = "",
75 lemmatizeNodeQuery = FALSE,
76 minOccur = 5,
77 leftContextSize = 5,
78 rightContextSize = 5,
79 topCollocatesLimit = 200,
80 searchHitsSampleLimit = 20000,
81 ignoreCollocateCase = FALSE,
82 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
83 exactFrequencies = TRUE,
Marc Kupietz6505ccf2021-11-27 17:46:25 +010084 stopwords = append(RKorAPClient::synsemanticStopwords(), node),
Marc Kupietzdbd431a2021-08-29 12:17:45 +020085 seed = 7,
86 expand = length(vc) != length(node),
Marc Kupietz5a336b62021-11-27 17:51:35 +010087 maxRecurse = 0,
Marc Kupietzdadfd912021-12-22 12:48:20 +010088 addExamples = FALSE,
Marc Kupietz419f21f2021-12-07 10:27:30 +010089 thresholdScore = "logDice",
90 threshold = 2.0,
Marc Kupietz5a336b62021-11-27 17:51:35 +010091 localStopwords = c(),
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010092 collocateFilterRegex = '^[:alnum:]+-?[:alnum:]*$',
Marc Kupietzdbd431a2021-08-29 12:17:45 +020093 ...) {
94 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
95 word <- frequency <- NULL
96
97 if(!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan)>0 )) {
98 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
99 }
100
Marc Kupietz581a29b2021-09-04 20:51:04 +0200101 warnIfNoAccessToken(kco)
102
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200103 if (lemmatizeNodeQuery) {
104 node <- lemmatizeWordQuery(node)
105 }
106
Marc Kupietz5a336b62021-11-27 17:51:35 +0100107 result <- if (length(node) > 1 || length(vc) > 1) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200108 grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc)
109 purrr::pmap(grid, function(node, vc, ...)
110 collocationAnalysis(kco,
111 node =node,
112 vc = vc,
113 minOccur = minOccur,
114 leftContextSize = leftContextSize,
115 rightContextSize = rightContextSize,
116 topCollocatesLimit = topCollocatesLimit,
117 searchHitsSampleLimit = searchHitsSampleLimit,
118 ignoreCollocateCase = ignoreCollocateCase,
119 withinSpan = withinSpan,
120 exactFrequencies = exactFrequencies,
121 stopwords = stopwords,
Marc Kupietz5a336b62021-11-27 17:51:35 +0100122 addExamples = TRUE,
123 localStopwords = localStopwords,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200124 seed = seed,
125 expand = expand,
126 ...) ) %>%
127 bind_rows()
128 } else {
129 set.seed(seed)
130 candidates <- collocatesQuery(
131 kco,
132 node,
133 vc = vc,
134 minOccur = minOccur,
135 leftContextSize = leftContextSize,
136 rightContextSize = rightContextSize,
137 searchHitsSampleLimit = searchHitsSampleLimit,
138 ignoreCollocateCase = ignoreCollocateCase,
Marc Kupietz5a336b62021-11-27 17:51:35 +0100139 stopwords = append(stopwords, localStopwords),
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200140 ...
141 )
142
143 if (nrow(candidates) > 0) {
144 candidates <- candidates %>%
145 filter(frequency >= minOccur) %>%
Marc Kupietz23004c62022-09-06 10:55:28 +0200146 slice_head(n=topCollocatesLimit)
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200147 collocationScoreQuery(
148 kco,
149 node = node,
150 collocate = candidates$word,
151 vc = vc,
152 leftContextSize = leftContextSize,
153 rightContextSize = rightContextSize,
154 observed = if (exactFrequencies) NA else candidates$frequency,
155 ignoreCollocateCase = ignoreCollocateCase,
156 withinSpan = withinSpan,
157 ...
158 ) %>%
159 filter(.$O >= minOccur) %>%
160 dplyr::arrange(dplyr::desc(logDice))
161 } else {
162 tibble()
163 }
164 }
Marc Kupietzbdb95272021-12-22 17:42:21 +0100165 if (maxRecurse > 0 & length(result) > 0 && any(!!as.name(thresholdScore) >= threshold)) {
Marc Kupietz5a336b62021-11-27 17:51:35 +0100166 recurseWith <- result %>%
Marc Kupietz419f21f2021-12-07 10:27:30 +0100167 filter(!!as.name(thresholdScore) >= threshold)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100168 result <- collocationAnalysis(
169 kco,
170 node = paste0("(", buildCollocationQuery(
171 removeWithinSpan(recurseWith$node, withinSpan),
172 recurseWith$collocate,
173 leftContextSize = leftContextSize,
174 rightContextSize = rightContextSize,
175 withinSpan = ""
176 ), ")"),
177 vc = vc,
178 minOccur = minOccur,
179 leftContextSize = leftContextSize,
180 rightContextSize = rightContextSize,
181 withinSpan = withinSpan,
182 maxRecurse = maxRecurse - 1,
183 stopwords = stopwords,
184 localStopwords = recurseWith$collocate,
185 exactFrequencies = exactFrequencies,
186 searchHitsSampleLimit = searchHitsSampleLimit,
187 topCollocatesLimit = topCollocatesLimit,
188 addExamples = FALSE
189 ) %>%
190 bind_rows(result) %>%
191 filter(logDice >= 2) %>%
192 filter(.$O >= minOccur) %>%
193 dplyr::arrange(dplyr::desc(logDice))
194 }
195 if (addExamples && length(result) > 0) {
Marc Kupietz1678c3a2021-12-07 10:24:49 +0100196 result$query <-buildCollocationQuery(
Marc Kupietz5a336b62021-11-27 17:51:35 +0100197 result$node,
198 result$collocate,
199 leftContextSize = leftContextSize,
200 rightContextSize = rightContextSize,
Marc Kupietz90189652023-04-18 08:01:37 +0200201 withinSpan = withinSpan
Marc Kupietz5a336b62021-11-27 17:51:35 +0100202 )
203 result$example <- findExample(
204 kco,
Marc Kupietz1678c3a2021-12-07 10:24:49 +0100205 query = result$query,
Marc Kupietz5a336b62021-11-27 17:51:35 +0100206 vc = result$vc
207 )
208 }
209 result
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200210 }
211)
212
Marc Kupietz76b05592021-12-19 16:26:15 +0100213# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100214removeWithinSpan <- function(query, withinSpan) {
215 if (withinSpan == "") {
216 return(query)
217 }
218 needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
219 res <- gsub(needle, '\\1', query)
220 needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
221 res <- gsub(needle, '\\1', res)
222 return(res)
223}
224
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200225#' @importFrom magrittr debug_pipe
Marc Kupietz2b17b212023-08-27 17:47:26 +0200226#' @importFrom stringr str_detect
227#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
228#'
229matches2FreqTable <- function(matches,
230 index = 0,
231 minOccur = 5,
232 leftContextSize = 5,
233 rightContextSize = 5,
234 ignoreCollocateCase = FALSE,
235 stopwords = c(),
236 collocateFilterRegex = '^[:alnum:]+-?[:alnum:]*$',
237 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
238 verbose = TRUE) {
239 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
240 frequency <- NULL
241
242 if (nrow(matches) < 1) {
243 dplyr::tibble(word=c(), frequency=c())
244 } else if (index == 0) {
245 if (! "tokens" %in% colnames(matches) || ! is.list(matches$tokens)) {
246 log_info(verbose, "Outdated KorAP server: Falling back to client side tokenization.\n")
247 return(snippet2FreqTable(matches$snippet, minOccur, leftContextSize, rightContextSize, ignoreCollocateCase = ignoreCollocateCase,
248 stopwords = stopwords, oldTable = oldTable, verbose = verbose))
249
250 }
251 log_info(verbose, paste("Joining", nrow(matches), "kwics\n"))
252 for (i in 1:nrow(matches)) {
253 oldTable <- matches2FreqTable(
254 matches,
255 i,
256 leftContextSize = leftContextSize,
257 rightContextSize = rightContextSize,
258 collocateFilterRegex = collocateFilterRegex,
259 oldTable = oldTable,
260 stopwords = stopwords
261 )
262 }
263 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
264 oldTable %>%
265 group_by(word) %>%
266 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>%
267 summarise(frequency=sum(frequency), .groups = "drop") %>%
268 arrange(desc(frequency))
269 } else {
270 stopwordsTable <- dplyr::tibble(word=stopwords)
271
272 left <- tail(unlist(matches$tokens$left[index]), leftContextSize)
273
274# cat(paste("left:", left, "\n", collapse=" "))
275
276 right <- head(unlist(matches$tokens$right[index]), rightContextSize)
277
278# cat(paste("right:", right, "\n", collapse=" "))
279
280 if(length(left) + length(right) == 0) {
281 oldTable
282 } else {
283 table(c(left, right)) %>%
284 dplyr::as_tibble(.name_repair = "minimal") %>%
285 dplyr::rename(word = 1, frequency = 2) %>%
286 dplyr::filter(str_detect(word, collocateFilterRegex)) %>%
287 dplyr::anti_join(stopwordsTable, by="word") %>%
288 dplyr::bind_rows(oldTable)
289 }
290 }
291}
292
293#' @importFrom magrittr debug_pipe
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200294#' @importFrom stringr str_match str_split str_detect
295#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
296#'
297snippet2FreqTable <- function(snippet,
298 minOccur = 5,
299 leftContextSize = 5,
300 rightContextSize = 5,
301 ignoreCollocateCase = FALSE,
302 stopwords = c(),
303 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100304 collocateFilterRegex = '^[:alnum:]+-?[:alnum:]*$',
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200305 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
306 verbose = TRUE) {
307 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
308 frequency <- NULL
309
310 if (length(snippet) < 1) {
311 dplyr::tibble(word=c(), frequency=c())
312 } else if (length(snippet) > 1) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200313 log_info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200314 for (s in snippet) {
315 oldTable <- snippet2FreqTable(
316 s,
317 leftContextSize = leftContextSize,
318 rightContextSize = rightContextSize,
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100319 collocateFilterRegex = collocateFilterRegex,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200320 oldTable = oldTable,
321 stopwords = stopwords
322 )
323 }
Marc Kupietza47d1502023-04-18 15:26:47 +0200324 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200325 oldTable %>%
326 group_by(word) %>%
327 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>%
328 summarise(frequency=sum(frequency), .groups = "drop") %>%
329 arrange(desc(frequency))
330 } else {
331 stopwordsTable <- dplyr::tibble(word=stopwords)
332 match <-
333 str_match(
334 snippet,
335 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
336 )
337
338 left <- if(leftContextSize > 0)
339 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
340 else
341 ""
342# cat(paste("left:", left, "\n", collapse=" "))
343
344 right <- if(rightContextSize > 0)
345 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
346 else
347 ""
348# cat(paste("right:", right, "\n", collapse=" "))
349
Marc Kupietz21134402023-05-09 17:57:23 +0200350 if(is.na(left[1]) || is.na(right[1]) || length(left) + length(right) == 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200351 oldTable
352 } else {
353 table(c(left, right)) %>%
354 dplyr::as_tibble(.name_repair = "minimal") %>%
355 dplyr::rename(word = 1, frequency = 2) %>%
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100356 dplyr::filter(str_detect(word, collocateFilterRegex)) %>%
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200357 dplyr::anti_join(stopwordsTable, by="word") %>%
358 dplyr::bind_rows(oldTable)
359 }
360 }
361}
362
363#' Preliminary synsemantic stopwords function
364#'
365#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +0200366#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200367#'
368#' Preliminary synsemantic stopwords function to be used in collocation analysis.
369#'
370#' @details
371#' Currently only suitable for German. See stopwords package for other languages.
372#'
373#' @param ... future arguments for language detection
374#'
375#' @family collocation analysis functions
376#' @return Vector of synsemantic stopwords.
377#' @export
378synsemanticStopwords <- function(...) {
379 res <- c(
380 "der",
381 "die",
382 "und",
383 "in",
384 "den",
385 "von",
386 "mit",
387 "das",
388 "zu",
389 "im",
390 "ist",
391 "auf",
392 "sich",
393 "Die",
394 "des",
395 "dem",
396 "nicht",
397 "ein",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100398 "Ein",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200399 "eine",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100400 "Eine",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200401 "es",
402 "auch",
403 "an",
404 "als",
405 "am",
406 "aus",
407 "Der",
408 "bei",
409 "er",
410 "dass",
411 "sie",
412 "nach",
413 "um",
414 "Das",
415 "zum",
416 "noch",
417 "war",
418 "einen",
419 "einer",
420 "wie",
421 "einem",
422 "vor",
423 "bis",
424 "\u00fcber",
425 "so",
426 "aber",
427 "Eine",
428 "diese",
429 "Diese",
430 "oder"
431 )
432 return(res)
433}
434
Marc Kupietz5a336b62021-11-27 17:51:35 +0100435
Marc Kupietz76b05592021-12-19 16:26:15 +0100436# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100437findExample <-
438 function(kco,
439 query,
440 vc = "",
441 matchOnly = TRUE) {
442 out <- character(length = length(query))
443
444 if (length(vc) < length(query))
445 vc <- rep(vc, length(query))
446
447 for (i in seq_along(query)) {
448 q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100449 if (q@totalResults > 0) {
450 q <- fetchNext(q, maxFetch=50, randomizePageOrder=F)
451 example <- as.character((q@collectedMatches)$snippet[1])
452 out[i] <- if(matchOnly) {
Marc Kupietz5a336b62021-11-27 17:51:35 +0100453 gsub('.*<mark>(.+)</mark>.*', '\\1', example)
454 } else {
455 stringr::str_replace(example, '<[^>]*>', '')
456 }
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100457 } else {
458 out[i] = ""
459 }
Marc Kupietz5a336b62021-11-27 17:51:35 +0100460 }
461 out
462 }
463
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200464collocatesQuery <-
465 function(kco,
466 query,
467 vc = "",
468 minOccur = 5,
469 leftContextSize = 5,
470 rightContextSize = 5,
471 searchHitsSampleLimit = 20000,
472 ignoreCollocateCase = FALSE,
473 stopwords = c(),
474 ...) {
475 frequency <- NULL
476 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
477 if(q@totalResults == 0) {
478 tibble(word=c(), frequency=c())
479 } else {
480 q <- fetchNext(q, maxFetch=searchHitsSampleLimit, randomizePageOrder=TRUE)
Marc Kupietz2b17b212023-08-27 17:47:26 +0200481 matches2FreqTable (q@collectedMatches,
482 0,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200483 minOccur = minOccur,
484 leftContextSize = leftContextSize,
485 rightContextSize = rightContextSize,
486 ignoreCollocateCase = ignoreCollocateCase,
487 stopwords = stopwords,
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100488 ...,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200489 verbose = kco@verbose) %>%
490 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) %>%
491 filter(frequency >= minOccur)
492 }
493 }