blob: fc9d81472f3a6535ce1afcc601e25c4ad0174132 [file] [log] [blame]
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis") )
2
3#' Collocation analysis
4#'
5#' @aliases collocationAnalysis
6#'
7#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +02008#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +02009#'
10#' Performs a collocation analysis for the given node (or query)
11#' in the given virtual corpus.
12#'
13#' @details
14#' The collocation analysis is currently implemented on the client side, as some of the
15#' functionality is not yet provided by the KorAP backend. Mainly for this reason
16#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
17#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
18#' and look for expression-internal collocates using the focus function (see examples and demo).
19#'
20#' To increase speed at the cost of accuracy and possible false negatives,
21#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
22#'
23#' Note that currently not the tokenization provided by the backend, i.e. the corpus itself, is used, but a tinkered one.
24#' This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
25#' user interface.
26#'
27#' @family collocation analysis functions
28#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020029#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietzdbd431a2021-08-29 12:17:45 +020030#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
31#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
32#' @param searchHitsSampleLimit limit the size of the search hits sample
33#' @param stopwords vector of stopwords not to be considered as collocates
34#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
35#' @param seed seed for random page collecting order
Marc Kupietz67edcb52021-09-20 21:54:24 +020036#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
Marc Kupietz7d400e02021-12-19 16:39:36 +010037#' @param maxRecurse apply collocation analysis recursively `maxRecurse` times
38#' @param addExamples If TRUE, examples for instances of collocations will be added in a column `example`. This makes a difference in particular if `node` is given as a lemma query.
39#' @param thresholdScore association score function (see \code{\link{association-score-functions}}) to use for computing the threshold that is applied for recursive collocation analysis calls
40#' @param threshold minimum value of `thresholdScore` function call to apply collocation analysis recursively
41#' @param localStopwords vector of stopwords that will not be considered as collocates in the current function call, but that will not be passed to recursive calls
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010042#' @param collocateFilterRegex allow only collocates matching the regular expression
Marc Kupietz67edcb52021-09-20 21:54:24 +020043#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietzdbd431a2021-08-29 12:17:45 +020044#' @inheritParams collocationScoreQuery,KorAPConnection-method
45#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
46#'
47#' @importFrom stringr str_match str_split str_detect
48#' @importFrom dplyr anti_join arrange desc slice_head bind_rows
49#' @importFrom purrr pmap
50#' @importFrom tidyr expand_grid
51#'
52#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020053#' \dontrun{
54#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020055#' # Find top collocates of "Packung" inside and outside the sports domain.
56#' new("KorAPConnection", verbose = TRUE) %>%
57#' collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"),
58#' leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) %>%
59#' dplyr::filter(logDice >= 5)
60#' }
61#'
Marc Kupietz6ae76052021-09-21 10:34:00 +020062#' \dontrun{
63#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020064#' # Identify the most prominent light verb construction with "in ... setzen".
65#' # Note that, currently, the use of focus function disallows exactFrequencies.
66#' new("KorAPConnection", verbose = TRUE) %>%
67#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
68#' leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20)
69#' }
70#'
71#' @export
72setMethod("collocationAnalysis", "KorAPConnection",
73 function(kco,
74 node,
75 vc = "",
76 lemmatizeNodeQuery = FALSE,
77 minOccur = 5,
78 leftContextSize = 5,
79 rightContextSize = 5,
80 topCollocatesLimit = 200,
81 searchHitsSampleLimit = 20000,
82 ignoreCollocateCase = FALSE,
83 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
84 exactFrequencies = TRUE,
Marc Kupietz6505ccf2021-11-27 17:46:25 +010085 stopwords = append(RKorAPClient::synsemanticStopwords(), node),
Marc Kupietzdbd431a2021-08-29 12:17:45 +020086 seed = 7,
87 expand = length(vc) != length(node),
Marc Kupietz5a336b62021-11-27 17:51:35 +010088 maxRecurse = 0,
Marc Kupietzdadfd912021-12-22 12:48:20 +010089 addExamples = FALSE,
Marc Kupietz419f21f2021-12-07 10:27:30 +010090 thresholdScore = "logDice",
91 threshold = 2.0,
Marc Kupietz5a336b62021-11-27 17:51:35 +010092 localStopwords = c(),
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010093 collocateFilterRegex = '^[:alnum:]+-?[:alnum:]*$',
Marc Kupietzdbd431a2021-08-29 12:17:45 +020094 ...) {
95 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
96 word <- frequency <- NULL
97
98 if(!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan)>0 )) {
99 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
100 }
101
Marc Kupietz581a29b2021-09-04 20:51:04 +0200102 warnIfNoAccessToken(kco)
103
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200104 if (lemmatizeNodeQuery) {
105 node <- lemmatizeWordQuery(node)
106 }
107
Marc Kupietz5a336b62021-11-27 17:51:35 +0100108 result <- if (length(node) > 1 || length(vc) > 1) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200109 grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc)
110 purrr::pmap(grid, function(node, vc, ...)
111 collocationAnalysis(kco,
112 node =node,
113 vc = vc,
114 minOccur = minOccur,
115 leftContextSize = leftContextSize,
116 rightContextSize = rightContextSize,
117 topCollocatesLimit = topCollocatesLimit,
118 searchHitsSampleLimit = searchHitsSampleLimit,
119 ignoreCollocateCase = ignoreCollocateCase,
120 withinSpan = withinSpan,
121 exactFrequencies = exactFrequencies,
122 stopwords = stopwords,
Marc Kupietz5a336b62021-11-27 17:51:35 +0100123 addExamples = TRUE,
124 localStopwords = localStopwords,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200125 seed = seed,
126 expand = expand,
127 ...) ) %>%
128 bind_rows()
129 } else {
130 set.seed(seed)
131 candidates <- collocatesQuery(
132 kco,
133 node,
134 vc = vc,
135 minOccur = minOccur,
136 leftContextSize = leftContextSize,
137 rightContextSize = rightContextSize,
138 searchHitsSampleLimit = searchHitsSampleLimit,
139 ignoreCollocateCase = ignoreCollocateCase,
Marc Kupietz5a336b62021-11-27 17:51:35 +0100140 stopwords = append(stopwords, localStopwords),
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200141 ...
142 )
143
144 if (nrow(candidates) > 0) {
145 candidates <- candidates %>%
146 filter(frequency >= minOccur) %>%
Marc Kupietz23004c62022-09-06 10:55:28 +0200147 slice_head(n=topCollocatesLimit)
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200148 collocationScoreQuery(
149 kco,
150 node = node,
151 collocate = candidates$word,
152 vc = vc,
153 leftContextSize = leftContextSize,
154 rightContextSize = rightContextSize,
155 observed = if (exactFrequencies) NA else candidates$frequency,
156 ignoreCollocateCase = ignoreCollocateCase,
157 withinSpan = withinSpan,
158 ...
159 ) %>%
160 filter(.$O >= minOccur) %>%
161 dplyr::arrange(dplyr::desc(logDice))
162 } else {
163 tibble()
164 }
165 }
Marc Kupietzbdb95272021-12-22 17:42:21 +0100166 if (maxRecurse > 0 & length(result) > 0 && any(!!as.name(thresholdScore) >= threshold)) {
Marc Kupietz5a336b62021-11-27 17:51:35 +0100167 recurseWith <- result %>%
Marc Kupietz419f21f2021-12-07 10:27:30 +0100168 filter(!!as.name(thresholdScore) >= threshold)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100169 result <- collocationAnalysis(
170 kco,
171 node = paste0("(", buildCollocationQuery(
172 removeWithinSpan(recurseWith$node, withinSpan),
173 recurseWith$collocate,
174 leftContextSize = leftContextSize,
175 rightContextSize = rightContextSize,
176 withinSpan = ""
177 ), ")"),
178 vc = vc,
179 minOccur = minOccur,
180 leftContextSize = leftContextSize,
181 rightContextSize = rightContextSize,
182 withinSpan = withinSpan,
183 maxRecurse = maxRecurse - 1,
184 stopwords = stopwords,
185 localStopwords = recurseWith$collocate,
186 exactFrequencies = exactFrequencies,
187 searchHitsSampleLimit = searchHitsSampleLimit,
188 topCollocatesLimit = topCollocatesLimit,
189 addExamples = FALSE
190 ) %>%
191 bind_rows(result) %>%
192 filter(logDice >= 2) %>%
193 filter(.$O >= minOccur) %>%
194 dplyr::arrange(dplyr::desc(logDice))
195 }
196 if (addExamples && length(result) > 0) {
Marc Kupietz1678c3a2021-12-07 10:24:49 +0100197 result$query <-buildCollocationQuery(
Marc Kupietz5a336b62021-11-27 17:51:35 +0100198 result$node,
199 result$collocate,
200 leftContextSize = leftContextSize,
201 rightContextSize = rightContextSize,
Marc Kupietz90189652023-04-18 08:01:37 +0200202 withinSpan = withinSpan
Marc Kupietz5a336b62021-11-27 17:51:35 +0100203 )
204 result$example <- findExample(
205 kco,
Marc Kupietz1678c3a2021-12-07 10:24:49 +0100206 query = result$query,
Marc Kupietz5a336b62021-11-27 17:51:35 +0100207 vc = result$vc
208 )
209 }
210 result
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200211 }
212)
213
Marc Kupietz76b05592021-12-19 16:26:15 +0100214# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100215removeWithinSpan <- function(query, withinSpan) {
216 if (withinSpan == "") {
217 return(query)
218 }
219 needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
220 res <- gsub(needle, '\\1', query)
221 needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
222 res <- gsub(needle, '\\1', res)
223 return(res)
224}
225
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200226#' @importFrom magrittr debug_pipe
227#' @importFrom stringr str_match str_split str_detect
228#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
229#'
230snippet2FreqTable <- function(snippet,
231 minOccur = 5,
232 leftContextSize = 5,
233 rightContextSize = 5,
234 ignoreCollocateCase = FALSE,
235 stopwords = c(),
236 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100237 collocateFilterRegex = '^[:alnum:]+-?[:alnum:]*$',
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200238 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
239 verbose = TRUE) {
240 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
241 frequency <- NULL
242
243 if (length(snippet) < 1) {
244 dplyr::tibble(word=c(), frequency=c())
245 } else if (length(snippet) > 1) {
Marc Kupietza47d1502023-04-18 15:26:47 +0200246 log_info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200247 for (s in snippet) {
248 oldTable <- snippet2FreqTable(
249 s,
250 leftContextSize = leftContextSize,
251 rightContextSize = rightContextSize,
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100252 collocateFilterRegex = collocateFilterRegex,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200253 oldTable = oldTable,
254 stopwords = stopwords
255 )
256 }
Marc Kupietza47d1502023-04-18 15:26:47 +0200257 log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200258 oldTable %>%
259 group_by(word) %>%
260 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>%
261 summarise(frequency=sum(frequency), .groups = "drop") %>%
262 arrange(desc(frequency))
263 } else {
264 stopwordsTable <- dplyr::tibble(word=stopwords)
265 match <-
266 str_match(
267 snippet,
268 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
269 )
270
271 left <- if(leftContextSize > 0)
272 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
273 else
274 ""
275# cat(paste("left:", left, "\n", collapse=" "))
276
277 right <- if(rightContextSize > 0)
278 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
279 else
280 ""
281# cat(paste("right:", right, "\n", collapse=" "))
282
Marc Kupietz21134402023-05-09 17:57:23 +0200283 if(is.na(left[1]) || is.na(right[1]) || length(left) + length(right) == 0) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200284 oldTable
285 } else {
286 table(c(left, right)) %>%
287 dplyr::as_tibble(.name_repair = "minimal") %>%
288 dplyr::rename(word = 1, frequency = 2) %>%
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100289 dplyr::filter(str_detect(word, collocateFilterRegex)) %>%
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200290 dplyr::anti_join(stopwordsTable, by="word") %>%
291 dplyr::bind_rows(oldTable)
292 }
293 }
294}
295
296#' Preliminary synsemantic stopwords function
297#'
298#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +0200299#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200300#'
301#' Preliminary synsemantic stopwords function to be used in collocation analysis.
302#'
303#' @details
304#' Currently only suitable for German. See stopwords package for other languages.
305#'
306#' @param ... future arguments for language detection
307#'
308#' @family collocation analysis functions
309#' @return Vector of synsemantic stopwords.
310#' @export
311synsemanticStopwords <- function(...) {
312 res <- c(
313 "der",
314 "die",
315 "und",
316 "in",
317 "den",
318 "von",
319 "mit",
320 "das",
321 "zu",
322 "im",
323 "ist",
324 "auf",
325 "sich",
326 "Die",
327 "des",
328 "dem",
329 "nicht",
330 "ein",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100331 "Ein",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200332 "eine",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100333 "Eine",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200334 "es",
335 "auch",
336 "an",
337 "als",
338 "am",
339 "aus",
340 "Der",
341 "bei",
342 "er",
343 "dass",
344 "sie",
345 "nach",
346 "um",
347 "Das",
348 "zum",
349 "noch",
350 "war",
351 "einen",
352 "einer",
353 "wie",
354 "einem",
355 "vor",
356 "bis",
357 "\u00fcber",
358 "so",
359 "aber",
360 "Eine",
361 "diese",
362 "Diese",
363 "oder"
364 )
365 return(res)
366}
367
Marc Kupietz5a336b62021-11-27 17:51:35 +0100368
Marc Kupietz76b05592021-12-19 16:26:15 +0100369# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100370findExample <-
371 function(kco,
372 query,
373 vc = "",
374 matchOnly = TRUE) {
375 out <- character(length = length(query))
376
377 if (length(vc) < length(query))
378 vc <- rep(vc, length(query))
379
380 for (i in seq_along(query)) {
381 q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100382 if (q@totalResults > 0) {
383 q <- fetchNext(q, maxFetch=50, randomizePageOrder=F)
384 example <- as.character((q@collectedMatches)$snippet[1])
385 out[i] <- if(matchOnly) {
Marc Kupietz5a336b62021-11-27 17:51:35 +0100386 gsub('.*<mark>(.+)</mark>.*', '\\1', example)
387 } else {
388 stringr::str_replace(example, '<[^>]*>', '')
389 }
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100390 } else {
391 out[i] = ""
392 }
Marc Kupietz5a336b62021-11-27 17:51:35 +0100393 }
394 out
395 }
396
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200397collocatesQuery <-
398 function(kco,
399 query,
400 vc = "",
401 minOccur = 5,
402 leftContextSize = 5,
403 rightContextSize = 5,
404 searchHitsSampleLimit = 20000,
405 ignoreCollocateCase = FALSE,
406 stopwords = c(),
407 ...) {
408 frequency <- NULL
409 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
410 if(q@totalResults == 0) {
411 tibble(word=c(), frequency=c())
412 } else {
413 q <- fetchNext(q, maxFetch=searchHitsSampleLimit, randomizePageOrder=TRUE)
414 snippet2FreqTable((q@collectedMatches)$snippet,
415 minOccur = minOccur,
416 leftContextSize = leftContextSize,
417 rightContextSize = rightContextSize,
418 ignoreCollocateCase = ignoreCollocateCase,
419 stopwords = stopwords,
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100420 ...,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200421 verbose = kco@verbose) %>%
422 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) %>%
423 filter(frequency >= minOccur)
424 }
425 }