blob: 6033ca93e0fefebd0efbb2a46441d7f56630a21b [file] [log] [blame]
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis") )
2
3#' Collocation analysis
4#'
5#' @aliases collocationAnalysis
6#'
7#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +02008#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +02009#'
10#' Performs a collocation analysis for the given node (or query)
11#' in the given virtual corpus.
12#'
13#' @details
14#' The collocation analysis is currently implemented on the client side, as some of the
15#' functionality is not yet provided by the KorAP backend. Mainly for this reason
16#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
17#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
18#' and look for expression-internal collocates using the focus function (see examples and demo).
19#'
20#' To increase speed at the cost of accuracy and possible false negatives,
21#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
22#'
23#' Note that currently not the tokenization provided by the backend, i.e. the corpus itself, is used, but a tinkered one.
24#' This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
25#' user interface.
26#'
27#' @family collocation analysis functions
28#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020029#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietzdbd431a2021-08-29 12:17:45 +020030#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
31#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
32#' @param searchHitsSampleLimit limit the size of the search hits sample
33#' @param stopwords vector of stopwords not to be considered as collocates
34#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
35#' @param seed seed for random page collecting order
Marc Kupietz67edcb52021-09-20 21:54:24 +020036#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
37#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietzdbd431a2021-08-29 12:17:45 +020038#' @inheritParams collocationScoreQuery,KorAPConnection-method
39#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
40#'
41#' @importFrom stringr str_match str_split str_detect
42#' @importFrom dplyr anti_join arrange desc slice_head bind_rows
43#' @importFrom purrr pmap
44#' @importFrom tidyr expand_grid
45#'
46#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020047#' \dontrun{
48#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020049#' # Find top collocates of "Packung" inside and outside the sports domain.
50#' new("KorAPConnection", verbose = TRUE) %>%
51#' collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"),
52#' leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) %>%
53#' dplyr::filter(logDice >= 5)
54#' }
55#'
Marc Kupietz6ae76052021-09-21 10:34:00 +020056#' \dontrun{
57#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020058#' # Identify the most prominent light verb construction with "in ... setzen".
59#' # Note that, currently, the use of focus function disallows exactFrequencies.
60#' new("KorAPConnection", verbose = TRUE) %>%
61#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
62#' leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20)
63#' }
64#'
65#' @export
66setMethod("collocationAnalysis", "KorAPConnection",
67 function(kco,
68 node,
69 vc = "",
70 lemmatizeNodeQuery = FALSE,
71 minOccur = 5,
72 leftContextSize = 5,
73 rightContextSize = 5,
74 topCollocatesLimit = 200,
75 searchHitsSampleLimit = 20000,
76 ignoreCollocateCase = FALSE,
77 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
78 exactFrequencies = TRUE,
79 stopwords = RKorAPClient::synsemanticStopwords(),
80 seed = 7,
81 expand = length(vc) != length(node),
Marc Kupietz5a336b62021-11-27 17:51:35 +010082 maxRecurse = 0,
83 addExamples = TRUE,
Marc Kupietz419f21f2021-12-07 10:27:30 +010084 thresholdScore = "logDice",
85 threshold = 2.0,
Marc Kupietz5a336b62021-11-27 17:51:35 +010086 localStopwords = c(),
Marc Kupietzdbd431a2021-08-29 12:17:45 +020087 ...) {
88 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
89 word <- frequency <- NULL
90
91 if(!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan)>0 )) {
92 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
93 }
94
Marc Kupietz581a29b2021-09-04 20:51:04 +020095 warnIfNoAccessToken(kco)
96
Marc Kupietzdbd431a2021-08-29 12:17:45 +020097 if (lemmatizeNodeQuery) {
98 node <- lemmatizeWordQuery(node)
99 }
100
Marc Kupietz5a336b62021-11-27 17:51:35 +0100101 result <- if (length(node) > 1 || length(vc) > 1) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200102 grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc)
103 purrr::pmap(grid, function(node, vc, ...)
104 collocationAnalysis(kco,
105 node =node,
106 vc = vc,
107 minOccur = minOccur,
108 leftContextSize = leftContextSize,
109 rightContextSize = rightContextSize,
110 topCollocatesLimit = topCollocatesLimit,
111 searchHitsSampleLimit = searchHitsSampleLimit,
112 ignoreCollocateCase = ignoreCollocateCase,
113 withinSpan = withinSpan,
114 exactFrequencies = exactFrequencies,
115 stopwords = stopwords,
Marc Kupietz5a336b62021-11-27 17:51:35 +0100116 addExamples = TRUE,
117 localStopwords = localStopwords,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200118 seed = seed,
119 expand = expand,
120 ...) ) %>%
121 bind_rows()
122 } else {
123 set.seed(seed)
124 candidates <- collocatesQuery(
125 kco,
126 node,
127 vc = vc,
128 minOccur = minOccur,
129 leftContextSize = leftContextSize,
130 rightContextSize = rightContextSize,
131 searchHitsSampleLimit = searchHitsSampleLimit,
132 ignoreCollocateCase = ignoreCollocateCase,
Marc Kupietz5a336b62021-11-27 17:51:35 +0100133 stopwords = append(stopwords, localStopwords),
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200134 ...
135 )
136
137 if (nrow(candidates) > 0) {
138 candidates <- candidates %>%
139 filter(frequency >= minOccur) %>%
140 head(topCollocatesLimit)
141 collocationScoreQuery(
142 kco,
143 node = node,
144 collocate = candidates$word,
145 vc = vc,
146 leftContextSize = leftContextSize,
147 rightContextSize = rightContextSize,
148 observed = if (exactFrequencies) NA else candidates$frequency,
149 ignoreCollocateCase = ignoreCollocateCase,
150 withinSpan = withinSpan,
151 ...
152 ) %>%
153 filter(.$O >= minOccur) %>%
154 dplyr::arrange(dplyr::desc(logDice))
155 } else {
156 tibble()
157 }
158 }
Marc Kupietz419f21f2021-12-07 10:27:30 +0100159 if (maxRecurse > 0 & any(!!as.name(thresholdScore) >= threshold)) {
Marc Kupietz5a336b62021-11-27 17:51:35 +0100160 recurseWith <- result %>%
Marc Kupietz419f21f2021-12-07 10:27:30 +0100161 filter(!!as.name(thresholdScore) >= threshold)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100162 result <- collocationAnalysis(
163 kco,
164 node = paste0("(", buildCollocationQuery(
165 removeWithinSpan(recurseWith$node, withinSpan),
166 recurseWith$collocate,
167 leftContextSize = leftContextSize,
168 rightContextSize = rightContextSize,
169 withinSpan = ""
170 ), ")"),
171 vc = vc,
172 minOccur = minOccur,
173 leftContextSize = leftContextSize,
174 rightContextSize = rightContextSize,
175 withinSpan = withinSpan,
176 maxRecurse = maxRecurse - 1,
177 stopwords = stopwords,
178 localStopwords = recurseWith$collocate,
179 exactFrequencies = exactFrequencies,
180 searchHitsSampleLimit = searchHitsSampleLimit,
181 topCollocatesLimit = topCollocatesLimit,
182 addExamples = FALSE
183 ) %>%
184 bind_rows(result) %>%
185 filter(logDice >= 2) %>%
186 filter(.$O >= minOccur) %>%
187 dplyr::arrange(dplyr::desc(logDice))
188 }
189 if (addExamples && length(result) > 0) {
Marc Kupietz1678c3a2021-12-07 10:24:49 +0100190 result$query <-buildCollocationQuery(
Marc Kupietz5a336b62021-11-27 17:51:35 +0100191 result$node,
192 result$collocate,
193 leftContextSize = leftContextSize,
194 rightContextSize = rightContextSize,
195 withinSpan = ""
196 )
197 result$example <- findExample(
198 kco,
Marc Kupietz1678c3a2021-12-07 10:24:49 +0100199 query = result$query,
Marc Kupietz5a336b62021-11-27 17:51:35 +0100200 vc = result$vc
201 )
202 }
203 result
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200204 }
205)
206
Marc Kupietz5a336b62021-11-27 17:51:35 +0100207#' @export
208removeWithinSpan <- function(query, withinSpan) {
209 if (withinSpan == "") {
210 return(query)
211 }
212 needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
213 res <- gsub(needle, '\\1', query)
214 needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
215 res <- gsub(needle, '\\1', res)
216 return(res)
217}
218
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200219#' @importFrom magrittr debug_pipe
220#' @importFrom stringr str_match str_split str_detect
221#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
222#'
223snippet2FreqTable <- function(snippet,
224 minOccur = 5,
225 leftContextSize = 5,
226 rightContextSize = 5,
227 ignoreCollocateCase = FALSE,
228 stopwords = c(),
229 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
230 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
231 verbose = TRUE) {
232 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
233 frequency <- NULL
234
235 if (length(snippet) < 1) {
236 dplyr::tibble(word=c(), frequency=c())
237 } else if (length(snippet) > 1) {
Marc Kupietz8fe0bee2021-12-07 10:25:45 +0100238 log.info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200239 for (s in snippet) {
240 oldTable <- snippet2FreqTable(
241 s,
242 leftContextSize = leftContextSize,
243 rightContextSize = rightContextSize,
244 oldTable = oldTable,
245 stopwords = stopwords
246 )
247 }
Marc Kupietzd07bf192021-09-04 20:24:44 +0200248 log.info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200249 oldTable %>%
250 group_by(word) %>%
251 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>%
252 summarise(frequency=sum(frequency), .groups = "drop") %>%
253 arrange(desc(frequency))
254 } else {
255 stopwordsTable <- dplyr::tibble(word=stopwords)
256 match <-
257 str_match(
258 snippet,
259 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
260 )
261
262 left <- if(leftContextSize > 0)
263 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
264 else
265 ""
266# cat(paste("left:", left, "\n", collapse=" "))
267
268 right <- if(rightContextSize > 0)
269 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
270 else
271 ""
272# cat(paste("right:", right, "\n", collapse=" "))
273
274 if(is.na(left) || is.na(right) || length(left) + length(right) == 0) {
275 oldTable
276 } else {
277 table(c(left, right)) %>%
278 dplyr::as_tibble(.name_repair = "minimal") %>%
279 dplyr::rename(word = 1, frequency = 2) %>%
280 dplyr::filter(str_detect(word, '^[:alnum:]+-?[:alnum:]*$')) %>%
281 dplyr::anti_join(stopwordsTable, by="word") %>%
282 dplyr::bind_rows(oldTable)
283 }
284 }
285}
286
287#' Preliminary synsemantic stopwords function
288#'
289#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +0200290#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200291#'
292#' Preliminary synsemantic stopwords function to be used in collocation analysis.
293#'
294#' @details
295#' Currently only suitable for German. See stopwords package for other languages.
296#'
297#' @param ... future arguments for language detection
298#'
299#' @family collocation analysis functions
300#' @return Vector of synsemantic stopwords.
301#' @export
302synsemanticStopwords <- function(...) {
303 res <- c(
304 "der",
305 "die",
306 "und",
307 "in",
308 "den",
309 "von",
310 "mit",
311 "das",
312 "zu",
313 "im",
314 "ist",
315 "auf",
316 "sich",
317 "Die",
318 "des",
319 "dem",
320 "nicht",
321 "ein",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100322 "Ein",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200323 "eine",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100324 "Eine",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200325 "es",
326 "auch",
327 "an",
328 "als",
329 "am",
330 "aus",
331 "Der",
332 "bei",
333 "er",
334 "dass",
335 "sie",
336 "nach",
337 "um",
338 "Das",
339 "zum",
340 "noch",
341 "war",
342 "einen",
343 "einer",
344 "wie",
345 "einem",
346 "vor",
347 "bis",
348 "\u00fcber",
349 "so",
350 "aber",
351 "Eine",
352 "diese",
353 "Diese",
354 "oder"
355 )
356 return(res)
357}
358
Marc Kupietz5a336b62021-11-27 17:51:35 +0100359
360#' @export
361findExample <-
362 function(kco,
363 query,
364 vc = "",
365 matchOnly = TRUE) {
366 out <- character(length = length(query))
367
368 if (length(vc) < length(query))
369 vc <- rep(vc, length(query))
370
371 for (i in seq_along(query)) {
372 q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100373 if (q@totalResults > 0) {
374 q <- fetchNext(q, maxFetch=50, randomizePageOrder=F)
375 example <- as.character((q@collectedMatches)$snippet[1])
376 out[i] <- if(matchOnly) {
Marc Kupietz5a336b62021-11-27 17:51:35 +0100377 gsub('.*<mark>(.+)</mark>.*', '\\1', example)
378 } else {
379 stringr::str_replace(example, '<[^>]*>', '')
380 }
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100381 } else {
382 out[i] = ""
383 }
Marc Kupietz5a336b62021-11-27 17:51:35 +0100384 }
385 out
386 }
387
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200388collocatesQuery <-
389 function(kco,
390 query,
391 vc = "",
392 minOccur = 5,
393 leftContextSize = 5,
394 rightContextSize = 5,
395 searchHitsSampleLimit = 20000,
396 ignoreCollocateCase = FALSE,
397 stopwords = c(),
398 ...) {
399 frequency <- NULL
400 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
401 if(q@totalResults == 0) {
402 tibble(word=c(), frequency=c())
403 } else {
404 q <- fetchNext(q, maxFetch=searchHitsSampleLimit, randomizePageOrder=TRUE)
405 snippet2FreqTable((q@collectedMatches)$snippet,
406 minOccur = minOccur,
407 leftContextSize = leftContextSize,
408 rightContextSize = rightContextSize,
409 ignoreCollocateCase = ignoreCollocateCase,
410 stopwords = stopwords,
411 verbose = kco@verbose) %>%
412 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) %>%
413 filter(frequency >= minOccur)
414 }
415 }