blob: a10faf87a55f915505e02363a7c8f9c347b1cd22 [file] [log] [blame]
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis") )
2
3#' Collocation analysis
4#'
5#' @aliases collocationAnalysis
6#'
7#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +02008#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +02009#'
10#' Performs a collocation analysis for the given node (or query)
11#' in the given virtual corpus.
12#'
13#' @details
14#' The collocation analysis is currently implemented on the client side, as some of the
15#' functionality is not yet provided by the KorAP backend. Mainly for this reason
16#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
17#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
18#' and look for expression-internal collocates using the focus function (see examples and demo).
19#'
20#' To increase speed at the cost of accuracy and possible false negatives,
21#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
22#'
23#' Note that currently not the tokenization provided by the backend, i.e. the corpus itself, is used, but a tinkered one.
24#' This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
25#' user interface.
26#'
27#' @family collocation analysis functions
28#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020029#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietzdbd431a2021-08-29 12:17:45 +020030#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
31#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
32#' @param searchHitsSampleLimit limit the size of the search hits sample
33#' @param stopwords vector of stopwords not to be considered as collocates
34#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
35#' @param seed seed for random page collecting order
Marc Kupietz67edcb52021-09-20 21:54:24 +020036#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
37#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietzdbd431a2021-08-29 12:17:45 +020038#' @inheritParams collocationScoreQuery,KorAPConnection-method
39#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
40#'
41#' @importFrom stringr str_match str_split str_detect
42#' @importFrom dplyr anti_join arrange desc slice_head bind_rows
43#' @importFrom purrr pmap
44#' @importFrom tidyr expand_grid
45#'
46#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020047#' \dontrun{
48#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020049#' # Find top collocates of "Packung" inside and outside the sports domain.
50#' new("KorAPConnection", verbose = TRUE) %>%
51#' collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"),
52#' leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) %>%
53#' dplyr::filter(logDice >= 5)
54#' }
55#'
Marc Kupietz6ae76052021-09-21 10:34:00 +020056#' \dontrun{
57#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020058#' # Identify the most prominent light verb construction with "in ... setzen".
59#' # Note that, currently, the use of focus function disallows exactFrequencies.
60#' new("KorAPConnection", verbose = TRUE) %>%
61#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
62#' leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20)
63#' }
64#'
65#' @export
66setMethod("collocationAnalysis", "KorAPConnection",
67 function(kco,
68 node,
69 vc = "",
70 lemmatizeNodeQuery = FALSE,
71 minOccur = 5,
72 leftContextSize = 5,
73 rightContextSize = 5,
74 topCollocatesLimit = 200,
75 searchHitsSampleLimit = 20000,
76 ignoreCollocateCase = FALSE,
77 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
78 exactFrequencies = TRUE,
79 stopwords = RKorAPClient::synsemanticStopwords(),
80 seed = 7,
81 expand = length(vc) != length(node),
Marc Kupietz5a336b62021-11-27 17:51:35 +010082 maxRecurse = 0,
83 addExamples = TRUE,
84 localStopwords = c(),
Marc Kupietzdbd431a2021-08-29 12:17:45 +020085 ...) {
86 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
87 word <- frequency <- NULL
88
89 if(!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan)>0 )) {
90 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
91 }
92
Marc Kupietz581a29b2021-09-04 20:51:04 +020093 warnIfNoAccessToken(kco)
94
Marc Kupietzdbd431a2021-08-29 12:17:45 +020095 if (lemmatizeNodeQuery) {
96 node <- lemmatizeWordQuery(node)
97 }
98
Marc Kupietz5a336b62021-11-27 17:51:35 +010099 result <- if (length(node) > 1 || length(vc) > 1) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200100 grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc)
101 purrr::pmap(grid, function(node, vc, ...)
102 collocationAnalysis(kco,
103 node =node,
104 vc = vc,
105 minOccur = minOccur,
106 leftContextSize = leftContextSize,
107 rightContextSize = rightContextSize,
108 topCollocatesLimit = topCollocatesLimit,
109 searchHitsSampleLimit = searchHitsSampleLimit,
110 ignoreCollocateCase = ignoreCollocateCase,
111 withinSpan = withinSpan,
112 exactFrequencies = exactFrequencies,
113 stopwords = stopwords,
Marc Kupietz5a336b62021-11-27 17:51:35 +0100114 addExamples = TRUE,
115 localStopwords = localStopwords,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200116 seed = seed,
117 expand = expand,
118 ...) ) %>%
119 bind_rows()
120 } else {
121 set.seed(seed)
122 candidates <- collocatesQuery(
123 kco,
124 node,
125 vc = vc,
126 minOccur = minOccur,
127 leftContextSize = leftContextSize,
128 rightContextSize = rightContextSize,
129 searchHitsSampleLimit = searchHitsSampleLimit,
130 ignoreCollocateCase = ignoreCollocateCase,
Marc Kupietz5a336b62021-11-27 17:51:35 +0100131 stopwords = append(stopwords, localStopwords),
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200132 ...
133 )
134
135 if (nrow(candidates) > 0) {
136 candidates <- candidates %>%
137 filter(frequency >= minOccur) %>%
138 head(topCollocatesLimit)
139 collocationScoreQuery(
140 kco,
141 node = node,
142 collocate = candidates$word,
143 vc = vc,
144 leftContextSize = leftContextSize,
145 rightContextSize = rightContextSize,
146 observed = if (exactFrequencies) NA else candidates$frequency,
147 ignoreCollocateCase = ignoreCollocateCase,
148 withinSpan = withinSpan,
149 ...
150 ) %>%
151 filter(.$O >= minOccur) %>%
152 dplyr::arrange(dplyr::desc(logDice))
153 } else {
154 tibble()
155 }
156 }
Marc Kupietz5a336b62021-11-27 17:51:35 +0100157 if (maxRecurse > 0 & any( result$logDice >= 2) ) {
158 recurseWith <- result %>%
159 filter(logDice >= 2)
160 result <- collocationAnalysis(
161 kco,
162 node = paste0("(", buildCollocationQuery(
163 removeWithinSpan(recurseWith$node, withinSpan),
164 recurseWith$collocate,
165 leftContextSize = leftContextSize,
166 rightContextSize = rightContextSize,
167 withinSpan = ""
168 ), ")"),
169 vc = vc,
170 minOccur = minOccur,
171 leftContextSize = leftContextSize,
172 rightContextSize = rightContextSize,
173 withinSpan = withinSpan,
174 maxRecurse = maxRecurse - 1,
175 stopwords = stopwords,
176 localStopwords = recurseWith$collocate,
177 exactFrequencies = exactFrequencies,
178 searchHitsSampleLimit = searchHitsSampleLimit,
179 topCollocatesLimit = topCollocatesLimit,
180 addExamples = FALSE
181 ) %>%
182 bind_rows(result) %>%
183 filter(logDice >= 2) %>%
184 filter(.$O >= minOccur) %>%
185 dplyr::arrange(dplyr::desc(logDice))
186 }
187 if (addExamples && length(result) > 0) {
Marc Kupietz1678c3a2021-12-07 10:24:49 +0100188 result$query <-buildCollocationQuery(
Marc Kupietz5a336b62021-11-27 17:51:35 +0100189 result$node,
190 result$collocate,
191 leftContextSize = leftContextSize,
192 rightContextSize = rightContextSize,
193 withinSpan = ""
194 )
195 result$example <- findExample(
196 kco,
Marc Kupietz1678c3a2021-12-07 10:24:49 +0100197 query = result$query,
Marc Kupietz5a336b62021-11-27 17:51:35 +0100198 vc = result$vc
199 )
200 }
201 result
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200202 }
203)
204
Marc Kupietz5a336b62021-11-27 17:51:35 +0100205#' @export
206removeWithinSpan <- function(query, withinSpan) {
207 if (withinSpan == "") {
208 return(query)
209 }
210 needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
211 res <- gsub(needle, '\\1', query)
212 needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
213 res <- gsub(needle, '\\1', res)
214 return(res)
215}
216
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200217#' @importFrom magrittr debug_pipe
218#' @importFrom stringr str_match str_split str_detect
219#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
220#'
221snippet2FreqTable <- function(snippet,
222 minOccur = 5,
223 leftContextSize = 5,
224 rightContextSize = 5,
225 ignoreCollocateCase = FALSE,
226 stopwords = c(),
227 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
228 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
229 verbose = TRUE) {
230 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
231 frequency <- NULL
232
233 if (length(snippet) < 1) {
234 dplyr::tibble(word=c(), frequency=c())
235 } else if (length(snippet) > 1) {
Marc Kupietz8fe0bee2021-12-07 10:25:45 +0100236 log.info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200237 for (s in snippet) {
238 oldTable <- snippet2FreqTable(
239 s,
240 leftContextSize = leftContextSize,
241 rightContextSize = rightContextSize,
242 oldTable = oldTable,
243 stopwords = stopwords
244 )
245 }
Marc Kupietzd07bf192021-09-04 20:24:44 +0200246 log.info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200247 oldTable %>%
248 group_by(word) %>%
249 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>%
250 summarise(frequency=sum(frequency), .groups = "drop") %>%
251 arrange(desc(frequency))
252 } else {
253 stopwordsTable <- dplyr::tibble(word=stopwords)
254 match <-
255 str_match(
256 snippet,
257 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
258 )
259
260 left <- if(leftContextSize > 0)
261 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
262 else
263 ""
264# cat(paste("left:", left, "\n", collapse=" "))
265
266 right <- if(rightContextSize > 0)
267 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
268 else
269 ""
270# cat(paste("right:", right, "\n", collapse=" "))
271
272 if(is.na(left) || is.na(right) || length(left) + length(right) == 0) {
273 oldTable
274 } else {
275 table(c(left, right)) %>%
276 dplyr::as_tibble(.name_repair = "minimal") %>%
277 dplyr::rename(word = 1, frequency = 2) %>%
278 dplyr::filter(str_detect(word, '^[:alnum:]+-?[:alnum:]*$')) %>%
279 dplyr::anti_join(stopwordsTable, by="word") %>%
280 dplyr::bind_rows(oldTable)
281 }
282 }
283}
284
285#' Preliminary synsemantic stopwords function
286#'
287#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +0200288#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200289#'
290#' Preliminary synsemantic stopwords function to be used in collocation analysis.
291#'
292#' @details
293#' Currently only suitable for German. See stopwords package for other languages.
294#'
295#' @param ... future arguments for language detection
296#'
297#' @family collocation analysis functions
298#' @return Vector of synsemantic stopwords.
299#' @export
300synsemanticStopwords <- function(...) {
301 res <- c(
302 "der",
303 "die",
304 "und",
305 "in",
306 "den",
307 "von",
308 "mit",
309 "das",
310 "zu",
311 "im",
312 "ist",
313 "auf",
314 "sich",
315 "Die",
316 "des",
317 "dem",
318 "nicht",
319 "ein",
320 "eine",
321 "es",
322 "auch",
323 "an",
324 "als",
325 "am",
326 "aus",
327 "Der",
328 "bei",
329 "er",
330 "dass",
331 "sie",
332 "nach",
333 "um",
334 "Das",
335 "zum",
336 "noch",
337 "war",
338 "einen",
339 "einer",
340 "wie",
341 "einem",
342 "vor",
343 "bis",
344 "\u00fcber",
345 "so",
346 "aber",
347 "Eine",
348 "diese",
349 "Diese",
350 "oder"
351 )
352 return(res)
353}
354
Marc Kupietz5a336b62021-11-27 17:51:35 +0100355
356#' @export
357findExample <-
358 function(kco,
359 query,
360 vc = "",
361 matchOnly = TRUE) {
362 out <- character(length = length(query))
363
364 if (length(vc) < length(query))
365 vc <- rep(vc, length(query))
366
367 for (i in seq_along(query)) {
368 q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
369 q <- fetchNext(q, maxFetch=50, randomizePageOrder=F)
370 example <- as.character((q@collectedMatches)$snippet[1])
371 out[i] <- if(matchOnly) {
372 gsub('.*<mark>(.+)</mark>.*', '\\1', example)
373 } else {
374 stringr::str_replace(example, '<[^>]*>', '')
375 }
376 }
377 out
378 }
379
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200380collocatesQuery <-
381 function(kco,
382 query,
383 vc = "",
384 minOccur = 5,
385 leftContextSize = 5,
386 rightContextSize = 5,
387 searchHitsSampleLimit = 20000,
388 ignoreCollocateCase = FALSE,
389 stopwords = c(),
390 ...) {
391 frequency <- NULL
392 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
393 if(q@totalResults == 0) {
394 tibble(word=c(), frequency=c())
395 } else {
396 q <- fetchNext(q, maxFetch=searchHitsSampleLimit, randomizePageOrder=TRUE)
397 snippet2FreqTable((q@collectedMatches)$snippet,
398 minOccur = minOccur,
399 leftContextSize = leftContextSize,
400 rightContextSize = rightContextSize,
401 ignoreCollocateCase = ignoreCollocateCase,
402 stopwords = stopwords,
403 verbose = kco@verbose) %>%
404 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) %>%
405 filter(frequency >= minOccur)
406 }
407 }