blob: 5e3f7ddcb60c26b38d1ccf8d9f8f165be8ec6499 [file] [log] [blame]
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis") )
2
3#' Collocation analysis
4#'
5#' @aliases collocationAnalysis
6#'
7#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +02008#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +02009#'
10#' Performs a collocation analysis for the given node (or query)
11#' in the given virtual corpus.
12#'
13#' @details
14#' The collocation analysis is currently implemented on the client side, as some of the
15#' functionality is not yet provided by the KorAP backend. Mainly for this reason
16#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
17#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
18#' and look for expression-internal collocates using the focus function (see examples and demo).
19#'
20#' To increase speed at the cost of accuracy and possible false negatives,
21#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
22#'
23#' Note that currently not the tokenization provided by the backend, i.e. the corpus itself, is used, but a tinkered one.
24#' This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
25#' user interface.
26#'
27#' @family collocation analysis functions
28#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020029#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietzdbd431a2021-08-29 12:17:45 +020030#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
31#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
32#' @param searchHitsSampleLimit limit the size of the search hits sample
33#' @param stopwords vector of stopwords not to be considered as collocates
34#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
35#' @param seed seed for random page collecting order
Marc Kupietz67edcb52021-09-20 21:54:24 +020036#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010037#' @param collocateFilterRegex allow only collocates matching the regular expression
Marc Kupietz67edcb52021-09-20 21:54:24 +020038#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietzdbd431a2021-08-29 12:17:45 +020039#' @inheritParams collocationScoreQuery,KorAPConnection-method
40#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
41#'
42#' @importFrom stringr str_match str_split str_detect
43#' @importFrom dplyr anti_join arrange desc slice_head bind_rows
44#' @importFrom purrr pmap
45#' @importFrom tidyr expand_grid
46#'
47#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020048#' \dontrun{
49#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020050#' # Find top collocates of "Packung" inside and outside the sports domain.
51#' new("KorAPConnection", verbose = TRUE) %>%
52#' collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"),
53#' leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) %>%
54#' dplyr::filter(logDice >= 5)
55#' }
56#'
Marc Kupietz6ae76052021-09-21 10:34:00 +020057#' \dontrun{
58#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020059#' # Identify the most prominent light verb construction with "in ... setzen".
60#' # Note that, currently, the use of focus function disallows exactFrequencies.
61#' new("KorAPConnection", verbose = TRUE) %>%
62#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
63#' leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20)
64#' }
65#'
66#' @export
67setMethod("collocationAnalysis", "KorAPConnection",
68 function(kco,
69 node,
70 vc = "",
71 lemmatizeNodeQuery = FALSE,
72 minOccur = 5,
73 leftContextSize = 5,
74 rightContextSize = 5,
75 topCollocatesLimit = 200,
76 searchHitsSampleLimit = 20000,
77 ignoreCollocateCase = FALSE,
78 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
79 exactFrequencies = TRUE,
80 stopwords = RKorAPClient::synsemanticStopwords(),
81 seed = 7,
82 expand = length(vc) != length(node),
Marc Kupietz5a336b62021-11-27 17:51:35 +010083 maxRecurse = 0,
84 addExamples = TRUE,
Marc Kupietz419f21f2021-12-07 10:27:30 +010085 thresholdScore = "logDice",
86 threshold = 2.0,
Marc Kupietz5a336b62021-11-27 17:51:35 +010087 localStopwords = c(),
Marc Kupietz47d0d2b2021-12-19 16:38:52 +010088 collocateFilterRegex = '^[:alnum:]+-?[:alnum:]*$',
Marc Kupietzdbd431a2021-08-29 12:17:45 +020089 ...) {
90 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
91 word <- frequency <- NULL
92
93 if(!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan)>0 )) {
94 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
95 }
96
Marc Kupietz581a29b2021-09-04 20:51:04 +020097 warnIfNoAccessToken(kco)
98
Marc Kupietzdbd431a2021-08-29 12:17:45 +020099 if (lemmatizeNodeQuery) {
100 node <- lemmatizeWordQuery(node)
101 }
102
Marc Kupietz5a336b62021-11-27 17:51:35 +0100103 result <- if (length(node) > 1 || length(vc) > 1) {
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200104 grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc)
105 purrr::pmap(grid, function(node, vc, ...)
106 collocationAnalysis(kco,
107 node =node,
108 vc = vc,
109 minOccur = minOccur,
110 leftContextSize = leftContextSize,
111 rightContextSize = rightContextSize,
112 topCollocatesLimit = topCollocatesLimit,
113 searchHitsSampleLimit = searchHitsSampleLimit,
114 ignoreCollocateCase = ignoreCollocateCase,
115 withinSpan = withinSpan,
116 exactFrequencies = exactFrequencies,
117 stopwords = stopwords,
Marc Kupietz5a336b62021-11-27 17:51:35 +0100118 addExamples = TRUE,
119 localStopwords = localStopwords,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200120 seed = seed,
121 expand = expand,
122 ...) ) %>%
123 bind_rows()
124 } else {
125 set.seed(seed)
126 candidates <- collocatesQuery(
127 kco,
128 node,
129 vc = vc,
130 minOccur = minOccur,
131 leftContextSize = leftContextSize,
132 rightContextSize = rightContextSize,
133 searchHitsSampleLimit = searchHitsSampleLimit,
134 ignoreCollocateCase = ignoreCollocateCase,
Marc Kupietz5a336b62021-11-27 17:51:35 +0100135 stopwords = append(stopwords, localStopwords),
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200136 ...
137 )
138
139 if (nrow(candidates) > 0) {
140 candidates <- candidates %>%
141 filter(frequency >= minOccur) %>%
142 head(topCollocatesLimit)
143 collocationScoreQuery(
144 kco,
145 node = node,
146 collocate = candidates$word,
147 vc = vc,
148 leftContextSize = leftContextSize,
149 rightContextSize = rightContextSize,
150 observed = if (exactFrequencies) NA else candidates$frequency,
151 ignoreCollocateCase = ignoreCollocateCase,
152 withinSpan = withinSpan,
153 ...
154 ) %>%
155 filter(.$O >= minOccur) %>%
156 dplyr::arrange(dplyr::desc(logDice))
157 } else {
158 tibble()
159 }
160 }
Marc Kupietz419f21f2021-12-07 10:27:30 +0100161 if (maxRecurse > 0 & any(!!as.name(thresholdScore) >= threshold)) {
Marc Kupietz5a336b62021-11-27 17:51:35 +0100162 recurseWith <- result %>%
Marc Kupietz419f21f2021-12-07 10:27:30 +0100163 filter(!!as.name(thresholdScore) >= threshold)
Marc Kupietz5a336b62021-11-27 17:51:35 +0100164 result <- collocationAnalysis(
165 kco,
166 node = paste0("(", buildCollocationQuery(
167 removeWithinSpan(recurseWith$node, withinSpan),
168 recurseWith$collocate,
169 leftContextSize = leftContextSize,
170 rightContextSize = rightContextSize,
171 withinSpan = ""
172 ), ")"),
173 vc = vc,
174 minOccur = minOccur,
175 leftContextSize = leftContextSize,
176 rightContextSize = rightContextSize,
177 withinSpan = withinSpan,
178 maxRecurse = maxRecurse - 1,
179 stopwords = stopwords,
180 localStopwords = recurseWith$collocate,
181 exactFrequencies = exactFrequencies,
182 searchHitsSampleLimit = searchHitsSampleLimit,
183 topCollocatesLimit = topCollocatesLimit,
184 addExamples = FALSE
185 ) %>%
186 bind_rows(result) %>%
187 filter(logDice >= 2) %>%
188 filter(.$O >= minOccur) %>%
189 dplyr::arrange(dplyr::desc(logDice))
190 }
191 if (addExamples && length(result) > 0) {
Marc Kupietz1678c3a2021-12-07 10:24:49 +0100192 result$query <-buildCollocationQuery(
Marc Kupietz5a336b62021-11-27 17:51:35 +0100193 result$node,
194 result$collocate,
195 leftContextSize = leftContextSize,
196 rightContextSize = rightContextSize,
197 withinSpan = ""
198 )
199 result$example <- findExample(
200 kco,
Marc Kupietz1678c3a2021-12-07 10:24:49 +0100201 query = result$query,
Marc Kupietz5a336b62021-11-27 17:51:35 +0100202 vc = result$vc
203 )
204 }
205 result
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200206 }
207)
208
Marc Kupietz76b05592021-12-19 16:26:15 +0100209# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100210removeWithinSpan <- function(query, withinSpan) {
211 if (withinSpan == "") {
212 return(query)
213 }
214 needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan)
215 res <- gsub(needle, '\\1', query)
216 needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan)
217 res <- gsub(needle, '\\1', res)
218 return(res)
219}
220
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200221#' @importFrom magrittr debug_pipe
222#' @importFrom stringr str_match str_split str_detect
223#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
224#'
225snippet2FreqTable <- function(snippet,
226 minOccur = 5,
227 leftContextSize = 5,
228 rightContextSize = 5,
229 ignoreCollocateCase = FALSE,
230 stopwords = c(),
231 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100232 collocateFilterRegex = '^[:alnum:]+-?[:alnum:]*$',
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200233 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
234 verbose = TRUE) {
235 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
236 frequency <- NULL
237
238 if (length(snippet) < 1) {
239 dplyr::tibble(word=c(), frequency=c())
240 } else if (length(snippet) > 1) {
Marc Kupietz8fe0bee2021-12-07 10:25:45 +0100241 log.info(verbose, paste("Joining", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200242 for (s in snippet) {
243 oldTable <- snippet2FreqTable(
244 s,
245 leftContextSize = leftContextSize,
246 rightContextSize = rightContextSize,
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100247 collocateFilterRegex = collocateFilterRegex,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200248 oldTable = oldTable,
249 stopwords = stopwords
250 )
251 }
Marc Kupietzd07bf192021-09-04 20:24:44 +0200252 log.info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200253 oldTable %>%
254 group_by(word) %>%
255 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>%
256 summarise(frequency=sum(frequency), .groups = "drop") %>%
257 arrange(desc(frequency))
258 } else {
259 stopwordsTable <- dplyr::tibble(word=stopwords)
260 match <-
261 str_match(
262 snippet,
263 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
264 )
265
266 left <- if(leftContextSize > 0)
267 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
268 else
269 ""
270# cat(paste("left:", left, "\n", collapse=" "))
271
272 right <- if(rightContextSize > 0)
273 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
274 else
275 ""
276# cat(paste("right:", right, "\n", collapse=" "))
277
278 if(is.na(left) || is.na(right) || length(left) + length(right) == 0) {
279 oldTable
280 } else {
281 table(c(left, right)) %>%
282 dplyr::as_tibble(.name_repair = "minimal") %>%
283 dplyr::rename(word = 1, frequency = 2) %>%
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100284 dplyr::filter(str_detect(word, collocateFilterRegex)) %>%
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200285 dplyr::anti_join(stopwordsTable, by="word") %>%
286 dplyr::bind_rows(oldTable)
287 }
288 }
289}
290
291#' Preliminary synsemantic stopwords function
292#'
293#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +0200294#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200295#'
296#' Preliminary synsemantic stopwords function to be used in collocation analysis.
297#'
298#' @details
299#' Currently only suitable for German. See stopwords package for other languages.
300#'
301#' @param ... future arguments for language detection
302#'
303#' @family collocation analysis functions
304#' @return Vector of synsemantic stopwords.
305#' @export
306synsemanticStopwords <- function(...) {
307 res <- c(
308 "der",
309 "die",
310 "und",
311 "in",
312 "den",
313 "von",
314 "mit",
315 "das",
316 "zu",
317 "im",
318 "ist",
319 "auf",
320 "sich",
321 "Die",
322 "des",
323 "dem",
324 "nicht",
325 "ein",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100326 "Ein",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200327 "eine",
Marc Kupietzd2c08cb2021-12-07 10:28:21 +0100328 "Eine",
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200329 "es",
330 "auch",
331 "an",
332 "als",
333 "am",
334 "aus",
335 "Der",
336 "bei",
337 "er",
338 "dass",
339 "sie",
340 "nach",
341 "um",
342 "Das",
343 "zum",
344 "noch",
345 "war",
346 "einen",
347 "einer",
348 "wie",
349 "einem",
350 "vor",
351 "bis",
352 "\u00fcber",
353 "so",
354 "aber",
355 "Eine",
356 "diese",
357 "Diese",
358 "oder"
359 )
360 return(res)
361}
362
Marc Kupietz5a336b62021-11-27 17:51:35 +0100363
Marc Kupietz76b05592021-12-19 16:26:15 +0100364# #' @export
Marc Kupietz5a336b62021-11-27 17:51:35 +0100365findExample <-
366 function(kco,
367 query,
368 vc = "",
369 matchOnly = TRUE) {
370 out <- character(length = length(query))
371
372 if (length(vc) < length(query))
373 vc <- rep(vc, length(query))
374
375 for (i in seq_along(query)) {
376 q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE)
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100377 if (q@totalResults > 0) {
378 q <- fetchNext(q, maxFetch=50, randomizePageOrder=F)
379 example <- as.character((q@collectedMatches)$snippet[1])
380 out[i] <- if(matchOnly) {
Marc Kupietz5a336b62021-11-27 17:51:35 +0100381 gsub('.*<mark>(.+)</mark>.*', '\\1', example)
382 } else {
383 stringr::str_replace(example, '<[^>]*>', '')
384 }
Marc Kupietzb811ffb2021-12-07 10:34:10 +0100385 } else {
386 out[i] = ""
387 }
Marc Kupietz5a336b62021-11-27 17:51:35 +0100388 }
389 out
390 }
391
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200392collocatesQuery <-
393 function(kco,
394 query,
395 vc = "",
396 minOccur = 5,
397 leftContextSize = 5,
398 rightContextSize = 5,
399 searchHitsSampleLimit = 20000,
400 ignoreCollocateCase = FALSE,
401 stopwords = c(),
402 ...) {
403 frequency <- NULL
404 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
405 if(q@totalResults == 0) {
406 tibble(word=c(), frequency=c())
407 } else {
408 q <- fetchNext(q, maxFetch=searchHitsSampleLimit, randomizePageOrder=TRUE)
409 snippet2FreqTable((q@collectedMatches)$snippet,
410 minOccur = minOccur,
411 leftContextSize = leftContextSize,
412 rightContextSize = rightContextSize,
413 ignoreCollocateCase = ignoreCollocateCase,
414 stopwords = stopwords,
Marc Kupietz47d0d2b2021-12-19 16:38:52 +0100415 ...,
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200416 verbose = kco@verbose) %>%
417 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) %>%
418 filter(frequency >= minOccur)
419 }
420 }