blob: bf962f68ff91edd4a2748e4575ad8828fb822f3b [file] [log] [blame]
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis") )
2
3#' Collocation analysis
4#'
5#' @aliases collocationAnalysis
6#'
7#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +02008#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +02009#'
10#' Performs a collocation analysis for the given node (or query)
11#' in the given virtual corpus.
12#'
13#' @details
14#' The collocation analysis is currently implemented on the client side, as some of the
15#' functionality is not yet provided by the KorAP backend. Mainly for this reason
16#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
17#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
18#' and look for expression-internal collocates using the focus function (see examples and demo).
19#'
20#' To increase speed at the cost of accuracy and possible false negatives,
21#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
22#'
23#' Note that currently not the tokenization provided by the backend, i.e. the corpus itself, is used, but a tinkered one.
24#' This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
25#' user interface.
26#'
27#' @family collocation analysis functions
28#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020029#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]`
Marc Kupietzdbd431a2021-08-29 12:17:45 +020030#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
31#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
32#' @param searchHitsSampleLimit limit the size of the search hits sample
33#' @param stopwords vector of stopwords not to be considered as collocates
34#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
35#' @param seed seed for random page collecting order
Marc Kupietz67edcb52021-09-20 21:54:24 +020036#' @param expand if TRUE, `node` and `vc` parameters are expanded to all of their combinations
37#' @param ... more arguments will be passed to [collocationScoreQuery()]
Marc Kupietzdbd431a2021-08-29 12:17:45 +020038#' @inheritParams collocationScoreQuery,KorAPConnection-method
39#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
40#'
41#' @importFrom stringr str_match str_split str_detect
42#' @importFrom dplyr anti_join arrange desc slice_head bind_rows
43#' @importFrom purrr pmap
44#' @importFrom tidyr expand_grid
45#'
46#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020047#' \dontrun{
48#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020049#' # Find top collocates of "Packung" inside and outside the sports domain.
50#' new("KorAPConnection", verbose = TRUE) %>%
51#' collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"),
52#' leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) %>%
53#' dplyr::filter(logDice >= 5)
54#' }
55#'
Marc Kupietz6ae76052021-09-21 10:34:00 +020056#' \dontrun{
57#'
Marc Kupietzdbd431a2021-08-29 12:17:45 +020058#' # Identify the most prominent light verb construction with "in ... setzen".
59#' # Note that, currently, the use of focus function disallows exactFrequencies.
60#' new("KorAPConnection", verbose = TRUE) %>%
61#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
62#' leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20)
63#' }
64#'
65#' @export
66setMethod("collocationAnalysis", "KorAPConnection",
67 function(kco,
68 node,
69 vc = "",
70 lemmatizeNodeQuery = FALSE,
71 minOccur = 5,
72 leftContextSize = 5,
73 rightContextSize = 5,
74 topCollocatesLimit = 200,
75 searchHitsSampleLimit = 20000,
76 ignoreCollocateCase = FALSE,
77 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
78 exactFrequencies = TRUE,
79 stopwords = RKorAPClient::synsemanticStopwords(),
80 seed = 7,
81 expand = length(vc) != length(node),
82 ...) {
83 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
84 word <- frequency <- NULL
85
86 if(!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan)>0 )) {
87 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
88 }
89
Marc Kupietz581a29b2021-09-04 20:51:04 +020090 warnIfNoAccessToken(kco)
91
Marc Kupietzdbd431a2021-08-29 12:17:45 +020092 if (lemmatizeNodeQuery) {
93 node <- lemmatizeWordQuery(node)
94 }
95
96 if (length(node) > 1 || length(vc) > 1) {
97 grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc)
98 purrr::pmap(grid, function(node, vc, ...)
99 collocationAnalysis(kco,
100 node =node,
101 vc = vc,
102 minOccur = minOccur,
103 leftContextSize = leftContextSize,
104 rightContextSize = rightContextSize,
105 topCollocatesLimit = topCollocatesLimit,
106 searchHitsSampleLimit = searchHitsSampleLimit,
107 ignoreCollocateCase = ignoreCollocateCase,
108 withinSpan = withinSpan,
109 exactFrequencies = exactFrequencies,
110 stopwords = stopwords,
111 seed = seed,
112 expand = expand,
113 ...) ) %>%
114 bind_rows()
115 } else {
116 set.seed(seed)
117 candidates <- collocatesQuery(
118 kco,
119 node,
120 vc = vc,
121 minOccur = minOccur,
122 leftContextSize = leftContextSize,
123 rightContextSize = rightContextSize,
124 searchHitsSampleLimit = searchHitsSampleLimit,
125 ignoreCollocateCase = ignoreCollocateCase,
126 stopwords = stopwords,
127 ...
128 )
129
130 if (nrow(candidates) > 0) {
131 candidates <- candidates %>%
132 filter(frequency >= minOccur) %>%
133 head(topCollocatesLimit)
134 collocationScoreQuery(
135 kco,
136 node = node,
137 collocate = candidates$word,
138 vc = vc,
139 leftContextSize = leftContextSize,
140 rightContextSize = rightContextSize,
141 observed = if (exactFrequencies) NA else candidates$frequency,
142 ignoreCollocateCase = ignoreCollocateCase,
143 withinSpan = withinSpan,
144 ...
145 ) %>%
146 filter(.$O >= minOccur) %>%
147 dplyr::arrange(dplyr::desc(logDice))
148 } else {
149 tibble()
150 }
151 }
152 }
153)
154
155#' @importFrom magrittr debug_pipe
156#' @importFrom stringr str_match str_split str_detect
157#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
158#'
159snippet2FreqTable <- function(snippet,
160 minOccur = 5,
161 leftContextSize = 5,
162 rightContextSize = 5,
163 ignoreCollocateCase = FALSE,
164 stopwords = c(),
165 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
166 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
167 verbose = TRUE) {
168 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
169 frequency <- NULL
170
171 if (length(snippet) < 1) {
172 dplyr::tibble(word=c(), frequency=c())
173 } else if (length(snippet) > 1) {
Marc Kupietzd07bf192021-09-04 20:24:44 +0200174 log.info(verbose, paste("Joinging", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200175 for (s in snippet) {
176 oldTable <- snippet2FreqTable(
177 s,
178 leftContextSize = leftContextSize,
179 rightContextSize = rightContextSize,
180 oldTable = oldTable,
181 stopwords = stopwords
182 )
183 }
Marc Kupietzd07bf192021-09-04 20:24:44 +0200184 log.info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200185 oldTable %>%
186 group_by(word) %>%
187 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>%
188 summarise(frequency=sum(frequency), .groups = "drop") %>%
189 arrange(desc(frequency))
190 } else {
191 stopwordsTable <- dplyr::tibble(word=stopwords)
192 match <-
193 str_match(
194 snippet,
195 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
196 )
197
198 left <- if(leftContextSize > 0)
199 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
200 else
201 ""
202# cat(paste("left:", left, "\n", collapse=" "))
203
204 right <- if(rightContextSize > 0)
205 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
206 else
207 ""
208# cat(paste("right:", right, "\n", collapse=" "))
209
210 if(is.na(left) || is.na(right) || length(left) + length(right) == 0) {
211 oldTable
212 } else {
213 table(c(left, right)) %>%
214 dplyr::as_tibble(.name_repair = "minimal") %>%
215 dplyr::rename(word = 1, frequency = 2) %>%
216 dplyr::filter(str_detect(word, '^[:alnum:]+-?[:alnum:]*$')) %>%
217 dplyr::anti_join(stopwordsTable, by="word") %>%
218 dplyr::bind_rows(oldTable)
219 }
220 }
221}
222
223#' Preliminary synsemantic stopwords function
224#'
225#' @description
Marc Kupietz67edcb52021-09-20 21:54:24 +0200226#' `r lifecycle::badge("experimental")`
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200227#'
228#' Preliminary synsemantic stopwords function to be used in collocation analysis.
229#'
230#' @details
231#' Currently only suitable for German. See stopwords package for other languages.
232#'
233#' @param ... future arguments for language detection
234#'
235#' @family collocation analysis functions
236#' @return Vector of synsemantic stopwords.
237#' @export
238synsemanticStopwords <- function(...) {
239 res <- c(
240 "der",
241 "die",
242 "und",
243 "in",
244 "den",
245 "von",
246 "mit",
247 "das",
248 "zu",
249 "im",
250 "ist",
251 "auf",
252 "sich",
253 "Die",
254 "des",
255 "dem",
256 "nicht",
257 "ein",
258 "eine",
259 "es",
260 "auch",
261 "an",
262 "als",
263 "am",
264 "aus",
265 "Der",
266 "bei",
267 "er",
268 "dass",
269 "sie",
270 "nach",
271 "um",
272 "Das",
273 "zum",
274 "noch",
275 "war",
276 "einen",
277 "einer",
278 "wie",
279 "einem",
280 "vor",
281 "bis",
282 "\u00fcber",
283 "so",
284 "aber",
285 "Eine",
286 "diese",
287 "Diese",
288 "oder"
289 )
290 return(res)
291}
292
293collocatesQuery <-
294 function(kco,
295 query,
296 vc = "",
297 minOccur = 5,
298 leftContextSize = 5,
299 rightContextSize = 5,
300 searchHitsSampleLimit = 20000,
301 ignoreCollocateCase = FALSE,
302 stopwords = c(),
303 ...) {
304 frequency <- NULL
305 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
306 if(q@totalResults == 0) {
307 tibble(word=c(), frequency=c())
308 } else {
309 q <- fetchNext(q, maxFetch=searchHitsSampleLimit, randomizePageOrder=TRUE)
310 snippet2FreqTable((q@collectedMatches)$snippet,
311 minOccur = minOccur,
312 leftContextSize = leftContextSize,
313 rightContextSize = rightContextSize,
314 ignoreCollocateCase = ignoreCollocateCase,
315 stopwords = stopwords,
316 verbose = kco@verbose) %>%
317 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) %>%
318 filter(frequency >= minOccur)
319 }
320 }