setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis"))

#' Collocation analysis
#'
#' @aliases collocationAnalysis
#'
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' Performs a collocation analysis for the given node (or query)
#' in the given virtual corpus.
#'
#' @details
#' The collocation analysis is currently implemented on the client side, as some of the
#' functionality is not yet provided by the KorAP backend. Mainly for this reason
#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
#' and look for expression-internal collocates using the focus function (see examples and demo).
#'
#' To increase speed at the cost of accuracy and possible false negatives,
#' you can decrease `searchHitsSampleLimit` and/or `topCollocatesLimit` and/or set
#' `exactFrequencies` to `FALSE` (see the last example below).
#'
#' Note that the analysis currently does not use the tokenization provided by the backend,
#' i.e. the one underlying the corpus itself, but a makeshift one. This can also lead to
#' false negatives and to frequencies that differ from the corresponding ones acquired
#' via the web user interface.
#'
#' @family collocation analysis functions
#'
#' @param lemmatizeNodeQuery if `TRUE`, the node query will be lemmatized, i.e. `x -> [tt/l=x]`
#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
#' @param searchHitsSampleLimit limit the size of the search hits sample
#' @param stopwords vector of stopwords not to be considered as collocates
#' @param exactFrequencies if `FALSE`, extrapolate observed co-occurrence frequencies from frequencies in the search hits sample, otherwise retrieve exact co-occurrence frequencies
#' @param seed seed for random page collecting order
#' @param expand if `TRUE`, `node` and `vc` parameters are expanded to all of their combinations
#' @param ... further arguments will be passed to [collocationScoreQuery()]
#' @inheritParams collocationScoreQuery,KorAPConnection-method
#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
#'
#' @importFrom stringr str_match str_split str_detect
#' @importFrom dplyr anti_join arrange desc slice_head bind_rows
#' @importFrom purrr pmap
#' @importFrom tidyr expand_grid
#'
#' @examples
#' \donttest{
#' # Find top collocates of "Packung" inside and outside the sports domain.
#' new("KorAPConnection", verbose = TRUE) %>%
#'   collocationAnalysis("Packung", vc = c("textClass=sport", "textClass!=sport"),
#'     leftContextSize = 1, rightContextSize = 1, topCollocatesLimit = 20) %>%
#'   dplyr::filter(logDice >= 5)
#' }
#'
#' \donttest{
#' # Identify the most prominent light verb construction with "in ... setzen".
#' # Note that, currently, the use of the focus function requires exactFrequencies = FALSE.
#' new("KorAPConnection", verbose = TRUE) %>%
#'   collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
#'     leftContextSize = 1, rightContextSize = 0, exactFrequencies = FALSE,
#'     topCollocatesLimit = 20)
#' }
#'
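#' \donttest{
#' # A minimal additional sketch (not from the original examples; "Tee" is an
#' # arbitrary node query): lemmatize the node query ("Tee" -> [tt/l=Tee]) and
#' # trade accuracy for speed with a smaller hits sample and collocate limit,
#' # as described in Details.
#' new("KorAPConnection", verbose = TRUE) %>%
#'   collocationAnalysis("Tee", lemmatizeNodeQuery = TRUE,
#'     searchHitsSampleLimit = 1000, topCollocatesLimit = 10)
#' }
#'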
#' @export
setMethod("collocationAnalysis", "KorAPConnection",
  function(kco,
           node,
           vc = "",
           lemmatizeNodeQuery = FALSE,
           minOccur = 5,
           leftContextSize = 5,
           rightContextSize = 5,
           topCollocatesLimit = 200,
           searchHitsSampleLimit = 20000,
           ignoreCollocateCase = FALSE,
           withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
           exactFrequencies = TRUE,
           stopwords = RKorAPClient::synsemanticStopwords(),
           seed = 7,
           expand = length(vc) != length(node),
           ...) {
    # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
    word <- frequency <- NULL

    # Check is.null before is.na: is.na(NULL) has length zero and would break `&&`
    if (!exactFrequencies && (!is.null(withinSpan) && !is.na(withinSpan) && nchar(withinSpan) > 0)) {
      stop(sprintf("A non-empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
    }

    warnIfNoAccessToken(kco)

    if (lemmatizeNodeQuery) {
      node <- lemmatizeWordQuery(node)
    }

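    # When several nodes and/or virtual corpora are given, run one analysis per
    # combination (full cross product if expand = TRUE, pairwise otherwise) and
    # bind the individual result tibbles into one.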
    if (length(node) > 1 || length(vc) > 1) {
      grid <- if (expand) expand_grid(node = node, vc = vc) else tibble(node = node, vc = vc)
      purrr::pmap(grid, function(node, vc, ...)
        collocationAnalysis(kco,
                            node = node,
                            vc = vc,
                            minOccur = minOccur,
                            leftContextSize = leftContextSize,
                            rightContextSize = rightContextSize,
                            topCollocatesLimit = topCollocatesLimit,
                            searchHitsSampleLimit = searchHitsSampleLimit,
                            ignoreCollocateCase = ignoreCollocateCase,
                            withinSpan = withinSpan,
                            exactFrequencies = exactFrequencies,
                            stopwords = stopwords,
                            seed = seed,
                            expand = expand,
                            ...)) %>%
        bind_rows()
    } else {
      set.seed(seed)
      candidates <- collocatesQuery(
        kco,
        node,
        vc = vc,
        minOccur = minOccur,
        leftContextSize = leftContextSize,
        rightContextSize = rightContextSize,
        searchHitsSampleLimit = searchHitsSampleLimit,
        ignoreCollocateCase = ignoreCollocateCase,
        stopwords = stopwords,
        ...
      )

      if (nrow(candidates) > 0) {
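        # Restrict scoring to sufficiently frequent candidates and to the
        # topCollocatesLimit most frequent collocates found in the sample.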
        candidates <- candidates %>%
          filter(frequency >= minOccur) %>%
          head(topCollocatesLimit)
        collocationScoreQuery(
          kco,
          node = node,
          collocate = candidates$word,
          vc = vc,
          leftContextSize = leftContextSize,
          rightContextSize = rightContextSize,
          observed = if (exactFrequencies) NA else candidates$frequency,
          ignoreCollocateCase = ignoreCollocateCase,
          withinSpan = withinSpan,
          ...
        ) %>%
          filter(.$O >= minOccur) %>%
          dplyr::arrange(dplyr::desc(logDice))
      } else {
        tibble()
      }
    }
  }
)

#' @importFrom magrittr debug_pipe
#' @importFrom stringr str_match str_split str_detect
#' @importFrom dplyr as_tibble tibble rename filter anti_join bind_rows case_when
#'
snippet2FreqTable <- function(snippet,
                              minOccur = 5,
                              leftContextSize = 5,
                              rightContextSize = 5,
                              ignoreCollocateCase = FALSE,
                              stopwords = c(),
                              tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
                              oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
                              verbose = TRUE) {
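  # Internal helper: builds a collocate frequency table from KWIC snippet HTML.
  # A vector of snippets is processed recursively and aggregated; a single
  # snippet is split into left/right context tokens using tokenizeRegex.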
  word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
  frequency <- NULL

  if (length(snippet) < 1) {
    dplyr::tibble(word = c(), frequency = c())
  } else if (length(snippet) > 1) {
    log.info(verbose, paste("Joining", length(snippet), "kwics\n"))
    for (s in snippet) {
      oldTable <- snippet2FreqTable(
        s,
        leftContextSize = leftContextSize,
        rightContextSize = rightContextSize,
        oldTable = oldTable,
        stopwords = stopwords
      )
    }
    log.info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
    oldTable %>%
      # Lowercase before grouping, so that case variants are actually merged
      mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>%
      group_by(word) %>%
      summarise(frequency = sum(frequency), .groups = "drop") %>%
      arrange(desc(frequency))
  } else {
    stopwordsTable <- dplyr::tibble(word = stopwords)
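    # Columns 3 and 4 of the match matrix hold the plain text of the left and
    # right contexts captured from the snippet HTML.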
    match <-
      str_match(
        snippet,
        '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
      )

    left <- if (leftContextSize > 0)
      tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
    else
      ""
    # cat(paste("left:", left, "\n", collapse = " "))

    right <- if (rightContextSize > 0)
      head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
    else
      ""
    # cat(paste("right:", right, "\n", collapse = " "))

    if (anyNA(left) || anyNA(right) || length(left) + length(right) == 0) {
      oldTable
    } else {
      table(c(left, right)) %>%
        dplyr::as_tibble(.name_repair = "minimal") %>%
        dplyr::rename(word = 1, frequency = 2) %>%
        dplyr::filter(str_detect(word, '^[:alnum:]+-?[:alnum:]*$')) %>%
        dplyr::anti_join(stopwordsTable, by = "word") %>%
        dplyr::bind_rows(oldTable)
    }
  }
}
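
# Hypothetical usage sketch for the helper above (the snippet string is made up,
# but follows the KWIC markup matched by the regular expression in
# snippet2FreqTable); kept as a comment because the function is internal:
#
# snippet2FreqTable(
#   paste0('<span class="context-left">in der </span>',
#          '<span class="match"><mark>Packung</mark></span>',
#          '<span class="context-right"> enthalten</span>'),
#   leftContextSize = 1, rightContextSize = 1, stopwords = c("in")
# )
# # -> word/frequency tibble with one row each for "der" and "enthalten"
# #    (plus the NA seed row contributed by the default oldTable argument)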

#' Preliminary synsemantic stopwords function
#'
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' Preliminary synsemantic stopwords function to be used in collocation analysis.
#'
#' @details
#' Currently only suitable for German. See the stopwords package for other languages.
#'
#' @param ... future arguments for language detection
#'
#' @family collocation analysis functions
#' @return Vector of synsemantic stopwords.
#' @export
synsemanticStopwords <- function(...) {
  res <- c(
    "der",
    "die",
    "und",
    "in",
    "den",
    "von",
    "mit",
    "das",
    "zu",
    "im",
    "ist",
    "auf",
    "sich",
    "Die",
    "des",
    "dem",
    "nicht",
    "ein",
    "eine",
    "es",
    "auch",
    "an",
    "als",
    "am",
    "aus",
    "Der",
    "bei",
    "er",
    "dass",
    "sie",
    "nach",
    "um",
    "Das",
    "zum",
    "noch",
    "war",
    "einen",
    "einer",
    "wie",
    "einem",
    "vor",
    "bis",
    "\u00fcber",
    "so",
    "aber",
    "Eine",
    "diese",
    "Diese",
    "oder"
  )
  return(res)
}
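
# Usage sketch (hypothetical): extend the default German stopword list with
# additional words before passing it to collocationAnalysis():
#
# myStopwords <- c(RKorAPClient::synsemanticStopwords(), "Jahr", "Prozent")
# new("KorAPConnection") %>%
#   collocationAnalysis("Packung", stopwords = myStopwords)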

collocatesQuery <-
  function(kco,
           query,
           vc = "",
           minOccur = 5,
           leftContextSize = 5,
           rightContextSize = 5,
           searchHitsSampleLimit = 20000,
           ignoreCollocateCase = FALSE,
           stopwords = c(),
           ...) {
    frequency <- NULL
    q <- corpusQuery(kco, query, vc, metadataOnly = FALSE, ...)
    if (q@totalResults == 0) {
      tibble(word = c(), frequency = c())
    } else {
      q <- fetchNext(q, maxFetch = searchHitsSampleLimit, randomizePageOrder = TRUE)
      snippet2FreqTable((q@collectedMatches)$snippet,
                        minOccur = minOccur,
                        leftContextSize = leftContextSize,
                        rightContextSize = rightContextSize,
                        ignoreCollocateCase = ignoreCollocateCase,
                        stopwords = stopwords,
                        verbose = kco@verbose) %>%
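        # Extrapolate sample frequencies to the full result set; if the sample
        # covers all hits, the scaling factor is 1 and frequencies stay exact.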
        mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) %>%
        filter(frequency >= minOccur)
    }
  }