blob: 2443f8fe10459fc7a07d80e4d41234d0f3dffa27 [file] [log] [blame]
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis") )
2
3#' Collocation analysis
4#'
5#' @aliases collocationAnalysis
6#'
7#' @description
8#' \Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}
9#'
10#' Performs a collocation analysis for the given node (or query)
11#' in the given virtual corpus.
12#'
13#' @details
14#' The collocation analysis is currently implemented on the client side, as some of the
15#' functionality is not yet provided by the KorAP backend. Mainly for this reason
16#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
17#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
18#' and look for expression-internal collocates using the focus function (see examples and demo).
19#'
20#' To increase speed at the cost of accuracy and possible false negatives,
21#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
22#'
23#' Note that currently not the tokenization provided by the backend, i.e. the corpus itself, is used, but a tinkered one.
24#' This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
25#' user interface.
26#'
27#' @family collocation analysis functions
28#'
29#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. x -> [tt/l=x]
30#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
31#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
32#' @param searchHitsSampleLimit limit the size of the search hits sample
33#' @param stopwords vector of stopwords not to be considered as collocates
34#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
35#' @param seed seed for random page collecting order
36#' @param expand if TRUE, \code{node} and \code{vc} parameters are expanded to all of their combinations
37#' @param ... more arguments will be passed to \code{\link{collocationScoreQuery}}
38#' @inheritParams collocationScoreQuery,KorAPConnection-method
39#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
40#'
41#' @importFrom stringr str_match str_split str_detect
42#' @importFrom dplyr anti_join arrange desc slice_head bind_rows
43#' @importFrom purrr pmap
44#' @importFrom tidyr expand_grid
45#'
46#' @examples
47#' \donttest{
48#' # Find top collocates of "Packung" inside and outside the sports domain.
49#' new("KorAPConnection", verbose = TRUE) %>%
50#' collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"),
51#' leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) %>%
52#' dplyr::filter(logDice >= 5)
53#' }
54#'
55#' \donttest{
56#' # Identify the most prominent light verb construction with "in ... setzen".
57#' # Note that, currently, the use of focus function disallows exactFrequencies.
58#' new("KorAPConnection", verbose = TRUE) %>%
59#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
60#' leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20)
61#' }
62#'
63#' @export
64setMethod("collocationAnalysis", "KorAPConnection",
65 function(kco,
66 node,
67 vc = "",
68 lemmatizeNodeQuery = FALSE,
69 minOccur = 5,
70 leftContextSize = 5,
71 rightContextSize = 5,
72 topCollocatesLimit = 200,
73 searchHitsSampleLimit = 20000,
74 ignoreCollocateCase = FALSE,
75 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
76 exactFrequencies = TRUE,
77 stopwords = RKorAPClient::synsemanticStopwords(),
78 seed = 7,
79 expand = length(vc) != length(node),
80 ...) {
81 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
82 word <- frequency <- NULL
83
84 if(!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan)>0 )) {
85 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
86 }
87
88 if (lemmatizeNodeQuery) {
89 node <- lemmatizeWordQuery(node)
90 }
91
92 if (length(node) > 1 || length(vc) > 1) {
93 grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc)
94 purrr::pmap(grid, function(node, vc, ...)
95 collocationAnalysis(kco,
96 node =node,
97 vc = vc,
98 minOccur = minOccur,
99 leftContextSize = leftContextSize,
100 rightContextSize = rightContextSize,
101 topCollocatesLimit = topCollocatesLimit,
102 searchHitsSampleLimit = searchHitsSampleLimit,
103 ignoreCollocateCase = ignoreCollocateCase,
104 withinSpan = withinSpan,
105 exactFrequencies = exactFrequencies,
106 stopwords = stopwords,
107 seed = seed,
108 expand = expand,
109 ...) ) %>%
110 bind_rows()
111 } else {
112 set.seed(seed)
113 candidates <- collocatesQuery(
114 kco,
115 node,
116 vc = vc,
117 minOccur = minOccur,
118 leftContextSize = leftContextSize,
119 rightContextSize = rightContextSize,
120 searchHitsSampleLimit = searchHitsSampleLimit,
121 ignoreCollocateCase = ignoreCollocateCase,
122 stopwords = stopwords,
123 ...
124 )
125
126 if (nrow(candidates) > 0) {
127 candidates <- candidates %>%
128 filter(frequency >= minOccur) %>%
129 head(topCollocatesLimit)
130 collocationScoreQuery(
131 kco,
132 node = node,
133 collocate = candidates$word,
134 vc = vc,
135 leftContextSize = leftContextSize,
136 rightContextSize = rightContextSize,
137 observed = if (exactFrequencies) NA else candidates$frequency,
138 ignoreCollocateCase = ignoreCollocateCase,
139 withinSpan = withinSpan,
140 ...
141 ) %>%
142 filter(.$O >= minOccur) %>%
143 dplyr::arrange(dplyr::desc(logDice))
144 } else {
145 tibble()
146 }
147 }
148 }
149)
150
151#' @importFrom magrittr debug_pipe
152#' @importFrom stringr str_match str_split str_detect
153#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
154#'
155snippet2FreqTable <- function(snippet,
156 minOccur = 5,
157 leftContextSize = 5,
158 rightContextSize = 5,
159 ignoreCollocateCase = FALSE,
160 stopwords = c(),
161 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
162 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
163 verbose = TRUE) {
164 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
165 frequency <- NULL
166
167 if (length(snippet) < 1) {
168 dplyr::tibble(word=c(), frequency=c())
169 } else if (length(snippet) > 1) {
Marc Kupietzd07bf192021-09-04 20:24:44 +0200170 log.info(verbose, paste("Joinging", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200171 for (s in snippet) {
172 oldTable <- snippet2FreqTable(
173 s,
174 leftContextSize = leftContextSize,
175 rightContextSize = rightContextSize,
176 oldTable = oldTable,
177 stopwords = stopwords
178 )
179 }
Marc Kupietzd07bf192021-09-04 20:24:44 +0200180 log.info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200181 oldTable %>%
182 group_by(word) %>%
183 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>%
184 summarise(frequency=sum(frequency), .groups = "drop") %>%
185 arrange(desc(frequency))
186 } else {
187 stopwordsTable <- dplyr::tibble(word=stopwords)
188 match <-
189 str_match(
190 snippet,
191 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
192 )
193
194 left <- if(leftContextSize > 0)
195 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
196 else
197 ""
198# cat(paste("left:", left, "\n", collapse=" "))
199
200 right <- if(rightContextSize > 0)
201 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
202 else
203 ""
204# cat(paste("right:", right, "\n", collapse=" "))
205
206 if(is.na(left) || is.na(right) || length(left) + length(right) == 0) {
207 oldTable
208 } else {
209 table(c(left, right)) %>%
210 dplyr::as_tibble(.name_repair = "minimal") %>%
211 dplyr::rename(word = 1, frequency = 2) %>%
212 dplyr::filter(str_detect(word, '^[:alnum:]+-?[:alnum:]*$')) %>%
213 dplyr::anti_join(stopwordsTable, by="word") %>%
214 dplyr::bind_rows(oldTable)
215 }
216 }
217}
218
219#' Preliminary synsemantic stopwords function
220#'
221#' @description
222#' \Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}
223#'
224#' Preliminary synsemantic stopwords function to be used in collocation analysis.
225#'
226#' @details
227#' Currently only suitable for German. See stopwords package for other languages.
228#'
229#' @param ... future arguments for language detection
230#'
231#' @family collocation analysis functions
232#' @return Vector of synsemantic stopwords.
233#' @export
234synsemanticStopwords <- function(...) {
235 res <- c(
236 "der",
237 "die",
238 "und",
239 "in",
240 "den",
241 "von",
242 "mit",
243 "das",
244 "zu",
245 "im",
246 "ist",
247 "auf",
248 "sich",
249 "Die",
250 "des",
251 "dem",
252 "nicht",
253 "ein",
254 "eine",
255 "es",
256 "auch",
257 "an",
258 "als",
259 "am",
260 "aus",
261 "Der",
262 "bei",
263 "er",
264 "dass",
265 "sie",
266 "nach",
267 "um",
268 "Das",
269 "zum",
270 "noch",
271 "war",
272 "einen",
273 "einer",
274 "wie",
275 "einem",
276 "vor",
277 "bis",
278 "\u00fcber",
279 "so",
280 "aber",
281 "Eine",
282 "diese",
283 "Diese",
284 "oder"
285 )
286 return(res)
287}
288
289collocatesQuery <-
290 function(kco,
291 query,
292 vc = "",
293 minOccur = 5,
294 leftContextSize = 5,
295 rightContextSize = 5,
296 searchHitsSampleLimit = 20000,
297 ignoreCollocateCase = FALSE,
298 stopwords = c(),
299 ...) {
300 frequency <- NULL
301 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
302 if(q@totalResults == 0) {
303 tibble(word=c(), frequency=c())
304 } else {
305 q <- fetchNext(q, maxFetch=searchHitsSampleLimit, randomizePageOrder=TRUE)
306 snippet2FreqTable((q@collectedMatches)$snippet,
307 minOccur = minOccur,
308 leftContextSize = leftContextSize,
309 rightContextSize = rightContextSize,
310 ignoreCollocateCase = ignoreCollocateCase,
311 stopwords = stopwords,
312 verbose = kco@verbose) %>%
313 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) %>%
314 filter(frequency >= minOccur)
315 }
316 }
317
318