blob: f01c7033c206e6b53b2989715366fc85dad48759 [file] [log] [blame]
Marc Kupietzdbd431a2021-08-29 12:17:45 +02001setGeneric("collocationAnalysis", function(kco, ...) standardGeneric("collocationAnalysis") )
2
3#' Collocation analysis
4#'
5#' @aliases collocationAnalysis
6#'
7#' @description
8#' \Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}
9#'
10#' Performs a collocation analysis for the given node (or query)
11#' in the given virtual corpus.
12#'
13#' @details
14#' The collocation analysis is currently implemented on the client side, as some of the
15#' functionality is not yet provided by the KorAP backend. Mainly for this reason
16#' it is very slow (several minutes, up to hours), but on the other hand very flexible.
17#' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries,
18#' and look for expression-internal collocates using the focus function (see examples and demo).
19#'
20#' To increase speed at the cost of accuracy and possible false negatives,
21#' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE.
22#'
23#' Note that currently not the tokenization provided by the backend, i.e. the corpus itself, is used, but a tinkered one.
24#' This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web
25#' user interface.
26#'
27#' @family collocation analysis functions
28#'
29#' @param lemmatizeNodeQuery if TRUE, node query will be lemmatized, i.e. x -> [tt/l=x]
30#' @param minOccur minimum absolute number of observed co-occurrences to consider a collocate candidate
31#' @param topCollocatesLimit limit analysis to the n most frequent collocates in the search hits sample
32#' @param searchHitsSampleLimit limit the size of the search hits sample
33#' @param stopwords vector of stopwords not to be considered as collocates
34#' @param exactFrequencies if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies
35#' @param seed seed for random page collecting order
36#' @param expand if TRUE, \code{node} and \code{vc} parameters are expanded to all of their combinations
37#' @param ... more arguments will be passed to \code{\link{collocationScoreQuery}}
38#' @inheritParams collocationScoreQuery,KorAPConnection-method
39#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
40#'
41#' @importFrom stringr str_match str_split str_detect
42#' @importFrom dplyr anti_join arrange desc slice_head bind_rows
43#' @importFrom purrr pmap
44#' @importFrom tidyr expand_grid
Marc Kupietz427d9ed2021-09-04 21:17:53 +020045#' @importFrom lifecycle badge
Marc Kupietzdbd431a2021-08-29 12:17:45 +020046#'
47#' @examples
48#' \donttest{
49#' # Find top collocates of "Packung" inside and outside the sports domain.
50#' new("KorAPConnection", verbose = TRUE) %>%
51#' collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"),
52#' leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) %>%
53#' dplyr::filter(logDice >= 5)
54#' }
55#'
56#' \donttest{
57#' # Identify the most prominent light verb construction with "in ... setzen".
58#' # Note that, currently, the use of focus function disallows exactFrequencies.
59#' new("KorAPConnection", verbose = TRUE) %>%
60#' collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})",
61#' leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20)
62#' }
63#'
64#' @export
65setMethod("collocationAnalysis", "KorAPConnection",
66 function(kco,
67 node,
68 vc = "",
69 lemmatizeNodeQuery = FALSE,
70 minOccur = 5,
71 leftContextSize = 5,
72 rightContextSize = 5,
73 topCollocatesLimit = 200,
74 searchHitsSampleLimit = 20000,
75 ignoreCollocateCase = FALSE,
76 withinSpan = ifelse(exactFrequencies, "base/s=s", ""),
77 exactFrequencies = TRUE,
78 stopwords = RKorAPClient::synsemanticStopwords(),
79 seed = 7,
80 expand = length(vc) != length(node),
81 ...) {
82 # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
83 word <- frequency <- NULL
84
85 if(!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan)>0 )) {
86 stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE)
87 }
88
Marc Kupietz581a29b2021-09-04 20:51:04 +020089 warnIfNoAccessToken(kco)
90
Marc Kupietzdbd431a2021-08-29 12:17:45 +020091 if (lemmatizeNodeQuery) {
92 node <- lemmatizeWordQuery(node)
93 }
94
95 if (length(node) > 1 || length(vc) > 1) {
96 grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc)
97 purrr::pmap(grid, function(node, vc, ...)
98 collocationAnalysis(kco,
99 node =node,
100 vc = vc,
101 minOccur = minOccur,
102 leftContextSize = leftContextSize,
103 rightContextSize = rightContextSize,
104 topCollocatesLimit = topCollocatesLimit,
105 searchHitsSampleLimit = searchHitsSampleLimit,
106 ignoreCollocateCase = ignoreCollocateCase,
107 withinSpan = withinSpan,
108 exactFrequencies = exactFrequencies,
109 stopwords = stopwords,
110 seed = seed,
111 expand = expand,
112 ...) ) %>%
113 bind_rows()
114 } else {
115 set.seed(seed)
116 candidates <- collocatesQuery(
117 kco,
118 node,
119 vc = vc,
120 minOccur = minOccur,
121 leftContextSize = leftContextSize,
122 rightContextSize = rightContextSize,
123 searchHitsSampleLimit = searchHitsSampleLimit,
124 ignoreCollocateCase = ignoreCollocateCase,
125 stopwords = stopwords,
126 ...
127 )
128
129 if (nrow(candidates) > 0) {
130 candidates <- candidates %>%
131 filter(frequency >= minOccur) %>%
132 head(topCollocatesLimit)
133 collocationScoreQuery(
134 kco,
135 node = node,
136 collocate = candidates$word,
137 vc = vc,
138 leftContextSize = leftContextSize,
139 rightContextSize = rightContextSize,
140 observed = if (exactFrequencies) NA else candidates$frequency,
141 ignoreCollocateCase = ignoreCollocateCase,
142 withinSpan = withinSpan,
143 ...
144 ) %>%
145 filter(.$O >= minOccur) %>%
146 dplyr::arrange(dplyr::desc(logDice))
147 } else {
148 tibble()
149 }
150 }
151 }
152)
153
154#' @importFrom magrittr debug_pipe
155#' @importFrom stringr str_match str_split str_detect
156#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
157#'
158snippet2FreqTable <- function(snippet,
159 minOccur = 5,
160 leftContextSize = 5,
161 rightContextSize = 5,
162 ignoreCollocateCase = FALSE,
163 stopwords = c(),
164 tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|&quot;)",
165 oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
166 verbose = TRUE) {
167 word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
168 frequency <- NULL
169
170 if (length(snippet) < 1) {
171 dplyr::tibble(word=c(), frequency=c())
172 } else if (length(snippet) > 1) {
Marc Kupietzd07bf192021-09-04 20:24:44 +0200173 log.info(verbose, paste("Joinging", length(snippet), "kwics\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200174 for (s in snippet) {
175 oldTable <- snippet2FreqTable(
176 s,
177 leftContextSize = leftContextSize,
178 rightContextSize = rightContextSize,
179 oldTable = oldTable,
180 stopwords = stopwords
181 )
182 }
Marc Kupietzd07bf192021-09-04 20:24:44 +0200183 log.info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
Marc Kupietzdbd431a2021-08-29 12:17:45 +0200184 oldTable %>%
185 group_by(word) %>%
186 mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>%
187 summarise(frequency=sum(frequency), .groups = "drop") %>%
188 arrange(desc(frequency))
189 } else {
190 stopwordsTable <- dplyr::tibble(word=stopwords)
191 match <-
192 str_match(
193 snippet,
194 '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)'
195 )
196
197 left <- if(leftContextSize > 0)
198 tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize)
199 else
200 ""
201# cat(paste("left:", left, "\n", collapse=" "))
202
203 right <- if(rightContextSize > 0)
204 head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize)
205 else
206 ""
207# cat(paste("right:", right, "\n", collapse=" "))
208
209 if(is.na(left) || is.na(right) || length(left) + length(right) == 0) {
210 oldTable
211 } else {
212 table(c(left, right)) %>%
213 dplyr::as_tibble(.name_repair = "minimal") %>%
214 dplyr::rename(word = 1, frequency = 2) %>%
215 dplyr::filter(str_detect(word, '^[:alnum:]+-?[:alnum:]*$')) %>%
216 dplyr::anti_join(stopwordsTable, by="word") %>%
217 dplyr::bind_rows(oldTable)
218 }
219 }
220}
221
222#' Preliminary synsemantic stopwords function
223#'
224#' @description
225#' \Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}
226#'
227#' Preliminary synsemantic stopwords function to be used in collocation analysis.
228#'
229#' @details
230#' Currently only suitable for German. See stopwords package for other languages.
231#'
232#' @param ... future arguments for language detection
233#'
234#' @family collocation analysis functions
235#' @return Vector of synsemantic stopwords.
236#' @export
237synsemanticStopwords <- function(...) {
238 res <- c(
239 "der",
240 "die",
241 "und",
242 "in",
243 "den",
244 "von",
245 "mit",
246 "das",
247 "zu",
248 "im",
249 "ist",
250 "auf",
251 "sich",
252 "Die",
253 "des",
254 "dem",
255 "nicht",
256 "ein",
257 "eine",
258 "es",
259 "auch",
260 "an",
261 "als",
262 "am",
263 "aus",
264 "Der",
265 "bei",
266 "er",
267 "dass",
268 "sie",
269 "nach",
270 "um",
271 "Das",
272 "zum",
273 "noch",
274 "war",
275 "einen",
276 "einer",
277 "wie",
278 "einem",
279 "vor",
280 "bis",
281 "\u00fcber",
282 "so",
283 "aber",
284 "Eine",
285 "diese",
286 "Diese",
287 "oder"
288 )
289 return(res)
290}
291
292collocatesQuery <-
293 function(kco,
294 query,
295 vc = "",
296 minOccur = 5,
297 leftContextSize = 5,
298 rightContextSize = 5,
299 searchHitsSampleLimit = 20000,
300 ignoreCollocateCase = FALSE,
301 stopwords = c(),
302 ...) {
303 frequency <- NULL
304 q <- corpusQuery(kco, query, vc, metadataOnly = F, ...)
305 if(q@totalResults == 0) {
306 tibble(word=c(), frequency=c())
307 } else {
308 q <- fetchNext(q, maxFetch=searchHitsSampleLimit, randomizePageOrder=TRUE)
309 snippet2FreqTable((q@collectedMatches)$snippet,
310 minOccur = minOccur,
311 leftContextSize = leftContextSize,
312 rightContextSize = rightContextSize,
313 ignoreCollocateCase = ignoreCollocateCase,
314 stopwords = stopwords,
315 verbose = kco@verbose) %>%
316 mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) %>%
317 filter(frequency >= minOccur)
318 }
319 }