| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 1 | setGeneric("collocationAnalysis", function(kco, ...)  standardGeneric("collocationAnalysis") ) | 
|  | 2 |  | 
|  | 3 | #' Collocation analysis | 
|  | 4 | #' | 
|  | 5 | #' @aliases collocationAnalysis | 
|  | 6 | #' | 
|  | 7 | #' @description | 
| Marc Kupietz | 67edcb5 | 2021-09-20 21:54:24 +0200 | [diff] [blame] | 8 | #' `r lifecycle::badge("experimental")` | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 9 | #' | 
|  | 10 | #' Performs a collocation analysis for the given node (or query) | 
|  | 11 | #' in the given virtual corpus. | 
|  | 12 | #' | 
|  | 13 | #' @details | 
|  | 14 | #' The collocation analysis is currently implemented on the client side, as some of the | 
|  | 15 | #' functionality is not yet provided by the KorAP backend. Mainly for this reason | 
|  | 16 | #' it is very slow (several minutes, up to hours), but on the other hand very flexible. | 
|  | 17 | #' You can, for example, perform the analysis in arbitrary virtual corpora, use complex node queries, | 
|  | 18 | #' and look for expression-internal collocates using the focus function (see examples and demo). | 
|  | 19 | #' | 
|  | 20 | #' To increase speed at the cost of accuracy and possible false negatives, | 
|  | 21 | #' you can decrease searchHitsSampleLimit and/or topCollocatesLimit and/or set exactFrequencies to FALSE. | 
|  | 22 | #' | 
|  | 23 | #' Note that currently not the tokenization provided by the backend, i.e. the corpus itself, is used, but a tinkered one. | 
|  | 24 | #' This can also lead to false negatives and to frequencies that differ from corresponding ones acquired via the web | 
|  | 25 | #' user interface. | 
|  | 26 | #' | 
|  | 27 | #' @family collocation analysis functions | 
|  | 28 | #' | 
| Marc Kupietz | 67edcb5 | 2021-09-20 21:54:24 +0200 | [diff] [blame] | 29 | #' @param lemmatizeNodeQuery     if TRUE, node query will be lemmatized, i.e. `x -> [tt/l=x]` | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 30 | #' @param minOccur               minimum absolute number of observed co-occurrences to consider a collocate candidate | 
|  | 31 | #' @param topCollocatesLimit     limit analysis to the n most frequent collocates in the search hits sample | 
|  | 32 | #' @param searchHitsSampleLimit  limit the size of the search hits sample | 
|  | 33 | #' @param stopwords              vector of stopwords not to be considered as collocates | 
|  | 34 | #' @param exactFrequencies       if FALSE, extrapolate observed co-occurrence frequencies from frequencies in search hits sample, otherwise retrieve exact co-occurrence frequencies | 
|  | 35 | #' @param seed                   seed for random page collecting order | 
| Marc Kupietz | 67edcb5 | 2021-09-20 21:54:24 +0200 | [diff] [blame] | 36 | #' @param expand                 if TRUE, `node` and `vc` parameters are expanded to all of their combinations | 
|  | 37 | #' @param ...                    more arguments will be passed to [collocationScoreQuery()] | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 38 | #' @inheritParams collocationScoreQuery,KorAPConnection-method | 
|  | 39 | #' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc. | 
|  | 40 | #' | 
|  | 41 | #' @importFrom stringr str_match str_split str_detect | 
|  | 42 | #' @importFrom dplyr anti_join arrange desc slice_head bind_rows | 
|  | 43 | #' @importFrom purrr pmap | 
|  | 44 | #' @importFrom tidyr expand_grid | 
|  | 45 | #' | 
|  | 46 | #' @examples | 
| Marc Kupietz | 6ae7605 | 2021-09-21 10:34:00 +0200 | [diff] [blame] | 47 | #' \dontrun{ | 
|  | 48 | #' | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 49 | #'  # Find top collocates of "Packung" inside and outside the sports domain. | 
|  | 50 | #'  new("KorAPConnection", verbose = TRUE) %>% | 
|  | 51 | #'   collocationAnalysis("Packung", vc=c("textClass=sport", "textClass!=sport"), | 
|  | 52 | #'                       leftContextSize=1, rightContextSize=1, topCollocatesLimit=20) %>% | 
|  | 53 | #'   dplyr::filter(logDice >= 5) | 
|  | 54 | #' } | 
|  | 55 | #' | 
| Marc Kupietz | 6ae7605 | 2021-09-21 10:34:00 +0200 | [diff] [blame] | 56 | #' \dontrun{ | 
|  | 57 | #' | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 58 | #' # Identify the most prominent light verb construction with "in ... setzen". | 
|  | 59 | #' # Note that, currently, the use of focus function disallows exactFrequencies. | 
|  | 60 | #' new("KorAPConnection", verbose = TRUE) %>% | 
|  | 61 | #'   collocationAnalysis("focus(in [tt/p=NN] {[tt/l=setzen]})", | 
|  | 62 | #'     leftContextSize=1, rightContextSize=0, exactFrequencies=FALSE, topCollocatesLimit=20) | 
|  | 63 | #' } | 
|  | 64 | #' | 
|  | 65 | #' @export | 
|  | 66 | setMethod("collocationAnalysis", "KorAPConnection", | 
|  | 67 | function(kco, | 
|  | 68 | node, | 
|  | 69 | vc = "", | 
|  | 70 | lemmatizeNodeQuery = FALSE, | 
|  | 71 | minOccur = 5, | 
|  | 72 | leftContextSize = 5, | 
|  | 73 | rightContextSize = 5, | 
|  | 74 | topCollocatesLimit = 200, | 
|  | 75 | searchHitsSampleLimit = 20000, | 
|  | 76 | ignoreCollocateCase = FALSE, | 
|  | 77 | withinSpan = ifelse(exactFrequencies, "base/s=s", ""), | 
|  | 78 | exactFrequencies = TRUE, | 
|  | 79 | stopwords = RKorAPClient::synsemanticStopwords(), | 
|  | 80 | seed = 7, | 
|  | 81 | expand = length(vc) != length(node), | 
| Marc Kupietz | 5a336b6 | 2021-11-27 17:51:35 +0100 | [diff] [blame] | 82 | maxRecurse = 0, | 
|  | 83 | addExamples = TRUE, | 
| Marc Kupietz | 419f21f | 2021-12-07 10:27:30 +0100 | [diff] [blame] | 84 | thresholdScore = "logDice", | 
|  | 85 | threshold = 2.0, | 
| Marc Kupietz | 5a336b6 | 2021-11-27 17:51:35 +0100 | [diff] [blame] | 86 | localStopwords = c(), | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 87 | ...) { | 
|  | 88 | # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check | 
|  | 89 | word <- frequency <- NULL | 
|  | 90 |  | 
|  | 91 | if(!exactFrequencies && (!is.na(withinSpan) && !is.null(withinSpan) && nchar(withinSpan)>0 )) { | 
|  | 92 | stop(sprintf("Not empty withinSpan (='%s') requires exactFrequencies=TRUE", withinSpan), call. = FALSE) | 
|  | 93 | } | 
|  | 94 |  | 
| Marc Kupietz | 581a29b | 2021-09-04 20:51:04 +0200 | [diff] [blame] | 95 | warnIfNoAccessToken(kco) | 
|  | 96 |  | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 97 | if (lemmatizeNodeQuery) { | 
|  | 98 | node <- lemmatizeWordQuery(node) | 
|  | 99 | } | 
|  | 100 |  | 
| Marc Kupietz | 5a336b6 | 2021-11-27 17:51:35 +0100 | [diff] [blame] | 101 | result <- if (length(node) > 1 || length(vc) > 1) { | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 102 | grid <- if (expand) expand_grid(node=node, vc=vc) else tibble(node=node, vc=vc) | 
|  | 103 | purrr::pmap(grid, function(node, vc, ...) | 
|  | 104 | collocationAnalysis(kco, | 
|  | 105 | node =node, | 
|  | 106 | vc = vc, | 
|  | 107 | minOccur = minOccur, | 
|  | 108 | leftContextSize = leftContextSize, | 
|  | 109 | rightContextSize = rightContextSize, | 
|  | 110 | topCollocatesLimit = topCollocatesLimit, | 
|  | 111 | searchHitsSampleLimit = searchHitsSampleLimit, | 
|  | 112 | ignoreCollocateCase = ignoreCollocateCase, | 
|  | 113 | withinSpan = withinSpan, | 
|  | 114 | exactFrequencies = exactFrequencies, | 
|  | 115 | stopwords = stopwords, | 
| Marc Kupietz | 5a336b6 | 2021-11-27 17:51:35 +0100 | [diff] [blame] | 116 | addExamples = TRUE, | 
|  | 117 | localStopwords = localStopwords, | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 118 | seed = seed, | 
|  | 119 | expand = expand, | 
|  | 120 | ...) ) %>% | 
|  | 121 | bind_rows() | 
|  | 122 | } else { | 
|  | 123 | set.seed(seed) | 
|  | 124 | candidates <- collocatesQuery( | 
|  | 125 | kco, | 
|  | 126 | node, | 
|  | 127 | vc = vc, | 
|  | 128 | minOccur = minOccur, | 
|  | 129 | leftContextSize = leftContextSize, | 
|  | 130 | rightContextSize = rightContextSize, | 
|  | 131 | searchHitsSampleLimit = searchHitsSampleLimit, | 
|  | 132 | ignoreCollocateCase = ignoreCollocateCase, | 
| Marc Kupietz | 5a336b6 | 2021-11-27 17:51:35 +0100 | [diff] [blame] | 133 | stopwords = append(stopwords, localStopwords), | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 134 | ... | 
|  | 135 | ) | 
|  | 136 |  | 
|  | 137 | if (nrow(candidates) > 0) { | 
|  | 138 | candidates <- candidates %>% | 
|  | 139 | filter(frequency >= minOccur) %>% | 
|  | 140 | head(topCollocatesLimit) | 
|  | 141 | collocationScoreQuery( | 
|  | 142 | kco, | 
|  | 143 | node = node, | 
|  | 144 | collocate = candidates$word, | 
|  | 145 | vc = vc, | 
|  | 146 | leftContextSize = leftContextSize, | 
|  | 147 | rightContextSize = rightContextSize, | 
|  | 148 | observed = if (exactFrequencies) NA else candidates$frequency, | 
|  | 149 | ignoreCollocateCase = ignoreCollocateCase, | 
|  | 150 | withinSpan = withinSpan, | 
|  | 151 | ... | 
|  | 152 | ) %>% | 
|  | 153 | filter(.$O >= minOccur) %>% | 
|  | 154 | dplyr::arrange(dplyr::desc(logDice)) | 
|  | 155 | } else { | 
|  | 156 | tibble() | 
|  | 157 | } | 
|  | 158 | } | 
| Marc Kupietz | 419f21f | 2021-12-07 10:27:30 +0100 | [diff] [blame] | 159 | if (maxRecurse > 0 & any(!!as.name(thresholdScore) >= threshold)) { | 
| Marc Kupietz | 5a336b6 | 2021-11-27 17:51:35 +0100 | [diff] [blame] | 160 | recurseWith <- result %>% | 
| Marc Kupietz | 419f21f | 2021-12-07 10:27:30 +0100 | [diff] [blame] | 161 | filter(!!as.name(thresholdScore) >= threshold) | 
| Marc Kupietz | 5a336b6 | 2021-11-27 17:51:35 +0100 | [diff] [blame] | 162 | result <- collocationAnalysis( | 
|  | 163 | kco, | 
|  | 164 | node = paste0("(", buildCollocationQuery( | 
|  | 165 | removeWithinSpan(recurseWith$node, withinSpan), | 
|  | 166 | recurseWith$collocate, | 
|  | 167 | leftContextSize = leftContextSize, | 
|  | 168 | rightContextSize = rightContextSize, | 
|  | 169 | withinSpan = "" | 
|  | 170 | ), ")"), | 
|  | 171 | vc = vc, | 
|  | 172 | minOccur = minOccur, | 
|  | 173 | leftContextSize = leftContextSize, | 
|  | 174 | rightContextSize = rightContextSize, | 
|  | 175 | withinSpan = withinSpan, | 
|  | 176 | maxRecurse = maxRecurse - 1, | 
|  | 177 | stopwords = stopwords, | 
|  | 178 | localStopwords = recurseWith$collocate, | 
|  | 179 | exactFrequencies = exactFrequencies, | 
|  | 180 | searchHitsSampleLimit = searchHitsSampleLimit, | 
|  | 181 | topCollocatesLimit = topCollocatesLimit, | 
|  | 182 | addExamples = FALSE | 
|  | 183 | ) %>% | 
|  | 184 | bind_rows(result) %>% | 
|  | 185 | filter(logDice >= 2) %>% | 
|  | 186 | filter(.$O >= minOccur) %>% | 
|  | 187 | dplyr::arrange(dplyr::desc(logDice)) | 
|  | 188 | } | 
|  | 189 | if (addExamples && length(result) > 0) { | 
| Marc Kupietz | 1678c3a | 2021-12-07 10:24:49 +0100 | [diff] [blame] | 190 | result$query <-buildCollocationQuery( | 
| Marc Kupietz | 5a336b6 | 2021-11-27 17:51:35 +0100 | [diff] [blame] | 191 | result$node, | 
|  | 192 | result$collocate, | 
|  | 193 | leftContextSize = leftContextSize, | 
|  | 194 | rightContextSize = rightContextSize, | 
|  | 195 | withinSpan = "" | 
|  | 196 | ) | 
|  | 197 | result$example <- findExample( | 
|  | 198 | kco, | 
| Marc Kupietz | 1678c3a | 2021-12-07 10:24:49 +0100 | [diff] [blame] | 199 | query = result$query, | 
| Marc Kupietz | 5a336b6 | 2021-11-27 17:51:35 +0100 | [diff] [blame] | 200 | vc = result$vc | 
|  | 201 | ) | 
|  | 202 | } | 
|  | 203 | result | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 204 | } | 
|  | 205 | ) | 
|  | 206 |  | 
| Marc Kupietz | 5a336b6 | 2021-11-27 17:51:35 +0100 | [diff] [blame] | 207 | #' @export | 
|  | 208 | removeWithinSpan <- function(query, withinSpan) { | 
|  | 209 | if (withinSpan == "") { | 
|  | 210 | return(query) | 
|  | 211 | } | 
|  | 212 | needle <- sprintf("^\\(contains\\(<%s>, ?(.*)\\){2}$", withinSpan) | 
|  | 213 | res <- gsub(needle, '\\1', query) | 
|  | 214 | needle <- sprintf("^contains\\(<%s>, ?(.*)\\)$", withinSpan) | 
|  | 215 | res <- gsub(needle, '\\1', res) | 
|  | 216 | return(res) | 
|  | 217 | } | 
|  | 218 |  | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 219 | #' @importFrom magrittr debug_pipe | 
|  | 220 | #' @importFrom stringr str_match str_split str_detect | 
|  | 221 | #' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when | 
|  | 222 | #' | 
|  | 223 | snippet2FreqTable <- function(snippet, | 
|  | 224 | minOccur = 5, | 
|  | 225 | leftContextSize = 5, | 
|  | 226 | rightContextSize = 5, | 
|  | 227 | ignoreCollocateCase = FALSE, | 
|  | 228 | stopwords = c(), | 
|  | 229 | tokenizeRegex = "([! )(\uc2\uab,.:?\u201e\u201c\'\"]+|")", | 
|  | 230 | oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)), | 
|  | 231 | verbose = TRUE) { | 
|  | 232 | word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check | 
|  | 233 | frequency <- NULL | 
|  | 234 |  | 
|  | 235 | if (length(snippet) < 1) { | 
|  | 236 | dplyr::tibble(word=c(), frequency=c()) | 
|  | 237 | } else if (length(snippet) > 1) { | 
| Marc Kupietz | 8fe0bee | 2021-12-07 10:25:45 +0100 | [diff] [blame] | 238 | log.info(verbose, paste("Joining", length(snippet), "kwics\n")) | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 239 | for (s in snippet) { | 
|  | 240 | oldTable <- snippet2FreqTable( | 
|  | 241 | s, | 
|  | 242 | leftContextSize = leftContextSize, | 
|  | 243 | rightContextSize = rightContextSize, | 
|  | 244 | oldTable = oldTable, | 
|  | 245 | stopwords = stopwords | 
|  | 246 | ) | 
|  | 247 | } | 
| Marc Kupietz | d07bf19 | 2021-09-04 20:24:44 +0200 | [diff] [blame] | 248 | log.info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n")) | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 249 | oldTable  %>% | 
|  | 250 | group_by(word) %>% | 
|  | 251 | mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>% | 
|  | 252 | summarise(frequency=sum(frequency), .groups = "drop") %>% | 
|  | 253 | arrange(desc(frequency)) | 
|  | 254 | } else { | 
|  | 255 | stopwordsTable <- dplyr::tibble(word=stopwords) | 
|  | 256 | match <- | 
|  | 257 | str_match( | 
|  | 258 | snippet, | 
|  | 259 | '<span class="context-left">(<span class="more"></span>)?(.*[^ ]) *</span><span class="match"><mark>.*</mark></span><span class="context-right"> *([^<]*)' | 
|  | 260 | ) | 
|  | 261 |  | 
|  | 262 | left <- if(leftContextSize > 0) | 
|  | 263 | tail(unlist(str_split(match[1, 3], tokenizeRegex)), leftContextSize) | 
|  | 264 | else | 
|  | 265 | "" | 
|  | 266 | #    cat(paste("left:", left, "\n", collapse=" ")) | 
|  | 267 |  | 
|  | 268 | right <- if(rightContextSize > 0) | 
|  | 269 | head(unlist(str_split(match[1, 4], tokenizeRegex)), rightContextSize) | 
|  | 270 | else | 
|  | 271 | "" | 
|  | 272 | #    cat(paste("right:", right, "\n", collapse=" ")) | 
|  | 273 |  | 
|  | 274 | if(is.na(left) || is.na(right) || length(left) + length(right) == 0) { | 
|  | 275 | oldTable | 
|  | 276 | } else { | 
|  | 277 | table(c(left, right)) %>% | 
|  | 278 | dplyr::as_tibble(.name_repair = "minimal") %>% | 
|  | 279 | dplyr::rename(word = 1, frequency = 2) %>% | 
|  | 280 | dplyr::filter(str_detect(word, '^[:alnum:]+-?[:alnum:]*$')) %>% | 
|  | 281 | dplyr::anti_join(stopwordsTable, by="word")  %>% | 
|  | 282 | dplyr::bind_rows(oldTable) | 
|  | 283 | } | 
|  | 284 | } | 
|  | 285 | } | 
|  | 286 |  | 
|  | 287 | #' Preliminary synsemantic stopwords function | 
|  | 288 | #' | 
|  | 289 | #' @description | 
| Marc Kupietz | 67edcb5 | 2021-09-20 21:54:24 +0200 | [diff] [blame] | 290 | #' `r lifecycle::badge("experimental")` | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 291 | #' | 
|  | 292 | #' Preliminary synsemantic stopwords function to be used in collocation analysis. | 
|  | 293 | #' | 
|  | 294 | #' @details | 
|  | 295 | #' Currently only suitable for German. See stopwords package for other languages. | 
|  | 296 | #' | 
|  | 297 | #' @param ... future arguments for language detection | 
|  | 298 | #' | 
|  | 299 | #' @family collocation analysis functions | 
|  | 300 | #' @return Vector of synsemantic stopwords. | 
|  | 301 | #' @export | 
|  | 302 | synsemanticStopwords <- function(...) { | 
|  | 303 | res <- c( | 
|  | 304 | "der", | 
|  | 305 | "die", | 
|  | 306 | "und", | 
|  | 307 | "in", | 
|  | 308 | "den", | 
|  | 309 | "von", | 
|  | 310 | "mit", | 
|  | 311 | "das", | 
|  | 312 | "zu", | 
|  | 313 | "im", | 
|  | 314 | "ist", | 
|  | 315 | "auf", | 
|  | 316 | "sich", | 
|  | 317 | "Die", | 
|  | 318 | "des", | 
|  | 319 | "dem", | 
|  | 320 | "nicht", | 
|  | 321 | "ein", | 
| Marc Kupietz | d2c08cb | 2021-12-07 10:28:21 +0100 | [diff] [blame] | 322 | "Ein", | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 323 | "eine", | 
| Marc Kupietz | d2c08cb | 2021-12-07 10:28:21 +0100 | [diff] [blame] | 324 | "Eine", | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 325 | "es", | 
|  | 326 | "auch", | 
|  | 327 | "an", | 
|  | 328 | "als", | 
|  | 329 | "am", | 
|  | 330 | "aus", | 
|  | 331 | "Der", | 
|  | 332 | "bei", | 
|  | 333 | "er", | 
|  | 334 | "dass", | 
|  | 335 | "sie", | 
|  | 336 | "nach", | 
|  | 337 | "um", | 
|  | 338 | "Das", | 
|  | 339 | "zum", | 
|  | 340 | "noch", | 
|  | 341 | "war", | 
|  | 342 | "einen", | 
|  | 343 | "einer", | 
|  | 344 | "wie", | 
|  | 345 | "einem", | 
|  | 346 | "vor", | 
|  | 347 | "bis", | 
|  | 348 | "\u00fcber", | 
|  | 349 | "so", | 
|  | 350 | "aber", | 
|  | 351 | "Eine", | 
|  | 352 | "diese", | 
|  | 353 | "Diese", | 
|  | 354 | "oder" | 
|  | 355 | ) | 
|  | 356 | return(res) | 
|  | 357 | } | 
|  | 358 |  | 
| Marc Kupietz | 5a336b6 | 2021-11-27 17:51:35 +0100 | [diff] [blame] | 359 |  | 
|  | 360 | #' @export | 
|  | 361 | findExample <- | 
|  | 362 | function(kco, | 
|  | 363 | query, | 
|  | 364 | vc = "", | 
|  | 365 | matchOnly = TRUE) { | 
|  | 366 | out <- character(length = length(query)) | 
|  | 367 |  | 
|  | 368 | if (length(vc) < length(query)) | 
|  | 369 | vc <- rep(vc, length(query)) | 
|  | 370 |  | 
|  | 371 | for (i in seq_along(query)) { | 
|  | 372 | q <- corpusQuery(kco, paste0("(", query[i], ")"), vc = vc[i], metadataOnly = FALSE) | 
| Marc Kupietz | b811ffb | 2021-12-07 10:34:10 +0100 | [diff] [blame^] | 373 | if (q@totalResults > 0) { | 
|  | 374 | q <- fetchNext(q, maxFetch=50, randomizePageOrder=F) | 
|  | 375 | example <- as.character((q@collectedMatches)$snippet[1]) | 
|  | 376 | out[i] <- if(matchOnly) { | 
| Marc Kupietz | 5a336b6 | 2021-11-27 17:51:35 +0100 | [diff] [blame] | 377 | gsub('.*<mark>(.+)</mark>.*', '\\1', example) | 
|  | 378 | } else { | 
|  | 379 | stringr::str_replace(example, '<[^>]*>', '') | 
|  | 380 | } | 
| Marc Kupietz | b811ffb | 2021-12-07 10:34:10 +0100 | [diff] [blame^] | 381 | } else { | 
|  | 382 | out[i] = "" | 
|  | 383 | } | 
| Marc Kupietz | 5a336b6 | 2021-11-27 17:51:35 +0100 | [diff] [blame] | 384 | } | 
|  | 385 | out | 
|  | 386 | } | 
|  | 387 |  | 
| Marc Kupietz | dbd431a | 2021-08-29 12:17:45 +0200 | [diff] [blame] | 388 | collocatesQuery <- | 
|  | 389 | function(kco, | 
|  | 390 | query, | 
|  | 391 | vc = "", | 
|  | 392 | minOccur = 5, | 
|  | 393 | leftContextSize = 5, | 
|  | 394 | rightContextSize = 5, | 
|  | 395 | searchHitsSampleLimit = 20000, | 
|  | 396 | ignoreCollocateCase = FALSE, | 
|  | 397 | stopwords = c(), | 
|  | 398 | ...) { | 
|  | 399 | frequency <- NULL | 
|  | 400 | q <- corpusQuery(kco, query, vc, metadataOnly = F, ...) | 
|  | 401 | if(q@totalResults == 0) { | 
|  | 402 | tibble(word=c(), frequency=c()) | 
|  | 403 | } else { | 
|  | 404 | q <- fetchNext(q, maxFetch=searchHitsSampleLimit, randomizePageOrder=TRUE) | 
|  | 405 | snippet2FreqTable((q@collectedMatches)$snippet, | 
|  | 406 | minOccur = minOccur, | 
|  | 407 | leftContextSize = leftContextSize, | 
|  | 408 | rightContextSize = rightContextSize, | 
|  | 409 | ignoreCollocateCase = ignoreCollocateCase, | 
|  | 410 | stopwords = stopwords, | 
|  | 411 | verbose = kco@verbose) %>% | 
|  | 412 | mutate(frequency = frequency * q@totalResults / min(q@totalResults, searchHitsSampleLimit)) %>% | 
|  | 413 | filter(frequency >= minOccur) | 
|  | 414 | } | 
|  | 415 | } |