CA: use server-side tokenized matches if supported
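
Newer KorAP servers deliver pre-tokenized match contexts in
collectedMatches$tokens. Use these in the collocation analysis instead of
re-tokenizing the snippets on the client, and fall back to client-side
tokenization if the server does not provide tokens.
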
Change-Id: I0a7f1ce798a8e6ed1c8c68e50698c3ea2486ecbc
diff --git a/NEWS.md b/NEWS.md
index 83b6cef..50afcf8 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,6 +1,7 @@
# RKorAPClient 0.7.7.9000 (unpublished)
- if not `metadataOnly`, also retrieve tokenized snippets (in `collectedMatches$tokens`)
+- use server-side tokenized matches in collocation analysis if supported by the KorAP server
# RKorAPClient 0.7.7
diff --git a/R/collocationAnalysis.R b/R/collocationAnalysis.R
index fc9d814..64e0c83 100644
--- a/R/collocationAnalysis.R
+++ b/R/collocationAnalysis.R
@@ -44,8 +44,7 @@
#' @inheritParams collocationScoreQuery,KorAPConnection-method
#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
#'
-#' @importFrom stringr str_match str_split str_detect
-#' @importFrom dplyr anti_join arrange desc slice_head bind_rows
+#' @importFrom dplyr arrange desc slice_head bind_rows
#' @importFrom purrr pmap
#' @importFrom tidyr expand_grid
#'
@@ -224,6 +223,85 @@
}
#' @importFrom magrittr debug_pipe
+#' @importFrom stringr str_detect
+#' @importFrom dplyr as_tibble tibble rename filter anti_join bind_rows case_when group_by mutate summarise
+#'
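+# Build a collocate frequency table from server-side tokenized matches.
+#
+# Assumed input shape (derived from the accessors used below):
+# `matches$tokens` has list components `left` and `right` that hold the token
+# sequences around each match. With index == 0, all matches are processed and
+# aggregated; with index > 0, only the match in that row is counted.
+# Note: stringr's ICU engine accepts POSIX classes such as [:alnum:] at the
+# top level of a pattern, so the default collocateFilterRegex is intentional.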
+matches2FreqTable <- function(matches,
+ index = 0,
+ minOccur = 5,
+ leftContextSize = 5,
+ rightContextSize = 5,
+ ignoreCollocateCase = FALSE,
+ stopwords = c(),
+ collocateFilterRegex = '^[:alnum:]+-?[:alnum:]*$',
+ oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
+ verbose = TRUE) {
+ word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
+ frequency <- NULL
+
+ if (nrow(matches) < 1) {
+ dplyr::tibble(word=c(), frequency=c())
+ } else if (index == 0) {
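+    # Older KorAP servers do not deliver pre-tokenized matches; in that case,
+    # fall back to tokenizing the HTML snippets on the client.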
+ if (! "tokens" %in% colnames(matches) || ! is.list(matches$tokens)) {
+      log_info(verbose, "Outdated KorAP server: Falling back to client-side tokenization.\n")
+      return(snippet2FreqTable(matches$snippet, minOccur, leftContextSize, rightContextSize,
+                               ignoreCollocateCase = ignoreCollocateCase,
+                               stopwords = stopwords, oldTable = oldTable, verbose = verbose))
+    }
+ log_info(verbose, paste("Joining", nrow(matches), "kwics\n"))
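+    # Accumulate the per-match frequency tables of all KWIC lines into oldTable.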
+    for (i in seq_len(nrow(matches))) {
+ oldTable <- matches2FreqTable(
+ matches,
+ i,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ collocateFilterRegex = collocateFilterRegex,
+ oldTable = oldTable,
+ stopwords = stopwords
+ )
+ }
+ log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
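+    # Case-fold the collocates if requested, then sum the frequencies per type.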
+    oldTable %>%
+      mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>%
+      group_by(word) %>%
+      summarise(frequency = sum(frequency), .groups = "drop") %>%
+      arrange(desc(frequency))
+ } else {
+ stopwordsTable <- dplyr::tibble(word=stopwords)
+
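+    # Collocate candidates: up to leftContextSize tokens before and up to
+    # rightContextSize tokens after the match, as tokenized by the server.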
+ left <- tail(unlist(matches$tokens$left[index]), leftContextSize)
+    right <- head(unlist(matches$tokens$right[index]), rightContextSize)
+
+    if (length(left) + length(right) == 0) {
+ oldTable
+ } else {
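+      # Count the window tokens, keep only word-like collocates, drop
+      # stopwords, and append the counts to the running table.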
+ table(c(left, right)) %>%
+ dplyr::as_tibble(.name_repair = "minimal") %>%
+ dplyr::rename(word = 1, frequency = 2) %>%
+ dplyr::filter(str_detect(word, collocateFilterRegex)) %>%
+ dplyr::anti_join(stopwordsTable, by="word") %>%
+ dplyr::bind_rows(oldTable)
+ }
+ }
+}
+
+#' @importFrom magrittr debug_pipe
#' @importFrom stringr str_match str_split str_detect
#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
#'
@@ -411,7 +478,10 @@
tibble(word=c(), frequency=c())
} else {
q <- fetchNext(q, maxFetch=searchHitsSampleLimit, randomizePageOrder=TRUE)
- snippet2FreqTable((q@collectedMatches)$snippet,
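+          # index = 0: aggregate the collocate frequencies over all collected
+          # matches, using the server-side tokens when available.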
+          matches2FreqTable(q@collectedMatches,
+                            index = 0,
minOccur = minOccur,
leftContextSize = leftContextSize,
rightContextSize = rightContextSize,