CA: use server-side tokenized matches if supported
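
Newer KorAP servers deliver pre-tokenized match contexts in
collectedMatches$tokens. Use these in the collocation analysis instead of
re-tokenizing the snippets on the client, and fall back to client-side
tokenization if the server does not provide tokens.
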
Change-Id: I0a7f1ce798a8e6ed1c8c68e50698c3ea2486ecbc
diff --git a/NEWS.md b/NEWS.md
index 83b6cef..50afcf8 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,6 +1,7 @@
# RKorAPClient 0.7.7.9000 (unpublished)
- if not `metadataOnly`, also retrieve tokenized snippets (in `collectedMatches$tokens`)
+- use server-side tokenized matches in collocation analysis if supported by the KorAP server
# RKorAPClient 0.7.7
diff --git a/R/collocationAnalysis.R b/R/collocationAnalysis.R
index fc9d814..64e0c83 100644
--- a/R/collocationAnalysis.R
+++ b/R/collocationAnalysis.R
@@ -44,8 +44,7 @@
#' @inheritParams collocationScoreQuery,KorAPConnection-method
#' @return Tibble with top collocates, association scores, corresponding URLs for web user interface queries, etc.
#'
-#' @importFrom stringr str_match str_split str_detect
-#' @importFrom dplyr anti_join arrange desc slice_head bind_rows
+#' @importFrom dplyr arrange desc slice_head bind_rows
#' @importFrom purrr pmap
#' @importFrom tidyr expand_grid
#'
@@ -224,6 +223,85 @@
}
#' @importFrom magrittr debug_pipe
+#' @importFrom stringr str_detect
+#' @importFrom dplyr as_tibble tibble rename filter anti_join bind_rows case_when group_by mutate summarise
+#'
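+# Build a collocate frequency table from server-side tokenized matches.
+#
+# Assumed input shape (derived from the accessors used below):
+# `matches$tokens` has list components `left` and `right` that hold the token
+# sequences around each match. With index == 0, all matches are processed and
+# aggregated; with index > 0, only the match in that row is counted.
+# Note: stringr's ICU engine accepts POSIX classes such as [:alnum:] at the
+# top level of a pattern, so the default collocateFilterRegex is intentional.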
+matches2FreqTable <- function(matches,
+ index = 0,
+ minOccur = 5,
+ leftContextSize = 5,
+ rightContextSize = 5,
+ ignoreCollocateCase = FALSE,
+ stopwords = c(),
+ collocateFilterRegex = '^[:alnum:]+-?[:alnum:]*$',
+ oldTable = data.frame(word = rep(NA, 1), frequency = rep(NA, 1)),
+ verbose = TRUE) {
+ word <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
+ frequency <- NULL
+
+ if (nrow(matches) < 1) {
+ dplyr::tibble(word=c(), frequency=c())
+ } else if (index == 0) {
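+    # Older KorAP servers do not deliver pre-tokenized matches; in that case,
+    # fall back to tokenizing the HTML snippets on the client.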
+ if (! "tokens" %in% colnames(matches) || ! is.list(matches$tokens)) {
+      log_info(verbose, "Outdated KorAP server: Falling back to client-side tokenization.\n")
+      return(snippet2FreqTable(matches$snippet, minOccur, leftContextSize, rightContextSize,
+                               ignoreCollocateCase = ignoreCollocateCase,
+                               stopwords = stopwords, oldTable = oldTable, verbose = verbose))
+    }
+ log_info(verbose, paste("Joining", nrow(matches), "kwics\n"))
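+    # Accumulate the per-match frequency tables of all KWIC lines into oldTable.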
+    for (i in seq_len(nrow(matches))) {
+ oldTable <- matches2FreqTable(
+ matches,
+ i,
+ leftContextSize = leftContextSize,
+ rightContextSize = rightContextSize,
+ collocateFilterRegex = collocateFilterRegex,
+ oldTable = oldTable,
+ stopwords = stopwords
+ )
+ }
+ log_info(verbose, paste("Aggregating", length(oldTable$word), "tokens\n"))
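+    # Case-fold the collocates if requested, then sum the frequencies per type.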
+    oldTable %>%
+      mutate(word = dplyr::case_when(ignoreCollocateCase ~ tolower(word), TRUE ~ word)) %>%
+      group_by(word) %>%
+      summarise(frequency = sum(frequency), .groups = "drop") %>%
+      arrange(desc(frequency))
+ } else {
+ stopwordsTable <- dplyr::tibble(word=stopwords)
+
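+    # Collocate candidates: up to leftContextSize tokens before and up to
+    # rightContextSize tokens after the match, as tokenized by the server.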
+ left <- tail(unlist(matches$tokens$left[index]), leftContextSize)
+    right <- head(unlist(matches$tokens$right[index]), rightContextSize)
+
+    if (length(left) + length(right) == 0) {
+ oldTable
+ } else {
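+      # Count the window tokens, keep only word-like collocates, drop
+      # stopwords, and append the counts to the running table.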
+ table(c(left, right)) %>%
+ dplyr::as_tibble(.name_repair = "minimal") %>%
+ dplyr::rename(word = 1, frequency = 2) %>%
+ dplyr::filter(str_detect(word, collocateFilterRegex)) %>%
+ dplyr::anti_join(stopwordsTable, by="word") %>%
+ dplyr::bind_rows(oldTable)
+ }
+ }
+}
+
+#' @importFrom magrittr debug_pipe
#' @importFrom stringr str_match str_split str_detect
#' @importFrom dplyr as_tibble tibble rename filter anti_join tibble bind_rows case_when
#'
@@ -411,7 +478,10 @@
tibble(word=c(), frequency=c())
} else {
q <- fetchNext(q, maxFetch=searchHitsSampleLimit, randomizePageOrder=TRUE)
- snippet2FreqTable((q@collectedMatches)$snippet,
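+          # index = 0: aggregate the collocate frequencies over all collected
+          # matches, using the server-side tokens when available.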
+          matches2FreqTable(q@collectedMatches,
+                            index = 0,
minOccur = minOccur,
leftContextSize = leftContextSize,
rightContextSize = rightContextSize,