Add full collocation analysis (client side only implementation)
Resolves #2
Change-Id: Ib01d89a72b44ff06816b21532b7ea709a4e837b0
diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index 0b07106..5ef4846 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R
@@ -66,15 +66,23 @@
setGeneric("fetchNext", function(kqo, ...) standardGeneric("fetchNext") )
setGeneric("fetchRest", function(kqo, ...) standardGeneric("fetchRest") )
setGeneric("frequencyQuery", function(kco, ...) standardGeneric("frequencyQuery") )
-setGeneric("collocationScoreQuery", function(kco, ...) standardGeneric("collocationScoreQuery") )
maxResultsPerPage <- 50
## quiets concerns of R CMD check re: the .'s that appear in pipelines
if(getRversion() >= "2.15.1") utils::globalVariables(c("."))
+#' Corpus query
+#'
#' \bold{\code{corpusQuery}} performs a corpus query via a connection to a KorAP-API-server
#'
+#' @rdname KorAPQuery-class
+#' @aliases corpusQuery
+#'
+#' @importFrom urltools url_encode
+#' @importFrom purrr pmap
+#' @importFrom dplyr bind_rows
+#'
#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
#' @param query string that contains the corpus query. The query language depends on the \code{ql} parameter. Either \code{query} must be provided or \code{KorAPUrl}.
#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
@@ -125,7 +133,6 @@
#' @references
#' \url{https://ids-pub.bsz-bw.de/frontdoor/index/index/docId/9026}
#'
-#' @aliases corpusQuery
#' @export
setMethod("corpusQuery", "KorAPConnection",
function(kco,
@@ -151,25 +158,19 @@
expand = length(vc) != length(query),
as.df = FALSE) {
if (length(query) > 1 || length(vc) > 1) {
-
- grid <- {
- if (expand)
- expand_grid(query=query, vc=vc) else tibble(query=query, vc=vc) }
- return(
- do.call(rbind,
- Map(function(q, cq) corpusQuery(kco, query=q, vc=cq, ql=ql,
- verbose=verbose, as.df = TRUE), grid$query, grid$vc)) %>%
- remove_rownames()
- )
- } else {
+ grid <- if (expand) expand_grid(query=query, vc=vc) else tibble(query=query, vc=vc)
+ purrr::pmap(grid, function(query, vc, ...)
+ corpusQuery(kco, query=query, vc=vc, ql=ql, verbose=verbose, as.df = TRUE)) %>%
+ bind_rows()
+ } else {
contentFields <- c("snippet")
if (metadataOnly) {
fields <- fields[!fields %in% contentFields]
}
request <-
paste0('?q=',
- URLencode(enc2utf8(query), reserved = TRUE),
- if (vc != '') paste0('&cq=', URLencode(enc2utf8(vc), reserved = TRUE)) else '', '&ql=', ql)
+ url_encode(enc2utf8(query)),
+ ifelse (vc != '', paste0('&cq=', url_encode(enc2utf8(vc))), ''), '&ql=', ql)
webUIRequestUrl <- paste0(kco@KorAPUrl, request)
requestUrl <- paste0(
kco@apiUrl,
@@ -182,10 +183,11 @@
log.info(verbose, "Searching \"", query, "\" in \"", vc, "\"", sep =
"")
res = apiCall(kco, paste0(requestUrl, '&count=0'))
+ log.info(verbose, ": ", res$meta$totalResults, " hits")
if(!is.null(res$meta$cached))
log.info(verbose, " [cached]\n")
else
- log.info(verbose, " took ", res$meta$benchmark, "\n", sep = "")
+ log.info(verbose, ", took ", res$meta$benchmark, "\n", sep = "")
if (as.df)
data.frame(
query = query,
@@ -218,6 +220,7 @@
#' @param offset start offset for query results to fetch
#' @param maxFetch maximum number of query results to fetch
#' @param verbose print progress information if true
+#' @param randomizePageOrder fetch result pages in pseudo random order if true. Use \code{\link{set.seed}} to set seed for reproducible results.
#' @return The \code{kqo} input object with updated slots \code{collectedMatches}, \code{apiResponse}, \code{nextStartIndex}, \code{hasMoreMatches}
#'
#' @examples
@@ -232,7 +235,11 @@
#' @rdname KorAPQuery-class
#' @importFrom dplyr rowwise bind_rows select summarise n
#' @export
-setMethod("fetchNext", "KorAPQuery", function(kqo, offset = kqo@nextStartIndex, maxFetch = maxResultsPerPage, verbose = kqo@korapConnection@verbose) {
+setMethod("fetchNext", "KorAPQuery", function(kqo,
+ offset = kqo@nextStartIndex,
+ maxFetch = maxResultsPerPage,
+ verbose = kqo@korapConnection@verbose,
+ randomizePageOrder = FALSE) {
if (kqo@totalResults == 0 || offset >= kqo@totalResults) {
return(kqo)
}
@@ -242,8 +249,14 @@
pubDate <- NULL # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
collectedMatches <- kqo@collectedMatches
+ if (randomizePageOrder) {
+ pages <- head(sample.int(ceiling(kqo@totalResults / maxResultsPerPage)), maxFetch) - 1
+ }
+
repeat {
- query <- paste0(kqo@requestUrl, '&count=', min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage) ,'&offset=', offset + results, '&cutoff=true')
+ page = length(collectedMatches[,1]) %/% maxResultsPerPage + 1
+ currentOffset = ifelse(randomizePageOrder, pages[page], page - 1) * maxResultsPerPage
+ query <- paste0(kqo@requestUrl, '&count=', min(if (!is.na(maxFetch)) maxFetch - results else maxResultsPerPage, maxResultsPerPage) ,'&offset=', currentOffset, '&cutoff=true')
res <- apiCall(kqo@korapConnection, query)
if (length(res$matches) == 0) {
break
@@ -270,11 +283,22 @@
collectedMatches <- rbind(collectedMatches, currentMatches)
}
if (verbose) {
- cat(paste0("Retrieved page ", page, "/", ceiling((kqo@totalResults) / res$meta$itemsPerPage), ' in ', res$meta$benchmark, '\n'))
+ cat(paste0(
+ "Retrieved page ",
+ ceiling(length(collectedMatches[, 1]) / res$meta$itemsPerPage),
+ "/",
+ if (!is.na(maxFetch) && maxFetch < kqo@totalResults)
+ sprintf("%d (%d)", ceiling(maxFetch / res$meta$itemsPerPage), ceiling(kqo@totalResults / res$meta$itemsPerPage))
+ else
+ sprintf("%d", ceiling(kqo@totalResults / res$meta$itemsPerPage)),
+ ' in ',
+ res$meta$benchmark,
+ '\n'
+ ))
}
page <- page + 1
results <- results + res$meta$itemsPerPage
- if (offset + results >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
+ if (length(collectedMatches[,1]) >= kqo@totalResults || (!is.na(maxFetch) && results >= maxFetch)) {
break
}
}
@@ -305,8 +329,8 @@
#' @aliases fetchAll
#' @rdname KorAPQuery-class
#' @export
-setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose) {
- return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose))
+setMethod("fetchAll", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
+ return(fetchNext(kqo, offset = 0, maxFetch = NA, verbose = verbose, ...))
})
#' Fetches the remaining results of a KorAP query.
@@ -320,8 +344,8 @@
#' @aliases fetchRest
#' @rdname KorAPQuery-class
#' @export
-setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose) {
- return(fetchNext(kqo, maxFetch = NA, verbose = verbose))
+setMethod("fetchRest", "KorAPQuery", function(kqo, verbose = kqo@korapConnection@verbose, ...) {
+ return(fetchNext(kqo, maxFetch = NA, verbose = verbose, ...))
})
#' Query relative frequency of search term(s)
@@ -357,6 +381,56 @@
ci(conf.level = conf.level)
})
+
+#' buildWebUIRequestUrl
+#'
+#' @rdname KorAPQuery-class
+#' @importFrom urltools url_encode
+#' @export
+buildWebUIRequestUrl <- function(kco,
+ query = if (missing(KorAPUrl))
+ stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE)
+ else
+ httr::parse_url(KorAPUrl)$query$q,
+ vc = if (missing(KorAPUrl)) "" else httr::parse_url(KorAPUrl)$query$cq,
+ KorAPUrl,
+ metadataOnly = TRUE,
+ ql = if (missing(KorAPUrl)) "poliqarp" else httr::parse_url(KorAPUrl)$query$ql,
+ fields = c(
+ "corpusSigle",
+ "textSigle",
+ "pubDate",
+ "pubPlace",
+ "availability",
+ "textClass",
+ "snippet"
+ ),
+ accessRewriteFatal = TRUE) {
+ request <-
+ paste0(
+ '?q=',
+ urltools::url_encode(enc2utf8(as.character(query))),
+ ifelse(vc != '',
+ paste0('&cq=', urltools::url_encode(enc2utf8(vc))),
+ ''),
+ '&ql=',
+ ql
+ )
+ webUIRequestUrl <- paste0(kco@KorAPUrl, request)
+ requestUrl <- paste0(
+ kco@apiUrl,
+ 'search',
+ request,
+ '&fields=',
+ paste(fields, collapse = ","),
+ if (metadataOnly)
+ '&access-rewrite-disabled=true'
+ else
+ ''
+ )
+ webUIRequestUrl
+}
+
#´ format()
#' @rdname KorAPQuery-class
#' @param x KorAPQuery object
@@ -388,118 +462,3 @@
format(object)
})
-
-
-lemmatizeWordQuery <- function(w) {
- paste0('[tt/l=', w, ']')
-}
-
-#' Query frequencies of a node and a collocate and calculate collocation association scores
-#'
-#' \bold{\code{collocationScoreQuery}} computes various collocation association scores
-#' based on \code{\link{frequencyQuery}}s for a target word and a collocate.
-#'
-#' @aliases collocationScoreQuery
-#' @rdname KorAPQuery-class
-#'
-#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
-#' @param node target word
-#' @param collocate collocate of target word
-#' @param vc string describing the virtual corpus in which the query should be performed. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
-#' @param lemmatizeNodeQuery logical, set to TRUE if node query should be lemmatized, i.e. x -> [tt/l=x]
-#' @param lemmatizeCollocateQuery logical, set to TRUE if collocate query should be lemmatized, i.e. x -> [tt/l=x]
-#' @param leftContextSize size of the left context window
-#' @param rightContextSize size of the right context window
-#' @param scoreFunctions named list of score functions of the form function(O1, O2, O, N, E, window_size), see e.g. \link{pmi}
-#' @param smoothingConstant smoothing constant will be added to all observed values
-#'
-#' @return tibble with query KorAP web request URL, all observed values and association scores
-#'
-#' @examples
-#' \donttest{
-#' new("KorAPConnection", verbose = TRUE) %>%
-#' collocationScoreQuery("Grund", "triftiger")
-#' }
-#'
-#' \donttest{
-#' new("KorAPConnection", verbose = TRUE) %>%
-#' collocationScoreQuery("Grund", c("guter", "triftiger"),
-#' scoreFunctions = list(localMI = function(O1, O2, O, N, E, window_size) { O * log2(O/E) }) )
-#' }
-#'
-#' \donttest{
-#' library(highcharter)
-#' library(tidyr)
-#' new("KorAPConnection", verbose = TRUE) %>%
-#' collocationScoreQuery("Team", "agil", vc = paste("pubDate in", c(2014:2018)),
-#' lemmatizeNodeQuery = TRUE, lemmatizeCollocateQuery = TRUE) %>%
-#' pivot_longer(14:last_col(), names_to = "measure", values_to = "score") %>%
-#' hchart(type="spline", hcaes(label, score, group=measure)) %>%
-#' hc_add_onclick_korap_search()
-#' }
-#'
-#' @importFrom tidyr pivot_longer
-#' @export
-setMethod("collocationScoreQuery", "KorAPConnection",
- function(kco,
- node,
- collocate,
- vc = "",
- lemmatizeNodeQuery = FALSE,
- lemmatizeCollocateQuery = FALSE,
- leftContextSize = 5,
- rightContextSize = 5,
- scoreFunctions = defaultAssociationScoreFunctions(),
- smoothingConstant = .5
- ) {
- # https://stackoverflow.com/questions/8096313/no-visible-binding-for-global-variable-note-in-r-cmd-check
- O1 <- O2 <- O <- N <- E <- w <- 0
-
- if (leftContextSize <= 0 && rightContextSize <= 0) {
- stop("At least one of leftContextSize and rightContextSize must be > 0",
- call. = FALSE)
- }
-
- if (lemmatizeNodeQuery) {
- node <- lemmatizeWordQuery(node)
- }
-
- if (lemmatizeCollocateQuery) {
- collocate <- lemmatizeWordQuery(collocate)
- }
-
- query <- ""
-
- if (leftContextSize > 0) {
- query <-
- paste0(collocate,
- if (leftContextSize > 1) paste0(" []{0,", leftContextSize - 1, "} ") else " ",
- node,
- if (rightContextSize > 0) " | ")
- }
-
- if (rightContextSize > 0) {
- query <-
- paste0(query, node,
- if (rightContextSize > 1) paste0(" []{0,", rightContextSize - 1, "} ") else " ", collocate)
- }
-
-
- tibble(
- node = node,
- collocate = collocate,
- label = queryStringToLabel(vc),
- vc = vc,
- webUIRequestUrl = frequencyQuery(kco, query, vc)$webUIRequestUrl,
- w = leftContextSize + rightContextSize,
- leftContextSize,
- rightContextSize,
- N = frequencyQuery(kco, node, vc)$total + smoothingConstant,
- O = as.double(frequencyQuery(kco, query, vc)$totalResults) + smoothingConstant,
- O1 = frequencyQuery(kco, node, vc)$totalResults + smoothingConstant,
- O2 = frequencyQuery(kco, collocate, vc)$totalResults + smoothingConstant,
- E = w * as.double(O1) * O2 / N
- ) %>%
- mutate(!!! lapply(scoreFunctions, mapply, .$O1, .$O2, .$O, .$N, .$E, .$w))
-
- })