Vectorize corpusQuery and corpusStats
Change-Id: If2deeeeef2b2d64169dd21e5514dac6f8e458b32
diff --git a/R/KorAPConnection.R b/R/KorAPConnection.R
index fa8f1f5..85d8997 100644
--- a/R/KorAPConnection.R
+++ b/R/KorAPConnection.R
@@ -9,7 +9,6 @@
#' @import dplyr
#' @import purrr
#' @import tidyr
-#'
#' @export
KorAPConnection <- setClass("KorAPConnection", slots=c(KorAPUrl="character", apiVersion="character", apiUrl="character", userAgent="character", timeout="numeric", verbose="logical", cache="logical"))
@@ -61,6 +60,9 @@
setGeneric("apiCall", function(kco, ...) standardGeneric("apiCall") )
+## quiets concerns of R CMD check re: the .'s that appear in pipelines
+if(getRversion() >= "2.15.1") utils::globalVariables(c("."))
+
#' @aliases apiCall
#' @rdname KorAPConnection-class
#' @param kco KorAPConnection object
diff --git a/R/KorAPCorpusStats.R b/R/KorAPCorpusStats.R
index 7767adf..3513d5d 100644
--- a/R/KorAPCorpusStats.R
+++ b/R/KorAPCorpusStats.R
@@ -13,12 +13,16 @@
#' @slot paragraphs number of paragraphs
setClass("KorAPCorpusStats", slots=c(vc="character", documents="numeric", tokens="numeric", sentences="numeric", paragraphs="numeric"))
+log.info <- function(v, ...) {
+ cat(ifelse(v, paste0(...), ""))
+}
setGeneric("corpusStats", function(kco, ...) standardGeneric("corpusStats") )
#' Fetch information about a (virtual) corpus
#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
#' @param vc string describing the virtual corpus. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
#' @param verbose logical. If \code{TRUE}, additional diagnostics are printed.
+#' @param as.df return result as data frame instead of as S4 object?
#' @return \code{KorAPCorpusStats} object with the slots \code{documents}, \code{tokens}, \code{sentences}, \code{paragraphs}
#'
#' @examples
@@ -29,16 +33,24 @@
#'
#' @aliases corpusStats
#' @export
-setMethod("corpusStats", "KorAPConnection", function(kco, vc="", verbose = kco@verbose) {
- url <- paste0(kco@apiUrl, 'statistics?cq=', URLencode(vc, reserved=TRUE))
- if (verbose) {
- cat("Calculating size of corpus \"", vc,"\"", sep="")
- }
- res <- apiCall(kco, url)
- if (verbose) {
- cat("\n")
- }
- new("KorAPCorpusStats", vc = vc, documents = res$documents, tokens = res$tokens, sentences = res$sentences, paragraphs = res$paragraphs)
+setMethod("corpusStats", "KorAPConnection", function(kco, vc="", verbose = kco@verbose, as.df = FALSE) {
+ ifelse(length(vc) > 1,
+ return(
+ do.call(rbind,
+ Map(function(cq) corpusStats(kco, cq, verbose, as.df = TRUE), vc))
+ ), {
+ url <- paste0(kco@apiUrl, 'statistics?cq=', URLencode(vc, reserved=TRUE))
+ log.info(verbose, "Calculating size of corpus \"", vc,"\"", sep="")
+ res <- apiCall(kco, url)
+ log.info(verbose, "\n")
+ ifelse(as.df,
+ return(data.frame(vc=vc, res, stringsAsFactors = FALSE)),
+ return(new("KorAPCorpusStats", vc = vc, documents = res$documents,
+ tokens = res$tokens,
+ sentences = res$sentences,
+ paragraphs = res$paragraphs))
+ )
+ })
})
#' @rdname KorAPCorpusStats-class
diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index 3cc0b37..f71a4f3 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R
@@ -4,11 +4,12 @@
#' New \code{KorAPQuery} objects are typically created by the \code{\link{corpusQuery}} method.
#'
#' @include KorAPConnection.R
+#' @import jsonlite
#' @import tidyr
#' @import dplyr
#' @import httr
#'
-#'
+#' @include RKorAPClient.R
#' @export
KorAPQuery <- setClass("KorAPQuery", slots = c(
@@ -76,9 +77,8 @@
}
}
-KorAPQueryStringFromUrl <- function(KorAPUrl) {
- return(URLdecode(gsub(".*[?&]q=([^&]*).*", '\\1', KorAPUrl, perl = TRUE)))
-}
+## quiets concerns of R CMD check re: the .'s that appear in pipelines
+if(getRversion() >= "2.15.1") utils::globalVariables(c("."))
#' Method corpusQuery
#'
@@ -93,6 +93,7 @@
#' @param fields (meta)data fields that will be fetched for every match.
#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficent rights (not yet implemented).
#' @param verbose print some info
+#' @param as.df return result as data frame instead of as S4 object?
#' @return A \code{\link{KorAPQuery}} object that, among other information, contains the total number of results in \code{@totalResults}. The resulting object can be used to fetch all query results (with \code{\link{fetchAll}}) or the next page of results (with \code{\link{fetchNext}}).
#' A corresponding URL to be used within a web browser is contained in \code{@webUIRequestUrl}
#' Please make sure to check \code{$collection$rewrites} to see if any unforseen access rewrites of the query's virtual corpus had to be performed.
@@ -115,12 +116,12 @@
#' fetchAll() %>%
#' slot("collectedMatches") %>%
#' mutate(year = lubridate::year(pubDate)) %>%
-#' dplyr::select(year) %>%
+#' select(year) %>%
#' group_by(year) %>%
#' summarise(Count = n()) %>%
#' mutate(Freq = mapply(function(f, y)
#' f / corpusStats(kco, paste("pubDate in", y))@tokens, Count, year)) %>%
-#' dplyr::select(-Count) %>%
+#' select(-Count) %>%
#' complete(year = min(year):max(year), fill = list(Freq = 0)) %>%
#' plot(type = "l")
#'
@@ -132,47 +133,55 @@
#' @aliases corpusQuery
#' @export
setMethod("corpusQuery", "KorAPConnection",
- function(kco, query, vc="", KorAPUrl, metadataOnly = TRUE, ql = "poliqarp", fields = defaultFields,
- accessRewriteFatal = TRUE, verbose = kco@verbose) {
- defaultFields <- c("corpusSigle", "textSigle", "pubDate", "pubPlace",
- "availability", "textClass", "snippet")
- contentFields <- c("snippet")
- fields <- fields[!fields %in% contentFields]
-
- if (missing(query) && missing(KorAPUrl) || ! (missing(query) || missing(KorAPUrl))) {
- stop("Exactly one of the parameters query and KorAPUrl must be specified.")
- }
- if (missing(query)) {
- query <- QueryParameterFromUrl(KorAPUrl, "q")
- vc <- QueryParameterFromUrl(KorAPUrl, "cq")
- ql <- QueryParameterFromUrl(KorAPUrl, "ql")
- }
- request <- paste0('?q=', URLencode(query, reserved=TRUE),
- ifelse(vc != '', paste0('&cq=', URLencode(vc, reserved=TRUE)), ''), '&ql=', ql)
- webUIRequestUrl <- paste0(kco@KorAPUrl, request)
- requestUrl <- paste0(kco@apiUrl, 'search', request,
- '&fields=', paste(fields, collapse = ","),
- ifelse(metadataOnly, '&access-rewrite-disabled=true', ''))
- if (verbose) {
- cat("Searching \"", query, "\" in \"", vc, "\"", sep="")
- }
- res = apiCall(kco, paste0(requestUrl, '&count=0'))
- if (verbose) {
- cat(" took ", res$meta$benchmark, "\n", sep="")
- }
- KorAPQuery(
- korapConnection = kco,
- nextStartIndex = 0,
- fields = fields,
- requestUrl = requestUrl,
- request = request,
- totalResults = res$meta$totalResults,
- vc = vc,
- apiResponse = res,
- webUIRequestUrl = webUIRequestUrl,
- hasMoreMatches = (res$meta$totalResults > 0),
- )
- })
+ function(kco,
+ query = ifelse(missing(KorAPUrl),
+ stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE),
+ httr::parse_url(KorAPUrl)$query$q),
+ vc = ifelse(missing(KorAPUrl), "", httr::parse_url(KorAPUrl)$query$cq),
+ KorAPUrl,
+ metadataOnly = TRUE,
+ ql = ifelse(missing(KorAPUrl), "poliqarp", httr::parse_url(KorAPUrl)$query$ql),
+ fields = c("corpusSigle", "textSigle", "pubDate", "pubPlace",
+ "availability", "textClass", "snippet"),
+ accessRewriteFatal = TRUE,
+ verbose = kco@verbose,
+ as.df = FALSE) {
+ ifelse(length(query) > 1 , {
+ #grid <- expand_grid(query=query, vc=vc)
+ return(
+ do.call(rbind,
+ Map(function(q, cq) corpusQuery(kco, query=q, vc=cq,
+ verbose=verbose, as.df = TRUE), query, vc))
+ )}, {
+ contentFields <- c("snippet")
+ fields <- fields[!fields %in% contentFields]
+ request <- paste0('?q=', URLencode(query, reserved=TRUE),
+ ifelse(vc != '', paste0('&cq=', URLencode(vc, reserved=TRUE)), ''), '&ql=', ql)
+ webUIRequestUrl <- paste0(kco@KorAPUrl, request)
+ requestUrl <- paste0(kco@apiUrl, 'search', request,
+ '&fields=', paste(fields, collapse = ","),
+ ifelse(metadataOnly, '&access-rewrite-disabled=true', ''))
+ log.info(verbose, "Searching \"", query, "\" in \"", vc, "\"", sep="")
+ res = apiCall(kco, paste0(requestUrl, '&count=0'))
+ log.info(verbose, " took ", res$meta$benchmark, "\n", sep="")
+ ifelse(as.df,
+ return(data.frame(query=query,
+ totalResults=res$meta$totalResults,
+ vc=vc,
+ webUIRequestUrl=webUIRequestUrl, stringsAsFactors = FALSE)),
+ return(KorAPQuery(
+ korapConnection = kco,
+ nextStartIndex = 0,
+ fields = fields,
+ requestUrl = requestUrl,
+ request = request,
+ totalResults = res$meta$totalResults,
+ vc = vc,
+ apiResponse = res,
+ webUIRequestUrl = webUIRequestUrl,
+ hasMoreMatches = (res$meta$totalResults > 0),
+ )))})
+ })
#' Fetch the next bunch of results of a KorAP query.
#'
diff --git a/R/reexports.R b/R/reexports.R
index f6624ba..1a4d729 100644
--- a/R/reexports.R
+++ b/R/reexports.R
@@ -10,6 +10,9 @@
#' @importFrom magrittr %>%
#' @usage lhs \%>\% rhs
NULL
+#' @importFrom dplyr bind_cols
+#' @export
+dplyr::bind_cols
#' @importFrom dplyr mutate
#' @export
dplyr::mutate
@@ -25,6 +28,18 @@
#' @importFrom tidyr complete
#' @export
tidyr::complete
+#' @importFrom tidyr expand_grid
+#' @export
+tidyr::expand_grid
#' @importFrom lubridate year
#' @export
lubridate::year
+#' @importFrom tibble as_tibble rownames_to_column
+#' @export
+tibble::as_tibble
+tibble::rownames_to_column
+#' @importFrom broom tidy
+#' @export
+broom::tidy
+#' @importFrom ggplot2 ggplot geom_bar geom_line
+#' @export