Vectorize corpusQuery and corpusStats
Change-Id: If2deeeeef2b2d64169dd21e5514dac6f8e458b32
diff --git a/NAMESPACE b/NAMESPACE
index 3488a72..a34e671 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -6,6 +6,7 @@
export(bind_cols)
export(ci)
export(complete)
+export(expand_grid)
export(group_by)
export(mutate)
export(select)
@@ -27,11 +28,13 @@
import(R.cache)
import(dplyr)
import(httr)
+import(jsonlite)
import(methods)
import(purrr)
import(tidyr)
import(utils)
importFrom(broom,tidy)
+importFrom(dplyr,bind_cols)
importFrom(dplyr,group_by)
importFrom(dplyr,mutate)
importFrom(dplyr,select)
@@ -40,4 +43,7 @@
importFrom(lubridate,year)
importFrom(magrittr,"%>%")
importFrom(stats,prop.test)
+importFrom(tibble,as_tibble)
+importFrom(tibble,rownames_to_column)
importFrom(tidyr,complete)
+importFrom(tidyr,expand_grid)
diff --git a/R/KorAPConnection.R b/R/KorAPConnection.R
index fa8f1f5..85d8997 100644
--- a/R/KorAPConnection.R
+++ b/R/KorAPConnection.R
@@ -9,7 +9,6 @@
#' @import dplyr
#' @import purrr
#' @import tidyr
-#'
#' @export
KorAPConnection <- setClass("KorAPConnection", slots=c(KorAPUrl="character", apiVersion="character", apiUrl="character", userAgent="character", timeout="numeric", verbose="logical", cache="logical"))
@@ -61,6 +60,9 @@
setGeneric("apiCall", function(kco, ...) standardGeneric("apiCall") )
+## quiets concerns of R CMD check re: the .'s that appear in pipelines
+if(getRversion() >= "2.15.1") utils::globalVariables(c("."))
+
#' @aliases apiCall
#' @rdname KorAPConnection-class
#' @param kco KorAPConnection object
diff --git a/R/KorAPCorpusStats.R b/R/KorAPCorpusStats.R
index 7767adf..3513d5d 100644
--- a/R/KorAPCorpusStats.R
+++ b/R/KorAPCorpusStats.R
@@ -13,12 +13,16 @@
#' @slot paragraphs number of paragraphs
setClass("KorAPCorpusStats", slots=c(vc="character", documents="numeric", tokens="numeric", sentences="numeric", paragraphs="numeric"))
+log.info <- function(v, ...) {
+ cat(ifelse(v, paste0(...), ""))
+}
setGeneric("corpusStats", function(kco, ...) standardGeneric("corpusStats") )
#' Fetch information about a (virtual) corpus
#' @param kco \code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}
#' @param vc string describing the virtual corpus. An empty string (default) means the whole corpus, as far as it is license-wise accessible.
#' @param verbose logical. If \code{TRUE}, additional diagnostics are printed.
+#' @param as.df return result as data frame instead of as S4 object?
#' @return \code{KorAPCorpusStats} object with the slots \code{documents}, \code{tokens}, \code{sentences}, \code{paragraphs}
#'
#' @examples
@@ -29,16 +33,24 @@
#'
#' @aliases corpusStats
#' @export
-setMethod("corpusStats", "KorAPConnection", function(kco, vc="", verbose = kco@verbose) {
- url <- paste0(kco@apiUrl, 'statistics?cq=', URLencode(vc, reserved=TRUE))
- if (verbose) {
- cat("Calculating size of corpus \"", vc,"\"", sep="")
- }
- res <- apiCall(kco, url)
- if (verbose) {
- cat("\n")
- }
- new("KorAPCorpusStats", vc = vc, documents = res$documents, tokens = res$tokens, sentences = res$sentences, paragraphs = res$paragraphs)
+setMethod("corpusStats", "KorAPConnection", function(kco, vc="", verbose = kco@verbose, as.df = FALSE) {
+ ifelse(length(vc) > 1,
+ return(
+ do.call(rbind,
+ Map(function(cq) corpusStats(kco, cq, verbose, as.df = TRUE), vc))
+ ), {
+ url <- paste0(kco@apiUrl, 'statistics?cq=', URLencode(vc, reserved=TRUE))
+ log.info(verbose, "Calculating size of corpus \"", vc,"\"", sep="")
+ res <- apiCall(kco, url)
+ log.info(verbose, "\n")
+ ifelse(as.df,
+ return(data.frame(vc=vc, res, stringsAsFactors = FALSE)),
+ return(new("KorAPCorpusStats", vc = vc, documents = res$documents,
+ tokens = res$tokens,
+ sentences = res$sentences,
+ paragraphs = res$paragraphs))
+ )
+ })
})
#' @rdname KorAPCorpusStats-class
diff --git a/R/KorAPQuery.R b/R/KorAPQuery.R
index 3cc0b37..f71a4f3 100644
--- a/R/KorAPQuery.R
+++ b/R/KorAPQuery.R
@@ -4,11 +4,12 @@
#' New \code{KorAPQuery} objects are typically created by the \code{\link{corpusQuery}} method.
#'
#' @include KorAPConnection.R
+#' @import jsonlite
#' @import tidyr
#' @import dplyr
#' @import httr
#'
-#'
+#' @include RKorAPClient.R
#' @export
KorAPQuery <- setClass("KorAPQuery", slots = c(
@@ -76,9 +77,8 @@
}
}
-KorAPQueryStringFromUrl <- function(KorAPUrl) {
- return(URLdecode(gsub(".*[?&]q=([^&]*).*", '\\1', KorAPUrl, perl = TRUE)))
-}
+## quiets concerns of R CMD check re: the .'s that appear in pipelines
+if(getRversion() >= "2.15.1") utils::globalVariables(c("."))
#' Method corpusQuery
#'
@@ -93,6 +93,7 @@
#' @param fields (meta)data fields that will be fetched for every match.
#' @param accessRewriteFatal abort if query or given vc had to be rewritten due to insufficent rights (not yet implemented).
#' @param verbose print some info
+#' @param as.df return result as data frame instead of as S4 object?
#' @return A \code{\link{KorAPQuery}} object that, among other information, contains the total number of results in \code{@totalResults}. The resulting object can be used to fetch all query results (with \code{\link{fetchAll}}) or the next page of results (with \code{\link{fetchNext}}).
#' A corresponding URL to be used within a web browser is contained in \code{@webUIRequestUrl}
#' Please make sure to check \code{$collection$rewrites} to see if any unforseen access rewrites of the query's virtual corpus had to be performed.
@@ -115,12 +116,12 @@
#' fetchAll() %>%
#' slot("collectedMatches") %>%
#' mutate(year = lubridate::year(pubDate)) %>%
-#' dplyr::select(year) %>%
+#' select(year) %>%
#' group_by(year) %>%
#' summarise(Count = n()) %>%
#' mutate(Freq = mapply(function(f, y)
#' f / corpusStats(kco, paste("pubDate in", y))@tokens, Count, year)) %>%
-#' dplyr::select(-Count) %>%
+#' select(-Count) %>%
#' complete(year = min(year):max(year), fill = list(Freq = 0)) %>%
#' plot(type = "l")
#'
@@ -132,47 +133,55 @@
#' @aliases corpusQuery
#' @export
setMethod("corpusQuery", "KorAPConnection",
- function(kco, query, vc="", KorAPUrl, metadataOnly = TRUE, ql = "poliqarp", fields = defaultFields,
- accessRewriteFatal = TRUE, verbose = kco@verbose) {
- defaultFields <- c("corpusSigle", "textSigle", "pubDate", "pubPlace",
- "availability", "textClass", "snippet")
- contentFields <- c("snippet")
- fields <- fields[!fields %in% contentFields]
-
- if (missing(query) && missing(KorAPUrl) || ! (missing(query) || missing(KorAPUrl))) {
- stop("Exactly one of the parameters query and KorAPUrl must be specified.")
- }
- if (missing(query)) {
- query <- QueryParameterFromUrl(KorAPUrl, "q")
- vc <- QueryParameterFromUrl(KorAPUrl, "cq")
- ql <- QueryParameterFromUrl(KorAPUrl, "ql")
- }
- request <- paste0('?q=', URLencode(query, reserved=TRUE),
- ifelse(vc != '', paste0('&cq=', URLencode(vc, reserved=TRUE)), ''), '&ql=', ql)
- webUIRequestUrl <- paste0(kco@KorAPUrl, request)
- requestUrl <- paste0(kco@apiUrl, 'search', request,
- '&fields=', paste(fields, collapse = ","),
- ifelse(metadataOnly, '&access-rewrite-disabled=true', ''))
- if (verbose) {
- cat("Searching \"", query, "\" in \"", vc, "\"", sep="")
- }
- res = apiCall(kco, paste0(requestUrl, '&count=0'))
- if (verbose) {
- cat(" took ", res$meta$benchmark, "\n", sep="")
- }
- KorAPQuery(
- korapConnection = kco,
- nextStartIndex = 0,
- fields = fields,
- requestUrl = requestUrl,
- request = request,
- totalResults = res$meta$totalResults,
- vc = vc,
- apiResponse = res,
- webUIRequestUrl = webUIRequestUrl,
- hasMoreMatches = (res$meta$totalResults > 0),
- )
- })
+ function(kco,
+ query = ifelse(missing(KorAPUrl),
+ stop("At least one of the parameters query and KorAPUrl must be specified.", call. = FALSE),
+ httr::parse_url(KorAPUrl)$query$q),
+ vc = ifelse(missing(KorAPUrl), "", httr::parse_url(KorAPUrl)$query$cq),
+ KorAPUrl,
+ metadataOnly = TRUE,
+ ql = ifelse(missing(KorAPUrl), "poliqarp", httr::parse_url(KorAPUrl)$query$ql),
+ fields = c("corpusSigle", "textSigle", "pubDate", "pubPlace",
+ "availability", "textClass", "snippet"),
+ accessRewriteFatal = TRUE,
+ verbose = kco@verbose,
+ as.df = FALSE) {
+ ifelse(length(query) > 1 , {
+ #grid <- expand_grid(query=query, vc=vc)
+ return(
+ do.call(rbind,
+ Map(function(q, cq) corpusQuery(kco, query=q, vc=cq,
+ verbose=verbose, as.df = TRUE), query, vc))
+ )}, {
+ contentFields <- c("snippet")
+ fields <- fields[!fields %in% contentFields]
+ request <- paste0('?q=', URLencode(query, reserved=TRUE),
+ ifelse(vc != '', paste0('&cq=', URLencode(vc, reserved=TRUE)), ''), '&ql=', ql)
+ webUIRequestUrl <- paste0(kco@KorAPUrl, request)
+ requestUrl <- paste0(kco@apiUrl, 'search', request,
+ '&fields=', paste(fields, collapse = ","),
+ ifelse(metadataOnly, '&access-rewrite-disabled=true', ''))
+ log.info(verbose, "Searching \"", query, "\" in \"", vc, "\"", sep="")
+ res = apiCall(kco, paste0(requestUrl, '&count=0'))
+ log.info(verbose, " took ", res$meta$benchmark, "\n", sep="")
+ ifelse(as.df,
+ return(data.frame(query=query,
+ totalResults=res$meta$totalResults,
+ vc=vc,
+ webUIRequestUrl=webUIRequestUrl, stringsAsFactors = FALSE)),
+ return(KorAPQuery(
+ korapConnection = kco,
+ nextStartIndex = 0,
+ fields = fields,
+ requestUrl = requestUrl,
+ request = request,
+ totalResults = res$meta$totalResults,
+ vc = vc,
+ apiResponse = res,
+ webUIRequestUrl = webUIRequestUrl,
+ hasMoreMatches = (res$meta$totalResults > 0),
+ )))})
+ })
#' Fetch the next bunch of results of a KorAP query.
#'
diff --git a/R/reexports.R b/R/reexports.R
index f6624ba..1a4d729 100644
--- a/R/reexports.R
+++ b/R/reexports.R
@@ -10,6 +10,9 @@
#' @importFrom magrittr %>%
#' @usage lhs \%>\% rhs
NULL
+#' @importFrom dplyr bind_cols
+#' @export
+dplyr::bind_cols
#' @importFrom dplyr mutate
#' @export
dplyr::mutate
@@ -25,6 +28,18 @@
#' @importFrom tidyr complete
#' @export
tidyr::complete
+#' @importFrom tidyr expand_grid
+#' @export
+tidyr::expand_grid
#' @importFrom lubridate year
#' @export
lubridate::year
+#' @importFrom tibble as_tibble rownames_to_column
+#' @export
+tibble::as_tibble
+tibble::rownames_to_column
+#' @importFrom broom tidy
+#' @export
+broom::tidy
+#' @importFrom ggplot2 ggplot geom_bar geom_line
+#' @export
diff --git a/man/corpusQuery-KorAPConnection-method.Rd b/man/corpusQuery-KorAPConnection-method.Rd
index e623297..49e636b 100644
--- a/man/corpusQuery-KorAPConnection-method.Rd
+++ b/man/corpusQuery-KorAPConnection-method.Rd
@@ -6,9 +6,16 @@
\alias{corpusQuery}
\title{Method corpusQuery}
\usage{
-\S4method{corpusQuery}{KorAPConnection}(kco, query, vc = "", KorAPUrl,
- metadataOnly = TRUE, ql = "poliqarp", fields = defaultFields,
- accessRewriteFatal = TRUE, verbose = kco@verbose)
+\S4method{corpusQuery}{KorAPConnection}(kco,
+ query = ifelse(missing(KorAPUrl),
+ stop("At least one of the parameters query and KorAPUrl must be specified.",
+ call. = FALSE), httr::parse_url(KorAPUrl)$query$q),
+ vc = ifelse(missing(KorAPUrl), "", httr::parse_url(KorAPUrl)$query$cq),
+ KorAPUrl, metadataOnly = TRUE, ql = ifelse(missing(KorAPUrl),
+ "poliqarp", httr::parse_url(KorAPUrl)$query$ql),
+ fields = c("corpusSigle", "textSigle", "pubDate", "pubPlace",
+ "availability", "textClass", "snippet"), accessRewriteFatal = TRUE,
+ verbose = kco@verbose, as.df = FALSE)
}
\arguments{
\item{kco}{\code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}}
@@ -28,6 +35,8 @@
\item{accessRewriteFatal}{abort if query or given vc had to be rewritten due to insufficent rights (not yet implemented).}
\item{verbose}{print some info}
+
+\item{as.df}{return result as data frame instead of as S4 object?}
}
\value{
A \code{\link{KorAPQuery}} object that, among other information, contains the total number of results in \code{@totalResults}. The resulting object can be used to fetch all query results (with \code{\link{fetchAll}}) or the next page of results (with \code{\link{fetchNext}}).
@@ -55,12 +64,12 @@
fetchAll() \%>\%
slot("collectedMatches") \%>\%
mutate(year = lubridate::year(pubDate)) \%>\%
- dplyr::select(year) \%>\%
+ select(year) \%>\%
group_by(year) \%>\%
summarise(Count = n()) \%>\%
mutate(Freq = mapply(function(f, y)
f / corpusStats(kco, paste("pubDate in", y))@tokens, Count, year)) \%>\%
- dplyr::select(-Count) \%>\%
+ select(-Count) \%>\%
complete(year = min(year):max(year), fill = list(Freq = 0)) \%>\%
plot(type = "l")
diff --git a/man/corpusStats-KorAPConnection-method.Rd b/man/corpusStats-KorAPConnection-method.Rd
index 486ee1c..8a13183 100644
--- a/man/corpusStats-KorAPConnection-method.Rd
+++ b/man/corpusStats-KorAPConnection-method.Rd
@@ -7,7 +7,7 @@
\title{Fetch information about a (virtual) corpus}
\usage{
\S4method{corpusStats}{KorAPConnection}(kco, vc = "",
- verbose = kco@verbose)
+ verbose = kco@verbose, as.df = FALSE)
}
\arguments{
\item{kco}{\code{\link{KorAPConnection}} object (obtained e.g. from \code{new("KorAPConnection")}}
@@ -15,6 +15,8 @@
\item{vc}{string describing the virtual corpus. An empty string (default) means the whole corpus, as far as it is license-wise accessible.}
\item{verbose}{logical. If \code{TRUE}, additional diagnostics are printed.}
+
+\item{as.df}{return result as data frame instead of as S4 object?}
}
\value{
\code{KorAPCorpusStats} object with the slots \code{documents}, \code{tokens}, \code{sentences}, \code{paragraphs}
diff --git a/man/reexports.Rd b/man/reexports.Rd
index 81287d2..b92085b 100644
--- a/man/reexports.Rd
+++ b/man/reexports.Rd
@@ -3,12 +3,16 @@
\docType{import}
\name{reexports}
\alias{reexports}
+\alias{bind_cols}
\alias{mutate}
\alias{select}
\alias{group_by}
\alias{summarise}
\alias{complete}
+\alias{expand_grid}
\alias{year}
+\alias{as_tibble}
+\alias{tidy}
\title{Objects exported from other packages}
\keyword{internal}
\description{
@@ -16,10 +20,14 @@
below to see their documentation.
\describe{
- \item{dplyr}{\code{\link[dplyr]{mutate}}, \code{\link[dplyr]{select}}, \code{\link[dplyr]{group_by}}, \code{\link[dplyr]{summarise}}}
+ \item{broom}{\code{\link[broom]{tidy}}}
+
+ \item{dplyr}{\code{\link[dplyr]{bind_cols}}, \code{\link[dplyr]{mutate}}, \code{\link[dplyr]{select}}, \code{\link[dplyr]{group_by}}, \code{\link[dplyr]{summarise}}}
\item{lubridate}{\code{\link[lubridate]{year}}}
- \item{tidyr}{\code{\link[tidyr]{complete}}}
+ \item{tibble}{\code{\link[tibble]{as_tibble}}}
+
+ \item{tidyr}{\code{\link[tidyr]{complete}}, \code{\link[tidyr]{expand_grid}}}
}}