blob: 5410b3bfe0712298e0176594f8234b953b97522a [file] [log] [blame]
#' Add confidence interval and relative frequency variables
#'
#' Using \code{\link{prop.test}}, \code{ci} adds three columns to a data frame:
#' 1. relative frequency (\code{f})
#' 2. lower bound of a confidence interval (\code{conf.low})
#' 3. upper bound of a confidence interval (\code{conf.high})
#'
#' The proportion test is run separately for every row of \code{df}, so each
#' row gets its own estimate and interval. The result is returned without any
#' row-wise grouping.
#'
#' @seealso
#' \code{ci} is already included in \code{\link{frequencyQuery}}
#'
#' @param df table with columns for absolute and total frequencies.
#' @param x column with the observed absolute frequency.
#' @param N column with the total frequencies.
#' @param conf.level confidence level of the returned confidence interval. Must
#' be a single number between 0 and 1.
#'
#' @return \code{df} with the additional columns \code{f}, \code{conf.low} and
#' \code{conf.high}.
#'
#' @export
#' @importFrom stats prop.test
#' @importFrom tibble remove_rownames
#' @importFrom dplyr enquo rename starts_with
#' @examples
#' library(ggplot2)
#' kco <- new("KorAPConnection", verbose=TRUE)
#' expand_grid(year=2015:2018, alternatives=c("Hate Speech", "Hatespeech")) %>%
#'   bind_cols(corpusQuery(kco, .$alternatives, sprintf("pubDate in %d", .$year))) %>%
#'   mutate(total=corpusStats(kco, vc=vc)$tokens) %>%
#'   ci() %>%
#'   ggplot(aes(x=year, y=f, fill=query, color=query, ymin=conf.low, ymax=conf.high)) +
#'   geom_point() + geom_line() + geom_ribbon(alpha=.3)
#'
ci <- function(df, x = totalResults, N = total, conf.level = 0.95) {
  # Capture the column arguments unevaluated so they can be spliced into the
  # per-row prop.test() call below.
  x <- enquo(x)
  N <- enquo(N)
  df %>%
    # prop.test() is evaluated one row at a time.
    rowwise() %>%
    mutate(tst = list(
      broom::tidy(prop.test(!!x, !!N, conf.level = conf.level)) %>%
        select("estimate", "conf.low", "conf.high") %>%
        rename(f = estimate)
    )) %>%
    # Drop the row-wise grouping explicitly so that callers always receive an
    # ungrouped table, independent of how unnest() treats row-wise input.
    dplyr::ungroup() %>%
    # Spread the one-row result tibble into the columns f, conf.low, conf.high.
    tidyr::unnest(tst)
}
43
# Silence the R CMD check note "Undefined global functions or variables:"
# for the names that ci() uses through non-standard evaluation.
utils::globalVariables(
  c("totalResults", "total", "estimate", "tst")
)
Marc Kupietzdcc1de62019-10-04 09:10:36 +020046
47
Marc Kupietz71d6e052019-11-22 18:42:10 +010048# ci.old <- function(df, x = totalResults, N = total, conf.level = 0.95) {
Marc Kupietzdcc1de62019-10-04 09:10:36 +020049# x <- deparse(substitute(x))
50# N <- deparse(substitute(N))
51# df <- data.frame(df)
52# df$f <- df[,x] / df[,N]
53# df[, c("conf.low", "conf.high")] <- t(sapply(Map(function(a, b) prop.test(a, b, conf.level = conf.level), df[,x], df[,N]), "[[","conf.int"))
54# return(df)
55# }