#' Add confidence interval and relative frequency variables
#'
#' Using \code{\link{prop.test}}, \code{ci} adds three columns to a data frame:
#' 1. relative frequency (\code{f})
#' 2. lower bound of a confidence interval (\code{conf.low})
#' 3. upper bound of a confidence interval (\code{conf.high})
#'
#' Rows whose absolute or total frequency is missing, or whose total frequency
#' is not positive, cannot be tested by \code{prop.test}. For these rows all
#' three columns are set to \code{NA} instead of aborting the computation for
#' the whole table.
#'
#' @seealso
#' \code{ci} is already included in \code{\link{frequencyQuery}}
#'
#' @param df table with columns for absolute and total frequencies.
#' @param x column with the observed absolute frequency.
#' @param N column with the total frequencies.
#' @param conf.level confidence level of the returned confidence interval. Must
#' be a single number between 0 and 1.
#'
#' @return \code{df} with the additional columns \code{f}, \code{conf.low} and
#' \code{conf.high}; the order of the rows is preserved.
#'
#' @export
#' @importFrom stats prop.test
#' @importFrom tibble remove_rownames
#' @importFrom dplyr enquo rename starts_with
#' @examples
#' library(ggplot2)
#' kco <- new("KorAPConnection", verbose=TRUE)
#' expand_grid(year=2015:2018, alternatives=c("Hate Speech", "Hatespeech")) %>%
#'   bind_cols(corpusQuery(kco, .$alternatives, sprintf("pubDate in %d", .$year))) %>%
#'   mutate(total=corpusStats(kco, vc=vc)$tokens) %>%
#'   ci() %>%
#'   ggplot(aes(x=year, y=f, fill=query, color=query, ymin=conf.low, ymax=conf.high)) +
#'     geom_point() + geom_line() + geom_ribbon(alpha=.3)
#'
ci <- function(df, x = totalResults, N = total, conf.level = 0.95) {
  x <- enquo(x)
  N <- enquo(N)

  # Computes the estimate and confidence bounds for a single row.
  # prop.test() stops on a missing count or a non-positive number of trials
  # (e.g. an empty virtual corpus); such rows yield NA values so that one
  # degenerate row does not make the whole call fail.
  ci_row <- function(successes, trials) {
    if (is.na(successes) || is.na(trials) || trials <= 0) {
      return(tibble::tibble(
        f = NA_real_,
        conf.low = NA_real_,
        conf.high = NA_real_
      ))
    }
    broom::tidy(prop.test(successes, trials, conf.level = conf.level)) %>%
      select("estimate", "conf.low", "conf.high") %>%
      rename(f = estimate)
  }

  df %>%
    rowwise %>%
    # One single-row result table per input row, expanded into columns below.
    mutate(tst = list(ci_row(!!x, !!N))) %>%
    tidyr::unnest(tst)
}
| 43 | |
# Declare the names that are only referenced through non-standard evaluation,
# so that R CMD check does not list them in its note
# "Undefined global functions or variables:".
globalVariables(
  c(
    "totalResults",
    "total",
    "estimate",
    "tst"
  )
)
Marc Kupietz | dcc1de6 | 2019-10-04 09:10:36 +0200 | [diff] [blame] | 46 | |
| 47 | |
Marc Kupietz | 71d6e05 | 2019-11-22 18:42:10 +0100 | [diff] [blame^] | 48 | # ci.old <- function(df, x = totalResults, N = total, conf.level = 0.95) { |
Marc Kupietz | dcc1de6 | 2019-10-04 09:10:36 +0200 | [diff] [blame] | 49 | # x <- deparse(substitute(x)) |
| 50 | # N <- deparse(substitute(N)) |
| 51 | # df <- data.frame(df) |
| 52 | # df$f <- df[,x] / df[,N] |
| 53 | # df[, c("conf.low", "conf.high")] <- t(sapply(Map(function(a, b) prop.test(a, b, conf.level = conf.level), df[,x], df[,N]), "[[","conf.int")) |
| 54 | # return(df) |
| 55 | # } |