#' Add confidence interval and relative frequency variables
#'
#' Using [prop.test()], `ci` adds three columns to a data frame:
#' 1. relative frequency (`f`)
#' 2. lower bound of a confidence interval (`conf.low`)
#' 3. upper bound of a confidence interval (`conf.high`)
#'
#' Rows that cannot be tested, i.e. rows whose total frequency `N` is missing
#' or not positive, or whose observed frequency `x` is missing, get `NA` in
#' all three new columns. These rows are appended after the tested rows, so
#' the row order of the result can differ from the row order of `df`.
#'
#' @seealso
#' `ci` is already included in [frequencyQuery()]
#'
#' @param df table with columns for absolute and total frequencies.
#' @param x  column with the observed absolute frequency.
#' @param N  column with the total frequencies
#' @param conf.level confidence level of the returned confidence interval. Must
#'   be a single number between 0 and 1.
#'
#' @return `df` with the additional columns `f`, `conf.low` and `conf.high`.
#'
#' @rdname misc-functions
#'
#' @export
#' @importFrom stats prop.test
#' @importFrom tibble remove_rownames
#' @importFrom dplyr enquo rename starts_with
#' @examples
#' \dontrun{
#'
#' library(ggplot2)
#' kco <- new("KorAPConnection", verbose=TRUE)
#' expand_grid(year=2015:2018, alternatives=c("Hate Speech", "Hatespeech")) %>%
#'   bind_cols(corpusQuery(kco, .$alternatives, sprintf("pubDate in %d", .$year))) %>%
#'   mutate(total=corpusStats(kco, vc=vc)$tokens) %>%
#'   ci() %>%
#'   ggplot(aes(x=year, y=f, fill=query, color=query, ymin=conf.low, ymax=conf.high)) +
#'     geom_point() + geom_line() + geom_ribbon(alpha=.3)
#' }
ci <- function(df,
               x = totalResults,
               N = total,
               conf.level = 0.95) {
  x <- enquo(x)
  N <- enquo(N)

  # Split on the columns the caller actually passed as `x` and `N` rather than
  # on a hard-coded `total` column. The explicit is.na() checks keep rows with
  # missing counts in the result; a bare comparison would drop them from both
  # halves of the split.
  nas <- df %>%
    dplyr::filter(is.na(!!x) | is.na(!!N) | (!!N) <= 0) %>%
    # Typed NAs so the placeholder columns are double, like the estimates.
    mutate(f = NA_real_, conf.low = NA_real_, conf.high = NA_real_)

  df %>%
    dplyr::filter(!is.na(!!x), !is.na(!!N), (!!N) > 0) %>%
    # One prop.test() call per row.
    rowwise() %>%
    mutate(tst = list(
      broom::tidy(prop.test(!!x, !!N, conf.level = conf.level)) %>%
        select(estimate, conf.low, conf.high) %>%
        rename(f = estimate)
    )) %>%
    tidyr::unnest(tst) %>%
    # Untestable rows are appended at the end with NA estimates.
    bind_rows(nas)
}
|  | 56 |  | 
## Mute notes: "Undefined global functions or variables:"
## The names below are column names that the code above refers to as bare
## symbols, not real global objects.
globalVariables(
  c("totalResults", "total", "estimate", "tst")
)
| Marc Kupietz | dcc1de6 | 2019-10-04 09:10:36 +0200 | [diff] [blame] | 59 |  | 
|  | 60 |  | 
# ci.old <- function(df, x = totalResults, N = total, conf.level = 0.95) {
#   x <- deparse(substitute(x))
#   N <- deparse(substitute(N))
#   df <- data.frame(df)
#   df$f <- df[,x] / df[,N]
#   df[, c("conf.low", "conf.high")] <- t(sapply(Map(function(a, b) prop.test(a, b, conf.level = conf.level), df[,x], df[,N]), "[[","conf.int"))
#   return(df)
# }