blob: 9abbe076d6cce9ebf283c7da87ee15f3df8a5fa3 [file] [log] [blame]
Marc Kupietza6e4ee62021-03-05 09:00:15 +01001#' Misc functions
2#'
3#' @name misc-functions
4NULL
5#' NULL
Marc Kupietzbb7d2322019-10-06 21:42:34 +02006
7#' Convert corpus frequency table to instances per million.
8#'
9#' Convenience function for converting frequency tables to instances per
10#' million.
11#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020012#' Given a table with columns `f`, `conf.low`, and `conf.high`, `ipm` ads a `column ipm`
13#' und multiplies conf.low and `conf.high` with 10^6.
Marc Kupietzbb7d2322019-10-06 21:42:34 +020014#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020015#' @param df table returned from [frequencyQuery()]
Marc Kupietzbb7d2322019-10-06 21:42:34 +020016#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020017#' @return original table with additional column `ipm` and converted columns `conf.low` and `conf.high`
Marc Kupietzbb7d2322019-10-06 21:42:34 +020018#' @export
19#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010020#' @rdname misc-functions
Marc Kupietzbb7d2322019-10-06 21:42:34 +020021#' @importFrom dplyr .data
22#'
23#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020024#' \dontrun{
25#'
Marc Kupietzbb7d2322019-10-06 21:42:34 +020026#' new("KorAPConnection") %>% frequencyQuery("Test", paste0("pubDate in ", 2000:2002)) %>% ipm()
Marc Kupietz05b22772020-02-18 21:58:42 +010027#' }
Marc Kupietzbb7d2322019-10-06 21:42:34 +020028ipm <- function(df) {
29 df %>%
30 mutate(ipm = .data$f * 10^6, conf.low = .data$conf.low * 10^6, conf.high = .data$conf.high * 10^6)
31}
32
Marc Kupietz23daf5b2019-11-27 10:28:07 +010033#' Convert corpus frequency table of alternatives to percent
34#'
35#' Convenience function for converting frequency tables of alternative variants
Marc Kupietz67edcb52021-09-20 21:54:24 +020036#' (generated with `as.alternatives=TRUE`) to percent.
Marc Kupietz23daf5b2019-11-27 10:28:07 +010037#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020038#' @param df table returned from [frequencyQuery()]
Marc Kupietz23daf5b2019-11-27 10:28:07 +010039#'
Marc Kupietz67edcb52021-09-20 21:54:24 +020040#' @return original table with converted columns `f`, `conf.low` and `conf.high`
Marc Kupietz23daf5b2019-11-27 10:28:07 +010041#' @export
42#'
43#' @importFrom dplyr .data
44#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010045#' @rdname misc-functions
Marc Kupietz23daf5b2019-11-27 10:28:07 +010046#' @examples
Marc Kupietz6ae76052021-09-21 10:34:00 +020047#' \dontrun{
48#'
Marc Kupietz23daf5b2019-11-27 10:28:07 +010049#' new("KorAPConnection") %>%
50#' frequencyQuery(c("Tollpatsch", "Tolpatsch"),
51#' vc=paste0("pubDate in ", 2000:2002),
52#' as.alternatives = TRUE) %>%
53#' percent()
Marc Kupietz05b22772020-02-18 21:58:42 +010054#' }
Marc Kupietz23daf5b2019-11-27 10:28:07 +010055percent <- function(df) {
56 df %>%
57 mutate(f = .data$f * 10^2, conf.low = .data$conf.low * 10^2, conf.high = .data$conf.high * 10^2)
58}
59
Marc Kupietz95240e92019-11-27 18:19:04 +010060#' Convert query or vc strings to plot labels
61#'
62#' Converts a vector of query or vc strings to typically appropriate legend labels
63#' by clipping off prefixes and suffixes that are common to all query strings.
64#'
65#' @param data string or vector of query or vc definition strings
Marc Kupietz62d29a12020-01-18 12:38:36 +010066#' @param pubDateOnly discard all but the publication date
67#' @param excludePubDate discard publication date constraints
Marc Kupietz95240e92019-11-27 18:19:04 +010068#' @return string or vector of strings with clipped off common prefixes and suffixes
69#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +010070#' @rdname misc-functions
71#'
Marc Kupietz95240e92019-11-27 18:19:04 +010072#' @examples
73#' queryStringToLabel(paste("textType = /Zeit.*/ & pubDate in", c(2010:2019)))
74#' queryStringToLabel(c("[marmot/m=mood:subj]", "[marmot/m=mood:ind]"))
75#' queryStringToLabel(c("wegen dem [tt/p=NN]", "wegen des [tt/p=NN]"))
76#'
77#' @importFrom PTXQC lcpCount
78#' @importFrom PTXQC lcsCount
79#'
80#' @export
Marc Kupietzcf1771d2020-03-04 16:03:04 +010081queryStringToLabel <- function(data, pubDateOnly = FALSE, excludePubDate = FALSE) {
Marc Kupietz62d29a12020-01-18 12:38:36 +010082 if (pubDateOnly) {
Marc Kupietz7aa4f192021-03-05 10:51:24 +010083 data <-substring(data, regexpr("(pub|creation)Date", data)+7)
Marc Kupietz62d29a12020-01-18 12:38:36 +010084 } else if(excludePubDate) {
Marc Kupietz7aa4f192021-03-05 10:51:24 +010085 data <-substring(data, 1, regexpr("(pub|creation)Date", data))
Marc Kupietz62d29a12020-01-18 12:38:36 +010086 }
Marc Kupietz95240e92019-11-27 18:19:04 +010087 leftCommon = lcpCount(data)
Marc Kupietz62d29a12020-01-18 12:38:36 +010088 while (leftCommon > 0 && grepl("[[:alnum:]/=.*!]", substring(data[1], leftCommon, leftCommon))) {
Marc Kupietz95240e92019-11-27 18:19:04 +010089 leftCommon <- leftCommon - 1
90 }
91 rightCommon = lcsCount(data)
Marc Kupietz62d29a12020-01-18 12:38:36 +010092 while (rightCommon > 0 && grepl("[[:alnum:]/=.*!]", substring(data[1], 1+nchar(data[1]) - rightCommon, 1+nchar(data[1]) - rightCommon))) {
Marc Kupietz95240e92019-11-27 18:19:04 +010093 rightCommon <- rightCommon - 1
94 }
95 substring(data, leftCommon + 1, nchar(data) - rightCommon)
96}
97
Marc Kupietzbb7d2322019-10-06 21:42:34 +020098
Marc Kupietz865760f2019-10-07 19:29:44 +020099## Mute notes: "Undefined global functions or variables:"
100globalVariables(c("conf.high", "conf.low", "onRender", "webUIRequestUrl"))
101
102
103#' Experimental: Plot frequency by year graphs with confidence intervals
Marc Kupietzd68f9712019-10-06 21:48:00 +0200104#'
Marc Kupietz865760f2019-10-07 19:29:44 +0200105#' Experimental convenience function for plotting typical frequency by year graphs with confidence intervals using ggplot2.
Marc Kupietz67edcb52021-09-20 21:54:24 +0200106#' **Warning:** This function may be moved to a new package.
Marc Kupietz865760f2019-10-07 19:29:44 +0200107#'
108#' @param mapping Set of aesthetic mappings created by aes() or aes_(). If specified and inherit.aes = TRUE (the default), it is combined with the default mapping at the top level of the plot. You must supply mapping if there is no plot mapping.
109#' @param ... Other arguments passed to geom_ribbon, geom_line, and geom_click_point.
Marc Kupietzd68f9712019-10-06 21:48:00 +0200110#'
Marc Kupietza6e4ee62021-03-05 09:00:15 +0100111#' @rdname misc-functions
112#'
Marc Kupietzd68f9712019-10-06 21:48:00 +0200113#' @examples
Marc Kupietz548ac352023-04-18 17:38:37 +0200114#' \dontrun{
Marc Kupietzd68f9712019-10-06 21:48:00 +0200115#' library(ggplot2)
116#' kco <- new("KorAPConnection", verbose=TRUE)
Marc Kupietz6ae76052021-09-21 10:34:00 +0200117#'
Marc Kupietzd68f9712019-10-06 21:48:00 +0200118#' expand_grid(condition = c("textDomain = /Wirtschaft.*/", "textDomain != /Wirtschaft.*/"),
Marc Kupietz2fbac3d2020-01-18 11:01:21 +0100119#' year = (2005:2011)) %>%
Marc Kupietzd68f9712019-10-06 21:48:00 +0200120#' cbind(frequencyQuery(kco, "[tt/l=Heuschrecke]",
121#' paste0(.$condition," & pubDate in ", .$year))) %>%
122#' ipm() %>%
Marc Kupietz865760f2019-10-07 19:29:44 +0200123#' ggplot(aes(year, ipm, fill = condition, color = condition)) +
Marc Kupietzd68f9712019-10-06 21:48:00 +0200124#' geom_freq_by_year_ci()
Marc Kupietz05b22772020-02-18 21:58:42 +0100125#' }
Marc Kupietz865760f2019-10-07 19:29:44 +0200126#' @importFrom ggplot2 ggplot aes geom_ribbon geom_line geom_point theme element_text scale_x_continuous
Marc Kupietzd68f9712019-10-06 21:48:00 +0200127#'
128#' @export
Marc Kupietz865760f2019-10-07 19:29:44 +0200129geom_freq_by_year_ci <- function(mapping = aes(ymin=conf.low, ymax=conf.high), ...) {
Marc Kupietzd68f9712019-10-06 21:48:00 +0200130 list(
Marc Kupietz865760f2019-10-07 19:29:44 +0200131 geom_ribbon(mapping,
132 alpha = .3, linetype = 0, show.legend = FALSE, ...),
133 geom_line(...),
134 geom_click_point(aes(url=webUIRequestUrl), ...),
Marc Kupietz38e4e022024-04-29 17:21:48 +0200135 theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position.inside = c(0.8, 0.2)),
Marc Kupietzd68f9712019-10-06 21:48:00 +0200136 scale_x_continuous(breaks = function(x) seq(ceiling(x[1]), floor(x[2]), by = 1 + floor(((x[2]-x[1])/30)))))
137}
138
Marc Kupietz5fb892e2021-03-05 08:18:25 +0100139#'
Marc Kupietz865760f2019-10-07 19:29:44 +0200140#' @importFrom ggplot2 ggproto aes GeomPoint
Marc Kupietz5fb892e2021-03-05 08:18:25 +0100141#'
142GeomClickPoint <- ggplot2::ggproto(
Marc Kupietz865760f2019-10-07 19:29:44 +0200143 "GeomPoint",
Marc Kupietz5fb892e2021-03-05 08:18:25 +0100144 ggplot2::GeomPoint,
Marc Kupietz865760f2019-10-07 19:29:44 +0200145 required_aes = c("x", "y"),
146 default_aes = aes(
147 shape = 19, colour = "black", size = 1.5, fill = NA,
148 alpha = NA, stroke = 0.5, url = NA
149 ),
150 extra_params = c("na.rm", "url"),
151 draw_panel = function(data, panel_params,
152 coord, na.rm = FALSE, showpoints = TRUE, url = NULL) {
153 GeomPoint$draw_panel(data, panel_params, coord, na.rm = na.rm)
154 }
155)
156
157#' @importFrom ggplot2 layer
158geom_click_point <- function(mapping = NULL, data = NULL, stat = "identity",
Marc Kupietz5fb892e2021-03-05 08:18:25 +0100159 position = "identity", na.rm = FALSE, show.legend = NA,
160 inherit.aes = TRUE, url = NA, ...) {
Marc Kupietz865760f2019-10-07 19:29:44 +0200161 layer(
162 geom = GeomClickPoint, mapping = mapping, data = data, stat = stat,
163 position = position, show.legend = show.legend, inherit.aes = inherit.aes,
164 params = list(na.rm = na.rm, ...)
165 )
166}
167