blob: 63a058c9bc76a6f879127d576e381a1ba52f9f01 [file] [log] [blame]
Marc Kupietz827a3c12019-09-18 22:09:33 +02001#!/usr/bin/env Rscript
#
# Plot frequency of alternative expressions or spelling variants over time
#
5library(RKorAPClient)
6library(ggplot2)
Marc Kupietz69cc54a2019-09-30 12:06:54 +02007library(tidyr)
8library(dplyr)
Marc Kupietz827a3c12019-09-18 22:09:33 +02009library(plotly)
10library(htmlwidgets)
11
# Plot how the usage share of alternative expressions develops over time.
#
# For every combination of variant and year, one KorAP query is issued,
# restricted to the virtual corpus "textType = /Zeit.*/ & pubDate in <year>".
# Each variant's hit count is related to the summed hit count of all variants
# in the same year, a confidence interval for that share is computed with
# prop.test(), and an interactive plotly chart is printed. Clicking a point
# on a line opens the `webUIRequestUrl` reported for the corresponding query
# in a browser window named 'korap'.
#
# Args:
#   alternatives: character vector of KorAP query strings to compare.
#   years:        numeric vector of years; each one is appended to the
#                 virtual corpus definition and used as x-axis break.
#   kco:          KorAPConnection used for all queries.
#
# Returns:
#   A data frame with one row per year and variant and the columns
#   year, total, Variant, value, share, url and ci (a two-column matrix
#   holding the lower and upper confidence bound of share).
alternativesOverTime <- function(alternatives, years, kco = new("KorAPConnection", verbose = TRUE)) {
  stopifnot(length(alternatives) > 0, length(years) > 0)
  vc <- "textType = /Zeit.*/ & pubDate in"
  variants <- unique(alternatives)

  # One row per (year, variant); the first factor varies fastest, so rows are
  # ordered by year and, within a year, in the order of `variants`.
  df <- expand.grid(
    Variant = variants,
    year = years,
    stringsAsFactors = FALSE,
    KEEP.OUT.ATTRS = FALSE
  )

  # Query once and reuse the result for both the hit counts and the URLs
  # instead of sending every request to the server twice.
  hits <- corpusQuery(kco, query = df$Variant, vc = paste(vc, df$year))
  df$value <- hits$totalResults
  df$url <- hits$webUIRequestUrl

  df <- df %>%
    group_by(year) %>%
    mutate(total = sum(as.numeric(value))) %>%
    ungroup() %>%
    mutate(share = value / total) %>%
    select(year, total, Variant, value, share, url)

  # prop.test() stops on missing counts or a non-positive number of trials,
  # so years without any hits get an NA interval instead of aborting the run.
  conf_int <- function(x, n) {
    if (is.na(x) || is.na(n) || n <= 0) {
      return(c(NA_real_, NA_real_))
    }
    as.numeric(prop.test(x, n)$conf.int)
  }
  df$ci <- t(vapply(
    seq_len(nrow(df)),
    function(i) conf_int(df$value[i], df$total[i]),
    numeric(2)
  ))

  # Plot from a copy whose Variant is a factor with explicit levels: ggplot
  # would otherwise order a character column alphabetically, so the traces
  # would not necessarily line up with the order of `variants` used below.
  # The returned data frame keeps Variant as a character column.
  plot_df <- df
  plot_df$Variant <- factor(plot_df$Variant, levels = variants)

  g <- ggplot(data = plot_df, mapping = aes(x = year, y = share, color = Variant, fill = Variant)) +
    geom_ribbon(aes(ymin = ci[, 1], ymax = ci[, 2], color = Variant, fill = Variant), alpha = .3, linetype = 0) +
    geom_line() +
    geom_point() +
    ggtitle(paste0(alternatives, collapse = " vs. ")) +
    xlab("TIME") +
    ylab("Observed frequency ratio") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_x_continuous(breaks = unique(df$year))
  pp <- ggplotly(g, tooltip = c("x", "y"))

  # NOTE(review): assumes ggplotly emits one trace per variant and layer, in
  # layer order (ribbons, then lines, then points), so the line traces start
  # right after the ribbon traces -- confirm against the plotly version in use.
  line_offset <- length(variants)
  for (i in seq_along(variants)) {
    trace <- line_offset + i
    vdata <- df[df$Variant == variants[i], ]
    pp$x$data[[trace]]$customdata <- vdata$url
    pp$x$data[[trace]]$text <- sprintf("%s<br />absolute: %d / %d", pp$x$data[[trace]]$text, vdata$value, vdata$total)
  }

  ppp <- onRender(pp, "function(el, x) { el.on('plotly_click', function(d) { var url=d.points[0].customdata; window.open(url, 'korap') })}")
  print(ppp)
  df
}
44
# Example run: compare the two query variants for the years 1995 to 2018.
df <- alternativesOverTime(
  alternatives = c('so "genannte.?"', '"sogenannte.?"'),
  years = 1995:2018
)