#!/usr/bin/env Rscript
#
# Plot frequency of an expression under multiple conditions over time
#
#library(devtools)
#install_git("https://korap.ids-mannheim.de/gerrit/KorAP/RKorAPClient", upgrade="never")
7library(RKorAPClient)
8library(ggplot2)
9library(reshape2)
10#library(plotly)
11
# Query how often an expression occurs under several virtual-corpus
# conditions, year by year, and plot the resulting relative frequencies.
#
# For every combination of condition and year the virtual corpus
# "<condition> & pubDate in <year>" is used twice: once with corpusQuery() to
# read the @totalResults slot and once with corpusStats() to read the @tokens
# slot.
#
# Arguments:
#   query       query string passed to corpusQuery(); also shown in the
#               y-axis label.
#   conditions  character vector of virtual corpus definitions; each one
#               becomes a separate series in the plot.
#   years       vector of years; each is appended as "pubDate in <year>".
#   kco         connection object handed to corpusQuery() and corpusStats().
#
# Returns a long-format data frame with one row per condition and year and
# the columns year, condition, afreq (hits), total (tokens), ci (two-column
# matrix with the lower and upper bound of the prop.test() confidence
# interval) and freq (afreq / total). The plot is printed as a side effect.
conditionsOverTime <- function(query,
                               conditions,
                               years,
                               kco = new("KorAPConnection", verbose = TRUE)) {
  stopifnot(length(conditions) > 0, length(years) > 0)

  # Wide table first: one column of hit counts per condition.
  df <- data.frame(year = years)
  for (condition in conditions) {
    df[condition] <- vapply(
      df$year,
      function(y) {
        vc <- paste(condition, "& pubDate in", y)
        as.numeric(corpusQuery(kco, query, vc = vc)@totalResults)
      },
      numeric(1)
    )
  }
  df <- melt(
    df,
    measure.vars = conditions,
    value.name = "afreq",
    variable.name = "condition"
  )

  # Build the virtual corpus definitions from the typed columns. Routing the
  # data frame through apply() would coerce it to a character matrix first,
  # which pads numbers of differing width with blanks.
  vcs <- paste(as.character(df$condition), "& pubDate in", df$year)
  df$total <- vapply(
    vcs,
    function(vc) as.numeric(corpusStats(kco, vc = vc)@tokens),
    numeric(1),
    USE.NAMES = FALSE
  )

  # prop.test() rejects missing or non-positive trial counts, so such rows
  # get an NA interval instead of aborting the whole run.
  df$ci <- t(vapply(
    seq_len(nrow(df)),
    function(i) {
      hits <- df$afreq[i]
      tokens <- df$total[i]
      if (is.na(hits) || is.na(tokens) || tokens <= 0) {
        return(c(NA_real_, NA_real_))
      }
      as.numeric(prop.test(hits, tokens)$conf.int)
    },
    numeric(2)
  ))
  df$freq <- df$afreq / df$total

  g <- ggplot(data = df, mapping = aes(x = year, y = freq, fill = condition, color = condition)) +
    geom_point() +
    geom_line() +
    # Shaded band between the lower and upper confidence bounds.
    geom_ribbon(aes(ymin = ci[, 1], ymax = ci[, 2], fill = condition, color = condition), alpha = .3, linetype = 0) +
    xlab("TIME") +
    labs(color = "Virtual Corpus", fill = "Virtual Corpus") +
    ylab(sprintf("Observed frequency of \u201c%s\u201d", query)) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    # One tick per queried year.
    scale_x_continuous(breaks = unique(df$year))
  print(g)
  # print(ggplotly(g, tooltip = c("x", "y")))

  df
}
36
# Example run: frequency of the query in three text classes, 2002 to 2018.
df <- conditionsOverTime(
  query = "[tt/l=Heuschrecke]",
  conditions = c(
    "textClass = /natur.*/",
    "textClass=/politik.*/",
    "textClass=/wirtschaft.*/"
  ),
  years = 2002:2018
)
# Alternative example (disabled):
#df <- conditionsOverTime("wegen dem [tt/p=NN]", c("textClass = /sport.*/", "textClass=/politik.*/", "textClass=/kultur.*/"), (1995:2005))