#!/usr/bin/env Rscript
#
# Plot the frequency of query expressions over time
#
| 5 | library(RKorAPClient) |
| 6 | library(ggplot2) |
| 7 | |
# Plot the relative frequency of a query expression per publication year.
#
# Runs `query` against the virtual corpus defined below (pubDate 2000-2018,
# textType matching /Zeit.*/), counts the collected matches per year, divides
# each count by the token count that corpusStats() reports for the same
# virtual corpus restricted to that year, and prints a ggplot line chart with
# a ribbon spanning the confidence interval returned by prop.test().
#
# Args:
#   query: query string passed to corpusQuery().
#   con:   connection object; defaults to a new verbose KorAPConnection.
#
# Returns:
#   A data frame with one row per year between the first and the last year
#   that has matches, with columns `year`, `Freq` (match count), `total`
#   (token count), `freq` (Freq / total) and `ci` (two-column matrix holding
#   the lower and upper confidence bound).
#
# Stops with an error when the query yields no dated matches.
freqPerYear <- function(query, con = new("KorAPConnection", verbose = TRUE)) {
  vc <- "pubDate since 2000 & pubDate until 2018 & textType = /Zeit.*/"
  q <- corpusQuery(con, query = query, vc = vc)
  q <- fetchAll(q)

  # Token count of the virtual corpus restricted to a single year.
  tokensPerYear <- function(year) {
    corpusStats(con, sprintf("%s & pubDate in %s", vc, year))@tokens
  }

  counts <- table(
    as.numeric(format(q@collectedMatches$pubDate, "%Y")),
    dnn = "year"
  )
  if (length(counts) == 0) {
    # Without this guard, min()/max() below fail with an obscure error.
    stop(sprintf("No dated matches found for query '%s'", query),
         call. = FALSE)
  }

  df <- as.data.frame(counts, stringsAsFactors = FALSE)
  # table() labels are character; make the join key integer explicitly
  # instead of relying on implicit coercion in `:` and merge().
  df$year <- as.integer(df$year)

  # Add a row for every year in the observed range, including years with
  # no matches (their Freq is NA after the merge).
  df <- merge(data.frame(year = min(df$year):max(df$year)), df, all = TRUE)

  # Index the column vector, not a row subset of the data frame: the form
  # `df[is.na(df$Freq), ]$Freq <- 0` errors ("replacement has 1 row, data
  # has 0") whenever no year is missing.
  df$Freq[is.na(df$Freq)] <- 0

  df$total <- vapply(df$year, tokensPerYear, numeric(1))
  df$freq <- df$Freq / df$total

  # One prop.test() per year; keep only the confidence interval. vapply()
  # yields a 2 x n matrix, transposed to n x 2 (lower, upper).
  tests <- Map(prop.test, df$Freq, df$total)
  df$ci <- t(vapply(tests, function(res) res[["conf.int"]], numeric(2)))

  g <- ggplot(data = df, aes(x = year, y = freq, group = 1)) +
    geom_ribbon(aes(ymin = ci[, 1], ymax = ci[, 2]), alpha = .3) +
    geom_point() +
    geom_line() +
    xlab("TIME") +
    ylab(sprintf("Observed frequency of \u201c%s\u201d", query)) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(g)
  df
}
# Example runs: only the last query is active; the others are kept as
# disabled alternatives.
# df <- freqPerYear("Car-Bikini")
# df <- freqPerYear("[tt/p=ART & opennlp/p=ART] [tt/l=teilweise] [tt/p=NN]")
df <- freqPerYear(query = "Buschzulage")
| 35 | |