#!/usr/bin/env Rscript
#
# Plot frequency of query expressions per topic domain
#
| 5 | library(RKorAPClient) |
| 6 | library(ggplot2) |
| 7 | |
# Plot and return the relative frequency of a query per topic domain.
#
# Runs `query` with an empty virtual corpus definition (`vc = ""`), fetches
# all matches, and assigns each match to the first topic listed in its
# `textClass` metadata. The per-domain hit counts are then divided by the
# token count that corpusStats() reports for the virtual corpus
# `textClass = /<topic>.*/`.
#
# Arguments:
#   query  Query expression passed on to corpusQuery().
#   con    KorAPConnection to use; by default a new verbose connection.
#
# Returns a data frame with one row per domain and the columns
#   Domain  primary topic (factor),
#   Freq    number of matches in that domain,
#   total   token count of the domain,
#   freq    Freq / total,
#   ci      two-column matrix with the prop.test() confidence interval.
# The bar chart is printed as a side effect.
#
# Stops with an error if no match carries any topic information.
freqPerDomain <- function(query, con = new("KorAPConnection", verbose = TRUE)) {
  q <- fetchAll(corpusQuery(con, query = query, vc = ""))

  tokensPerMainTopic <- function(topic) {
    corpusStats(con, sprintf("textClass = /%s.*/", topic))@tokens
  }

  # textClass may hold several space-separated topics; the first one is taken
  # as the primary topic. strsplit() yields character(0) for an empty string,
  # so such matches are mapped to NA (and then ignored by table()) instead of
  # failing with "subscript out of bounds".
  topicParts <- strsplit(as.character(q@collectedMatches$textClass), " ")
  primaryTopic <- vapply(
    topicParts,
    function(parts) if (length(parts) > 0) parts[[1]] else NA_character_,
    character(1)
  )

  df <- as.data.frame(table(primaryTopic, dnn = "Domain"))
  if (nrow(df) == 0) {
    stop(
      sprintf("No matches with topic domain information for query '%s'", query),
      call. = FALSE
    )
  }

  # One corpusStats() request per domain; pass plain strings, not factor
  # elements, to the query builder.
  df$total <- vapply(
    as.character(df$Domain),
    tokensPerMainTopic,
    numeric(1),
    USE.NAMES = FALSE
  )
  df$freq <- df$Freq / df$total

  # vapply() returns a 2 x n matrix (lower/upper bound per domain); transpose
  # it so that each data frame row holds its own interval.
  tests <- Map(prop.test, df$Freq, df$total)
  df$ci <- t(vapply(
    tests,
    function(test) as.numeric(test$conf.int),
    numeric(2)
  ))

  g <- ggplot(data = df, mapping = aes(x = Domain, y = freq)) +
    geom_col() +
    geom_errorbar(
      aes(ymin = ci[, 1], ymax = ci[, 2]),
      width = .5,
      alpha = .5
    ) +
    # Typographic quotes written as escapes to stay encoding-independent.
    ylab(sprintf("Observed frequency of \u201c%s\u201d", query)) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(g)
  df
}
# Demo run: plot the per-domain frequency of "Hatespeech" and keep the
# resulting frequency table for further inspection.
df <- freqPerDomain(query = "Hatespeech")
| 30 | |