Marc Kupietz | a97e8d0 | 2019-09-25 20:06:24 +0200 | [diff] [blame] | 1 | #!/usr/bin/env Rscript |
| 2 | # |
| 3 | # Plot frequency of query expressions per topic domain |
| 4 | # |
| 5 | library(RKorAPClient) |
| 6 | library(ggplot2) |
| 7 | |
# Plot the frequency of a query expression per topic domain.
#
# Runs `query` against the corpus reachable through `con`, fetches all
# matches, and uses the first space-separated token of each match's
# `textClass` as its topic domain. Matches are counted per domain, the token
# total of every domain is looked up with corpusStats(), and the result is
# passed through ci() and ipm() before being drawn as a bar chart with error
# bars.
#
# Args:
#   query: Query string handed to corpusQuery(); also shown in the y-axis
#          label.
#   con:   KorAPConnection to use. Defaults to a new, verbose connection.
#
# Returns:
#   The per-domain table that was plotted. The plot reads its columns
#   `Domain`, `ipm`, `conf.low` and `conf.high`; `count` and `total` are
#   added here, the remaining ones are expected to come from ci()/ipm().
#
# Side effects:
#   Prints the plot. Unlike earlier revisions, no variable is written to the
#   global environment; use the return value instead.
freqPerDomain <- function(query, con = new("KorAPConnection", verbose = TRUE)) {
  # Keep the table in a local variable rather than leaking it with `<<-`,
  # which would silently overwrite any `df` in the caller's workspace.
  df <- corpusQuery(con, query = query, vc = "") %>%
    fetchAll() %>%
    slot("collectedMatches") %>%
    # `[` (not `[[`) yields NA for an empty textClass instead of raising
    # "subscript out of bounds"; such rows are removed by the filter below.
    # vapply() additionally guarantees a character vector for any input.
    mutate(Domain = factor(vapply(
      strsplit(as.character(.$textClass), " "),
      `[`,
      character(1),
      1
    ))) %>%
    group_by(Domain) %>%
    dplyr::filter(!is.na(Domain)) %>%
    summarise(count = dplyr::n()) %>%
    # One virtual-corpus definition per domain: every text whose textClass
    # starts with that domain name.
    mutate(total = (corpusStats(con, sprintf("textClass = /%s.*/", .$Domain)))$tokens) %>%
    ci(x = count) %>%
    ipm()

  g <- ggplot(df, aes(x = Domain, y = ipm, ymin = conf.low, ymax = conf.high)) +
    geom_col() +
    geom_errorbar(width = .3, alpha = .3) +
    ylab(sprintf("Observed frequency/million of \u201c%s\u201d", query)) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(g)
  df
}
# Example run: plot the per-domain frequency of "Hatespeech" and keep the
# resulting table for further inspection.
df <- freqPerDomain(query = "Hatespeech")
| 29 | |