#!/usr/bin/env Rscript
#
# Plot frequency of query expressions per topic domain
#
5library(RKorAPClient)
6library(ggplot2)
7
# Plot how often a query expression occurs in each topic domain.
#
# Arguments:
#   query  Query expression; passed to corpusQuery() and shown in the y-axis
#          label.
#   con    KorAPConnection used for all requests. Defaults to a new verbose
#          connection.
#
# Returns the per-domain frequency table (one row per Domain, with the match
# count and the token total of the domain) after printing a bar chart with
# error bars built from it.
freqPerDomain <- function(query, con = new("KorAPConnection", verbose = TRUE)) {
  # Keep the table in a local variable: it is needed both for the plot and as
  # the return value, so there is no reason to route it through `<<-`.
  freq <- corpusQuery(con, query = query, vc = "") %>%
    fetchAll() %>%
    slot("collectedMatches") %>%
    # The domain is the first space-separated entry of textClass. `[1]` yields
    # NA for an empty or missing textClass (where `[[1]]` would raise
    # "subscript out of bounds"); such rows are dropped by the filter below.
    mutate(Domain = factor(vapply(
      strsplit(as.character(.$textClass), " "),
      function(classes) classes[1],
      character(1)
    ))) %>%
    group_by(Domain) %>%
    dplyr::filter(!is.na(Domain)) %>%
    summarise(count = dplyr::n()) %>%
    # Token total of the part of the corpus whose textClass starts with the
    # domain name; one corpusStats() lookup per remaining domain.
    mutate(
      total = (corpusStats(con, sprintf("textClass = /%s.*/", .$Domain)))$tokens
    ) %>%
    # ci() and ipm() are expected to supply the conf.low/conf.high and ipm
    # columns that the plot below maps to its aesthetics.
    ci(x = count) %>%
    ipm()

  g <- ggplot(
    freq,
    aes(x = Domain, y = ipm, ymin = conf.low, ymax = conf.high)
  ) +
    geom_col() +
    geom_errorbar(width = .3, alpha = .3) +
    ylab(sprintf("Observed frequency/million of \u201c%s\u201d", query)) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(g)
  freq
}
# Run the analysis for an example query and keep the resulting table.
df <- freqPerDomain(query = "Hatespeech")
29