Make separate examples proper R-style demos

Change-Id: I0ac284cfc1d0c508030c91189f260299680c1485
diff --git a/demo/00Index b/demo/00Index
new file mode 100644
index 0000000..7fe8aff
--- /dev/null
+++ b/demo/00Index
@@ -0,0 +1,5 @@
+frequenciesOverTime       Plot frequency of query expressions over time
+frequenciesOverDomains    Box plot frequency of query expressions per topic domain
+conditionsOverTime        Plot frequency of query expressions over time under different conditions
+alternativesOverTime      Plot proportion of alternative spellings/variants over time
+regional                  Map plot regional frequencies of query expression
diff --git a/demo/alternativesOverTime.R b/demo/alternativesOverTime.R
new file mode 100755
index 0000000..7866121
--- /dev/null
+++ b/demo/alternativesOverTime.R
@@ -0,0 +1,46 @@
+#!/usr/bin/env Rscript
+#
+# Plot frequency of alternative expressions or spellings variants over time
+#
+library(RKorAPClient)
+library(ggplot2)
+library(reshape2)
+library(plotly)
+library(htmlwidgets)
+
+alternativesOverTime <- function(alternatives, years, kco = new("KorAPConnection", verbose=TRUE)) {
+  df = data.frame(year=years)
+  vc = "textType = /Zeit.*/ & pubDate in"
+  urls <- data.frame()
+  for (v in alternatives) {
+    df[v] <- sapply(df$year, function(y) {
+        kqo <- corpusQuery(kco, query=v, vc=paste(vc, y))
+        urls <<- rbind(urls, data.frame(Variant=v, year=y, url=kqo@webUIRequestUrl))
+        kqo@totalResults
+    })
+  }
+  df$total <- apply(df[,alternatives], 1, sum)
+  df <- merge(melt(df, measure.vars = alternatives, value.name = "afreq", variable.name = "Variant"),
+              urls, by=c("Variant", "year"))
+  df$ci <- t(sapply(Map(prop.test, df$afreq, df$total), "[[","conf.int"))
+  df$share <- df$afreq / df$total
+  g <- ggplot(data = df, mapping = aes(x = year, y = share, color=Variant, fill=Variant)) +
+    geom_ribbon(aes(ymin=ci[, 1], ymax=ci[, 2], color=Variant, fill=Variant), alpha=.3, linetype=0) +
+    geom_line() +
+    geom_point() +
+    ggtitle(paste0(alternatives, collapse = " vs. ")) +
+    xlab("TIME") +
+    ylab(sprintf("Observed frequency ratio")) +
+    theme(axis.text.x = element_text(angle = 45, hjust = 1)) + scale_x_continuous(breaks=unique(df$year))
+  pp <- ggplotly(g, tooltip = c("x", "y"))
+  for (i in 1:length(alternatives)) {
+    vdata <- df[df$Variant==alternatives[i],]
+    pp$x$data[[2+i]]$customdata <- vdata$url
+    pp$x$data[[2+i]]$text <- sprintf("%s<br />absolute: %d / %d", pp$x$data[[2+i]]$text, vdata$afreq, vdata$total)
+  }
+  ppp <- onRender(pp, "function(el, x) { el.on('plotly_click', function(d) { var url=d.points[0].customdata; window.open(url, 'korap') })}")
+  print(ppp)
+  df
+}
+
+df <- alternativesOverTime(c('so "genannte.?"', '"sogenannte.?"'), (1995:2018))
diff --git a/demo/conditionsOverTime.R b/demo/conditionsOverTime.R
new file mode 100644
index 0000000..ff832e0
--- /dev/null
+++ b/demo/conditionsOverTime.R
@@ -0,0 +1,38 @@
+#!/usr/bin/env Rscript
+#
+# Plot frequency of an expressions under multiple conditions over time
+#
+#library(devtools)
+#install_git("https://korap.ids-mannheim.de/gerrit/KorAP/RKorAPClient", upgrade="never")
+library(RKorAPClient)
+library(ggplot2)
+library(reshape2)
+#library(plotly)
+
+conditionsOverTime <- function(query, conditions, years, kco = new("KorAPConnection", verbose = TRUE)) {
+  df = data.frame(year=years)
+  for (c in conditions) {
+    df[c] <- sapply(df$year, function(y)
+      corpusQuery(kco, query, vc=paste(c, "& pubDate in", y))@totalResults)
+
+  }
+  df <- melt(df, measure.vars = conditions, value.name = "afreq", variable.name = "condition")
+  df$total <- apply(df[,c('year','condition')], 1, function(x) corpusStats(kco, vc=paste(x[2], "& pubDate in", x[1]))@tokens )
+  df$ci <- t(sapply(Map(prop.test, df$afreq, df$total), "[[","conf.int"))
+  df$freq <- df$afreq / df$total
+  g <- ggplot(data = df, mapping = aes(x = year, y = freq, fill=condition, color=condition)) +
+    geom_point() +
+    geom_line() +
+    geom_ribbon(aes(ymin=ci[, 1], ymax=ci[, 2], fill=condition, color=condition), alpha=.3, linetype=0) +
+    xlab("TIME") +
+    labs(color="Virtual Corpus", fill="Virtual Corpus") +
+    ylab(sprintf("Observed frequency of \u201c%s\u201d", query)) +
+    theme(axis.text.x = element_text(angle = 45, hjust = 1))  + scale_x_continuous(breaks=unique(df$year))
+  print(g)
+  # print(ggplotly(g, tooltip = c("x", "y")))
+
+  df
+}
+
+df <- conditionsOverTime("[tt/l=Heuschrecke]", c("textClass = /natur.*/", "textClass=/politik.*/", "textClass=/wirtschaft.*/"), (2002:2018))
+#df <- conditionsOverTime("wegen dem [tt/p=NN]", c("textClass = /sport.*/", "textClass=/politik.*/", "textClass=/kultur.*/"), (1995:2005))
diff --git a/demo/data/regions.rds b/demo/data/regions.rds
new file mode 100644
index 0000000..cee9038
--- /dev/null
+++ b/demo/data/regions.rds
Binary files differ
diff --git a/demo/frequenciesOverDomains.R b/demo/frequenciesOverDomains.R
new file mode 100755
index 0000000..4c5c529
--- /dev/null
+++ b/demo/frequenciesOverDomains.R
@@ -0,0 +1,30 @@
+#!/usr/bin/env Rscript
+#
+# Plot frequency of query expressions per topic domain
+#
+library(RKorAPClient)
+library(ggplot2)
+
+freqPerDomain <- function(query, con = new("KorAPConnection", verbose = TRUE)) {
+  q <- corpusQuery(con, query = query, vc="")
+  q <- fetchAll(q)
+  tokensPerMainTopic <-
+    function(topic) {
+      return(corpusStats(con, sprintf("textClass = /%s.*/", topic))@tokens)
+    }
+  q@collectedMatches$primaryTopic <-
+    sapply(strsplit(as.character(q@collectedMatches$textClass), " "), `[[`, 1)
+  df <- as.data.frame(table(q@collectedMatches$primaryTopic, dnn = "Domain"))
+  df$total <- sapply(df$Domain, tokensPerMainTopic)
+  df$freq <- df$Freq / df$total
+  df$ci <- t(sapply(Map(prop.test, df$Freq, df$total), "[[","conf.int"))
+  g <- ggplot(data = df, mapping = aes(x = Domain, y = freq)) +
+    geom_col() +
+    geom_errorbar(aes(ymin=ci[, 1], ymax=ci[, 2]), width=.5, alpha=.5) +
+    ylab(sprintf("Observed frequency of \u201c%s\u201d", query)) +
+    theme(axis.text.x = element_text(angle = 45, hjust = 1))
+  print(g)
+  df
+}
+df <- freqPerDomain("Hatespeech")
+
diff --git a/demo/frequenciesOverTime.R b/demo/frequenciesOverTime.R
new file mode 100644
index 0000000..475d2eb
--- /dev/null
+++ b/demo/frequenciesOverTime.R
@@ -0,0 +1,35 @@
+#!/usr/bin/env Rscript
+#
+# Plot frequency of query expressions over time
+#
+library(RKorAPClient)
+library(ggplot2)
+
+freqPerYear <- function(query, con = new("KorAPConnection", verbose = TRUE)) {
+  vc <- "pubDate since 2000 & pubDate until 2018 & textType = /Zeit.*/"
+  q <- corpusQuery(con, query = query, vc=vc)
+  q <- fetchAll(q)
+  tokensPerYear <- function(year) {
+    return(corpusStats(con, sprintf("%s & pubDate in %s", vc, year))@tokens)
+  }
+  df <- as.data.frame(table(as.numeric(format(q@collectedMatches$pubDate,"%Y")), dnn="year"),
+                      stringsAsFactors = FALSE)
+  df <- merge(data.frame(year=min(df$year):max(df$year)), df, all = TRUE)
+  df[is.na(df$Freq),]$Freq <- 0
+  df$total <- sapply(df$year, tokensPerYear)
+  df$freq <- df$Freq / df$total
+  df$ci <- t(sapply(Map(prop.test, df$Freq, df$total), "[[","conf.int"))
+  g <- ggplot(data = df, aes(x = year, y = freq, group=1)) +
+    geom_ribbon(aes(ymin=ci[, 1], ymax=ci[, 2]), alpha=.3) +
+    geom_point() +
+    geom_line() +
+    xlab("TIME") +
+    ylab(sprintf("Observed frequency of \u201c%s\u201d", query)) +
+    theme(axis.text.x = element_text(angle = 45, hjust = 1))
+  print(g)
+  df
+}
+#df <- freqPerYear("Car-Bikini")
+#df <- freqPerYear("[tt/p=ART & opennlp/p=ART] [tt/l=teilweise] [tt/p=NN]")
+df <- freqPerYear("Buschzulage")
+
diff --git a/demo/regional.R b/demo/regional.R
new file mode 100755
index 0000000..067e3e2
--- /dev/null
+++ b/demo/regional.R
@@ -0,0 +1,96 @@
+#!/usr/bin/Rscript
+library(RKorAPClient)
+library(ggplot2)
+library(raster)
+library(broom)
+library(plotly)
+library(htmlwidgets)
+
+devAskNewPage(ask = FALSE)
+mapfile <- "demo/data/cache/map-v2.rds"
+
+fetchAndPrepareMap <- function(map, pick) {
+  cat("Downloading GADM map data for ", map, "\n")
+  sp <- readRDS(url(sprintf("https://biogeo.ucdavis.edu/data/gadm3.6/Rsp/gadm36_%s_sp.rds", map)))
+  if (pick > 0) {
+    sp@polygons <- sp@polygons[pick]
+    sp@data <- sp@data[pick,]
+  }
+  sp
+}
+
+fetchMaps <- function(maps, picks) {
+  if (file.exists(mapfile)) {
+    df <- readRDS(mapfile)
+  } else {
+    cat("Downloading and caching GADM map data.\nPlease note that the GADM map data is licensed for academic use and other non-commercial use, only.\nSee https://gadm.org/license.html\n")
+    df <- broom::tidy(Reduce(bind, mapply(fetchAndPrepareMap, maps, picks)))
+    dir.create(dirname(mapfile), recursive = TRUE, showWarnings = FALSE)
+    saveRDS(df, mapfile)
+  }
+  df$grp <- floor(as.numeric(as.character(df$group)))
+  df
+}
+
+map <- fetchMaps(c("DEU_1", "AUT_0", "CHE_0", "LUX_0", "BEL_3", "ITA_1", "LIE_0"), c(0, 0, 0, 0, 34, 17, 0))
+
+geoDistrib <- function(query, kco = new("KorAPConnection", verbose=TRUE)) {
+  regions <- readRDS("demo/data/regions.rds")
+  regions$freq <- NA
+  regions$url <- NA
+  plot <- NULL
+  vc <- ""
+  for (i in 1:nrow(regions)) {
+    if (!is.na(regions[i,]$query)) {
+      cat(as.character(regions[i,]$region), "\n")
+      regions[i,]$total <- corpusStats(kco, vc=paste0(vc, regions[i,]$query))@tokens
+      if (regions[i,]$total == 0) {
+        regions[i,]$afreq <- 0
+        regions[i,]$freq <- NA
+      } else {
+        kqo <- corpusQuery(kco, query, vc=paste0(vc, regions[i,]$query))
+        regions[i,]$afreq <- kqo@totalResults
+        regions[i,]$freq <- regions[i,]$afreq / regions[i,]$total
+        regions[i,]$url <- kqo@webUIRequestUrl
+      }
+      cat(regions[i,]$afreq, regions[i,]$total, regions[i,]$freq, "\n")
+      plot <- updatePlot(query, map, regions)
+      cat("\n\n")
+    }
+  }
+  pp <- ggplotly(plot)
+  for (i in 1:nrow(regions)) {
+    j <- grep(paste0(regions$region[i], "\""), pp$x$data, perl=TRUE)
+    pp$x$data[[j]]$customdata <- regions[i,]$url
+  }
+  ppp <- onRender(pp, "function(el, x) { el.on('plotly_click', function(d) { var url=d.points[0].data.customdata; window.open(url, 'korap') })}")
+  print(ppp)
+  pp
+}
+
+updatePlot <- function(query, map, regions) {
+  map$ipm <- sapply(map$grp, function(grp) regions$freq[grp] * 10^6)
+  map$region <- sapply(map$grp, function(grp) regions$region[grp])
+  map$url <- sapply(map$grp, function(grp) regions$url[grp])
+  regionsPlot <- ggplot(map) +
+    geom_polygon(aes(x=long, y=lat, group=group, fill=ipm, text=region), colour= "black", size=.1) +
+    theme(axis.line.x = element_blank(),
+          axis.line.y = element_blank(),
+          panel.grid.major = element_blank(),
+          panel.grid.minor = element_blank(),
+          panel.border = element_blank(),
+          panel.background = element_blank(),
+          axis.line=element_blank(),axis.text.x=element_blank(),
+          axis.text.y=element_blank(),axis.ticks=element_blank(),
+          axis.title.x=element_blank(),
+          axis.title.y=element_blank()) +
+    coord_equal(ratio=1.5) +
+    labs(title = sprintf("Regional distribution of \u201c%s\u201d", query))
+  print(regionsPlot)
+  regionsPlot
+}
+
+#geoDistrib("wegen dem [tt/p=NN]")
+geoDistrib("heuer")
+#geoDistrib("Sonnabend")
+#geoDistrib("eh")