Initial import

Change-Id: I6b958bbd011b77c870f4e01793a9e374505ffccc
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100644
index 0000000..92f7eec
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,42 @@
+# Use the rocker/tidyverse image, as it already contains tidyverse and devtools.
+image: rocker/tidyverse
+
+# Define the stages of the runner. At the moment,
+# just build (no test or deploy).
+stages:
+  - build
+
+build-job:
+  stage: build
+
+  # Cache directory; the script below points R_CACHE_ROOTPATH at it.
+  cache:
+    key: korap
+    paths:
+      - ./cache
+
+  before_script:
+    # Load start_section/end_section from this repository's own helper script.
+    - source ./ci/section_helper.sh
+
+    - start_section install_linux_packages "Installing missing Linux packages"
+    - apt-get update
+    - apt-get install -y libvulkan1 libu2f-udev build-essential libglpk40 libcurl4-gnutls-dev libxml2-dev libsodium-dev libsecret-1-dev libfontconfig1-dev libssl-dev libxt6 libpq-dev curl
+    - end_section install_linux_packages
+
+    - start_section install_r_packages "Installing missing R packages"
+    - R -e "install.packages(c('devtools', 'RKorAPClient', 'httr', 'shiny', 'shinythemes', 'highcharter'))"
+    - R -e 'devtools::install_git("https://korap.ids-mannheim.de/gerrit/IDS-Mannheim/idsThemeR")'
+    - end_section install_r_packages
+
+  script:
+    # Smoke test: start the Shiny app in the background on a fixed port and
+    # fail the job unless it answers an HTTP request successfully.
+    - start_section render "Running scripts"
+    - echo 'options(shiny.port=18000)' >> ~/.Rprofile
+    - R_CACHE_ROOTPATH=./cache Rscript shinyCorpusComposition.R &
+    - PID=$!
+    - sleep 10
+    - curl --fail --retry 5 --retry-delay 2 --retry-connrefused http://127.0.0.1:18000/
+    - kill $PID
+    - end_section render
diff --git a/CorpusCompositionAnalyzer.Rproj b/CorpusCompositionAnalyzer.Rproj
new file mode 100644
index 0000000..e83436a
--- /dev/null
+++ b/CorpusCompositionAnalyzer.Rproj
@@ -0,0 +1,16 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+AutoAppendNewline: Yes
+StripTrailingWhitespace: Yes
diff --git a/ci/section_helper.sh b/ci/section_helper.sh
new file mode 100644
index 0000000..ddefe0d
--- /dev/null
+++ b/ci/section_helper.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+# Helpers for collapsible sections in GitLab CI job logs; meant to be sourced.
+# Reference: https://docs.gitlab.com/ee/ci/jobs/#custom-collapsible-sections
+
+#
+# Opens a log section that is marked as collapsed.
+# Takes 2 parameters: a new section id and a heading/title.
+#
+function start_section() {
+  local id=$1
+  local title=$2
+  echo -e "\e[0Ksection_start:$(date +%s):${id}[collapsed=true]\r\e[0K\e[36;1m${title}\e[0m"
+}
+
+#
+# Closes a log section.
+# Takes 1 parameter: the unique section id of the section that should end.
+#
+function end_section() {
+  local id=$1
+  echo -e "\e[0Ksection_end:$(date +%s):${id}\r\e[0K"
+}
diff --git a/shinyCorpusComposition.R b/shinyCorpusComposition.R
new file mode 100644
index 0000000..3c8bccb
--- /dev/null
+++ b/shinyCorpusComposition.R
@@ -0,0 +1,182 @@
+library(shiny)
+library(shinythemes)
+library(highcharter)
+library(RKorAPClient)
+library(tidyverse)
+library(idsThemeR)
+
+# Page layout: four half-width charts in one row container. The
+# "hc-link-legend" class marks the container whose charts share legend clicks.
+ui <- fluidPage(
+  theme = shinytheme("paper"),
+  fluidRow(
+    column(width = 6, highchartOutput("country")),
+    column(width = 6, highchartOutput("domain")),
+    column(width = 6, highchartOutput("decade")),
+    column(width = 6, highchartOutput("texttype"))
+  ) %>% tagAppendAttributes(class = "hc-link-legend")
+)
+
+# Server logic: renders the token counts of the compared corpora by country of
+# publication, topic domain, decade and text type.
+server <- function(input, output, session) {
+
+  # Log the optional "cq" URL parameter; it is not used for the charts yet.
+  observe({
+    query <- parseQueryString(session$clientData$url_search)
+    if (!is.null(query[["cq"]])) {
+      message(query[["cq"]])
+    }
+  })
+
+  # Highcharts legendItemClick handler: toggles the series with the same index
+  # in every chart inside ".hc-link-legend"; returning false suppresses the
+  # default toggle of Highcharts.
+  sharelegend <- JS('function(event){
+    var vis = this.visible;
+    var conall = $(this.chart.container).parents(".hc-link-legend").find("div.highchart");
+    for(var i = 0; i < conall.length; i++){
+      var hc = $(conall[i]).highcharts();
+      var series = hc.series[this.index];
+      if(series){
+        if(vis){
+          series.hide();
+        } else{
+          series.show();
+        }
+      }
+    }
+    return false;
+  }')
+
+  # Virtual-corpus prefixes to compare; "" (no restriction) is later labelled
+  # "DeReKo-KorAP" by prettifyCorpusNames().
+  corpus <- c(
+    "",
+    "referTo ratskorpus-2023-1 & ",
+    "referTo drukola.20180909.1b_words & "
+  )
+  kco <- new("KorAPConnection", verbose = TRUE)
+
+  # Chart factory that masks highcharter::highchart(): adds the IDS theme, the
+  # KorAP on-click search and the shared legend handler. `...` is passed on to
+  # highcharter::highchart().
+  highchart <- function(...) {
+    highcharter::highchart(...) %>%
+      hc_add_theme(hc_theme_ids_light()) %>%
+      hc_add_onclick_korap_search() %>%
+      hc_plotOptions(series = list(events = list(legendItemClick = sharelegend)))
+  }
+
+  # Append the corpusStats() columns for each row's virtual corpus `vc`.
+  add_corpus_stats <- function(df) {
+    bind_cols(df, corpusStats(kco, df$vc) %>% select(-vc))
+  }
+
+  # Turn the prefixes in column `corpus` into legend labels: strip "referTo" and
+  # the trailing "&"; the empty prefix becomes "DeReKo-KorAP".
+  prettifyCorpusNames <- function(df) {
+    rownames(df) <- NULL
+    df %>%
+      mutate(
+        corpus = corpus %>%
+          str_replace("referTo *", "") %>%
+          str_replace(" *& *$", "") %>%
+          str_replace("^ *$", "DeReKo-KorAP")
+      )
+  }
+
+  # Tokens per country of publication (the only chart that shows the legend).
+  output$country <- renderHighchart({
+    countries <- sort(c("DE", "AT", "CH", "IT", "BE", "LU"))
+
+    df <- expand_grid(corpus = corpus, country = countries) %>%
+      mutate(vc = sprintf("%spubPlaceKey=%s", corpus, country)) %>%
+      add_corpus_stats() %>%
+      prettifyCorpusNames()
+
+    highchart() %>%
+      hc_add_series(type = "column", data = df, hcaes(x = country, y = tokens, group = corpus)) %>%
+      hc_xAxis(categories = countries) %>%
+      hc_yAxis(type = "logarithmic") %>%
+      hc_legend(enabled = TRUE) %>%
+      hc_title(text = "Land")
+  })
+
+  # Tokens per topic domain (textClass).
+  output$domain <- renderHighchart({
+    topics <- c(
+      "freizeit-unterhaltung",
+      "gesundheit-ernaehrung",
+      "kultur",
+      "politik",
+      "sport",
+      "staat-gesellschaft",
+      "technik-industrie",
+      "wissenschaft",
+      "wirtschaft-finanzen",
+      "natur-umwelt",
+      "fiktion"
+    )
+
+    df <- expand_grid(corpus = corpus, domain = topics) %>%
+      mutate(vc = sprintf("%stextClass=%s", corpus, domain)) %>%
+      add_corpus_stats() %>%
+      prettifyCorpusNames()
+
+    highchart() %>%
+      hc_add_series(type = "bar", data = df, hcaes(domain, tokens, group = corpus)) %>%
+      hc_xAxis(categories = str_to_title(topics, locale = "en")) %>%
+      hc_yAxis(type = "logarithmic") %>%
+      hc_legend(enabled = FALSE) %>%
+      hc_title(text = "Thema")
+  })
+
+  # Tokens per decade of publication (pubDate ranges of ten years).
+  output$decade <- renderHighchart({
+    decades <- c(1951, 1961, 1971, 1981, 1991, 2001, 2011, 2021)
+    decade_labels <- function(start_year) {
+      sprintf("%d-%d", start_year, start_year + 9)
+    }
+
+    df <- expand_grid(corpus = corpus, decade = decades) %>%
+      mutate(vc = sprintf("%spubDate since %d & pubDate until %d", corpus, decade, decade + 9)) %>%
+      add_corpus_stats() %>%
+      mutate(decade = decade_labels(decade)) %>%
+      prettifyCorpusNames()
+
+    highchart() %>%
+      hc_add_series(type = "bar", data = df, hcaes(decade, tokens, group = corpus)) %>%
+      hc_xAxis(categories = decade_labels(decades)) %>%
+      hc_yAxis(type = "logarithmic") %>%
+      hc_legend(enabled = FALSE) %>%
+      hc_title(text = "Dekade")
+  })
+
+  # Tokens per text type (regular expressions matched against textType).
+  output$texttype <- renderHighchart({
+    texttypes <- c(
+      "/Zeitung.*/", "/(Zeitschrift|Magazin).*/", "/Agenturmeldung.*/",
+      "/Enzyklopädie.*/", "/.*Diskussion.*/", "/Roman.*/", "/Newsgroup.*/",
+      "/Tagebuch.*/", "/Sachbuch.*/"
+    )
+    # Axis labels: strip the regex syntax and show alternatives as "A/B".
+    texttype_labels <- texttypes %>%
+      str_replace_all("[/.*)()]", "") %>%
+      str_replace_all("\\|", "/")
+
+    df <- expand_grid(corpus = corpus, texttype = texttypes) %>%
+      mutate(vc = sprintf("%stextType=%s", corpus, texttype)) %>%
+      add_corpus_stats() %>%
+      prettifyCorpusNames()
+
+    highchart() %>%
+      hc_add_series(type = "bar", data = df, hcaes(texttype, tokens, group = corpus)) %>%
+      hc_xAxis(categories = texttype_labels) %>%
+      hc_yAxis(type = "logarithmic") %>%
+      hc_legend(enabled = FALSE) %>%
+      hc_title(text = "Texttyp")
+  })
+}
+
+shinyApp(ui, server)