Add preliminary report generation

Change-Id: I9a0224649683b5f98a9738953d96214814d4b355
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 1cb7a22..90f1772 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -34,12 +34,13 @@
     - end_section install_fonts
 
     - start_section install_r_packages "Installing missing R packages"
-    - R -e "install.packages(c('RKorAPClient', 'httr', 'tidytext', 'httpuv', 'scales', 'sp', 'raster', 'kableExtra', 'svglite'))"
+    - R -e "install.packages(c('RKorAPClient', 'httr', 'tidytext', 'httpuv', 'scales', 'sp', 'raster', 'kableExtra', 'DT', 'svglite'))"
     - R -e 'devtools::install_git("https://korap.ids-mannheim.de/gerrit/IDS-Mannheim/idsThemeR")'
     - end_section install_r_packages
 
   script:
     - start_section render "Running scripts"
-    - R -f ./R/icc_stats.R
+    - R_CACHE_ROOTPATH=./cache R -f ./R/icc_stats.R
+    - R_CACHE_ROOTPATH=./cache R -e "rmarkdown::render('R/report.Rmd', output_format='html_document', output_dir='target')" # qualified call: a missing rmarkdown package errors immediately instead of require() returning FALSE
     - end_section render
 
diff --git a/R/report.Rmd b/R/report.Rmd
new file mode 100644
index 0000000..5f28216
--- /dev/null
+++ b/R/report.Rmd
@@ -0,0 +1,137 @@
+---
+title: "ICC Written Launch"
+output:
+  html_document:
+    css: ../css/style.css # path is relative to this Rmd (R/); the stylesheet is added under css/
+    self_contained: yes
+date: "`r Sys.Date()`"
+---
+
+```{r setup, include=FALSE, echo=FALSE, warning=FALSE}
+knitr::opts_chunk$set(echo = FALSE, warning = FALSE) # report-wide defaults; the knitr option is `warning` (singular)
+source("common.R") # project helpers; presumably provides icc, genre, icc_con() and show_table() -- verify
+```
+
+# Actual composition of ICC parts
+
+## Composition by ICC genre
+
+```{r composition_by_genre, message = FALSE}
+icc_genre <- icc %>% # cross every icc row with every genre
+  expand_grid(genre) %>%
+  mutate(vc = paste0("iccGenre=", genre)) %>% # virtual-corpus query restricting to one genre
+  rowwise() %>% # corpusStats() is then called once per row
+  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)
+
+icc_genre %>% ggplot(aes(x = lang, fill = genre, y = tokens)) +
+  geom_col() +
+  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
+  theme_ids() +
+  geom_text(aes(label = if_else(tokens > 0, as.character(tokens), ""), y = tokens), position = position_stack(reverse = FALSE, vjust = 0.5), color = "white", size = 3.2, family = "Fira Sans Condensed") # empty label for zero-token segments
+```
+
+## Composition by date of publication
+
+
+```{r composition_by_pubdate}
+year <- 1988:2023 # publication years to query; the name becomes the column name in expand_grid()
+
+icc_year <- icc %>%
+  expand_grid(year) %>% # pair every icc row with every year
+  mutate(vc = paste0("pubDate in ", year)) %>%
+  rowwise() %>%
+  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)
+
+icc_year %>%
+  ggplot(aes(x = year, y = tokens, color = lang, fill = lang)) +
+  geom_line() + geom_point() +
+  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
+  theme_ids()
+```
+
+## Part-of-Speech proportions
+
+```{r pos_proportions}
+POS_tag <- c(
+  "ADJ", "ADP", # "PUNCT",
+  "ADV", "AUX", # "SYM",
+  "INTJ", "CCONJ", # "X",
+  "NOUN", "DET",
+  "PROPN", # "NUM",
+  "VERB", # "PART",
+  "PRON",
+  "SCONJ"
+) # the tags commented out above are not queried
+
+icc_by_pos_tag <- icc %>% expand_grid(POS = POS_tag) %>%
+  rowwise() %>%
+  mutate(f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f) # one query per row; presumably f is a relative frequency -- verify
+
+icc_by_pos_tag %>% ggplot(aes(x = lang, fill = POS, y = f)) +
+  geom_col() + scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
+  theme_ids(base_size = 12) +
+  geom_text(aes(label = sprintf("%.2f%%", 100 * f), y = f), position = position_stack(reverse = FALSE, vjust = 0.5), color = "white", size = 3.2, family = "Fira Sans Condensed")
+```
+
+# Pilot study: Identification of Light Verb Constructions with *take*
+
+```{r prepare_ca, include=FALSE, message=FALSE}
+# Placeholder chunk: no preparation code yet. `include=FALSE` replaces `output=FALSE`, which is not a knitr chunk option.
+
+
+
+```
+
+## English: *take*
+
+```{r take_icc, echo=TRUE, message=FALSE}
+take_ca_icc <-
+  collocationAnalysis(
+    icc_con("eng"),
+    "focus({[ud/l=take]} [ud/p=NOUN])", # lemma "take" directly followed by a noun
+    leftContextSize = 0,
+    rightContextSize = 1,
+    minOccur = 2,
+    addExamples = TRUE
+  )
+
+take_ca_icc %>% show_table()
+```
+
+## German: *nehmen*
+
+```{r nehmen_icc, echo=TRUE, message=FALSE}
+nehmen_ca_icc <-
+  collocationAnalysis(
+    icc_con("ger"),
+    "focus([tt/p=NN] {[tt/l=nehmen]})", # noun directly followed by lemma "nehmen"
+    leftContextSize = 1,
+    rightContextSize = 0,
+    minOccur = 2,
+    addExamples = TRUE
+  )
+nehmen_ca_icc %>% show_table()
+```
+
+### For comparison based on the whole DeReKo
+
+```{r nehmen_dereko}
+nehmen_ca_dereko <- readRDS("../data/ca_nehmen_dereko_examples") # stored result read from data/; no query is run in this chunk
+nehmen_ca_dereko %>% show_table()
+```
+
+## Norwegian: *ta*
+
+```{r ta_icc, echo=TRUE, eval=FALSE, message=FALSE}
+ta_ca_icc <-
+  collocationAnalysis(
+    icc_con("nor"),
+    "focus({[ud/l=ta]} [ud/p=NOUN])", # lemma "ta" directly followed by a noun
+    leftContextSize = 0,
+    rightContextSize = 1,
+    minOccur = 2,
+    addExamples = TRUE
+  )
+ta_ca_icc %>% show_table() # chunk is shown but not run (eval=FALSE); remove eval=FALSE to enable it
+```
+
diff --git a/css/style.css b/css/style.css
new file mode 100644
index 0000000..69c1ae7
--- /dev/null
+++ b/css/style.css
@@ -0,0 +1,87 @@
+@import url('https://code.cdn.mozilla.net/fonts/fira.css'); /* explicit scheme: a protocol-relative URL does not resolve when the HTML is opened via file:// */
+@import url('https://korap.ids-mannheim.de/font/libertinus.css');
+
+.main-container { /* page content wrapper: widen it and drop the side margins */
+  max-width: 1680px !important;
+  margin-left: 0;
+  margin-right: 0;
+}
+
+h1, h2, h3, h4, h5, h6 { /* shared heading typography */
+  font-family: 'Fira Sans', sans-serif;
+  line-height: 1.2;
+  font-weight: 500;
+  /* font-size: 2px; */
+}
+
+h1 {
+  font-size: 21px;
+}
+
+h2 {
+  font-size: 18px;
+}
+
+h3 {
+  font-size: 15px;
+}
+
+.author, .date {
+  font-size: 18px;
+}
+
+h1.title, h1.subtitle { /* document title block */
+  /* text-transform: uppercase; */
+  font-size: 24px !important;
+  color: rgb(246, 168, 0);
+  /* text-align: center; */
+}
+
+.date, .author {
+  /* text-align: center; */
+}
+
+body { /* base text settings */
+  font-family: 'Fira Sans', sans-serif;
+  font-size: 18px;
+  font-weight: 400;
+  font-variant-ligatures: common-ligatures;
+  font-variant-numeric: tabular-nums;
+}
+
+th { /* table headers use the same colour as the title */
+  color: rgb(246, 168, 0);
+}
+
+p { /* justified paragraphs with hyphenation and word breaking */
+  hyphens: auto;
+  text-align: justify;
+  overflow-wrap: break-word;
+}
+
+.footnotes {
+  font-family: 'Fira Sans Condensed', sans-serif;
+  font-weight: 400;
+  font-size: 14px;
+  line-height: 1.5;
+}
+
+#TOC > ul {
+  font-family: 'Fira Sans', sans-serif;
+}
+
+.caption {
+  font-family: 'Fira Sans Condensed', sans-serif;
+  font-weight: 400;
+  font-size: 16px;
+  text-align: center;
+}
+
+/*
+table.display td { white-space: nowrap; }
+*/
+
+.dt-buttons, .dataTables_filter { /* spacing above the table buttons and search box */
+  margin-top: 10pt;
+}
+
diff --git a/data/ca_nehmen_dereko b/data/ca_nehmen_dereko
new file mode 100644
index 0000000..e754790
--- /dev/null
+++ b/data/ca_nehmen_dereko
Binary files differ
diff --git a/data/ca_nehmen_dereko_examples b/data/ca_nehmen_dereko_examples
new file mode 100644
index 0000000..09e2042
--- /dev/null
+++ b/data/ca_nehmen_dereko_examples
Binary files differ