Add preliminary report generation
Change-Id: I9a0224649683b5f98a9738953d96214814d4b355
diff --git a/R/report.Rmd b/R/report.Rmd
new file mode 100644
index 0000000..5f28216
--- /dev/null
+++ b/R/report.Rmd
@@ -0,0 +1,137 @@
+---
+title: "ICC Written Launch"
+output:
+  html_document:  # css / self_contained are html_document options and must nest under it
+    css: style.css
+    self_contained: yes  # produce a standalone HTML file
+date: "`r Sys.Date()`"
+---
+
+```{r setup, include=FALSE, echo=FALSE, warning=FALSE}
+knitr::opts_chunk$set(echo = FALSE, warning = FALSE)  # option is `warning`; `warnings` is ignored
+source("common.R")  # NOTE(review): presumably defines icc, icc_con(), theme_ids(), show_table() - confirm
+```
+
+# Actual composition of ICC parts
+
+## Composition by ICC genre
+
+```{r composition_by_genre, message = FALSE}
+# Token count per language/genre row: one corpusStats() call per row, restricted by `vc`.
+icc_genre <- icc %>% expand_grid(genre) %>%
+  mutate(vc = paste0("iccGenre=", genre)) %>%
+  rowwise() %>%
+  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)
+icc_genre %>% ggplot(aes(x = lang, fill = genre, y = tokens)) +
+  geom_col() + scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
+  theme_ids() +
+  geom_text(aes(label = if_else(tokens > 0, as.character(tokens), ""), y = tokens),  # no label for 0
+    position = position_stack(reverse = FALSE, vjust = 0.5),  # FALSE, not the reassignable `F`
+    color = "white", size = 3.2, family = "Fira Sans Condensed")
+```
+
+## Composition by date of publication
+
+
+```{r composition_by_pubdate}
+# Publication years to probe; every language/year row gets its own corpusStats() call.
+year <- 1988:2023
+
+icc_year <- icc %>% expand_grid(year) %>%
+  mutate(vc = paste0("pubDate in ", year)) %>%
+  rowwise() %>%
+  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)
+
+# One line with point markers per language; y labels use short-scale abbreviations.
+ggplot(icc_year, aes(x = year, fill = lang, color = lang, y = tokens)) +
+  geom_line() +
+  geom_point() +
+  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) + theme_ids()
+```
+
+## Part-of-Speech proportions
+
+```{r pos_proportions}
+# Part-of-speech tags queried on the `ud/p` layer; commented-out tags are currently excluded.
+POS_tag <- c(
+  "ADJ", "ADP", # "PUNCT",
+  "ADV", "AUX", # "SYM",
+  "INTJ", "CCONJ", # "X",
+  "NOUN", "DET",
+  "PROPN", # "NUM",
+  "VERB", # "PART",
+  "PRON", "SCONJ"
+)
+# One frequencyQuery() per language/tag row; its `f` value is labelled as a percentage below.
+icc_by_pos_tag <- icc %>% expand_grid(POS = POS_tag) %>%
+  rowwise() %>%
+  mutate(f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f)
+icc_by_pos_tag %>% ggplot(aes(x = lang, fill = POS, y = f)) +
+  geom_col() + scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
+  theme_ids(base_size = 12) +
+  geom_text(aes(label = sprintf("%.2f%%", 100 * f), y = f), color = "white", size = 3.2,
+    family = "Fira Sans Condensed", position = position_stack(reverse = FALSE, vjust = 0.5))
+```
+
+# Pilot study: Identification of Light Verb Constructions with *take*
+
+```{r prepare_ca, include=FALSE, message=FALSE}
+# Placeholder for collocation-analysis preparation; `output` is not a knitr option, so use `include`.
+
+
+
+```
+
+## English: *take*
+
+```{r take_icc, echo=TRUE, message=FALSE}
+# Collocation analysis for the lemma "take" with a noun in the position directly after it
+# (context window: 0 tokens to the left, 1 to the right).
+take_ca_icc <- collocationAnalysis(
+  icc_con("eng"),
+  "focus({[ud/l=take]} [ud/p=NOUN])",
+  leftContextSize = 0,
+  rightContextSize = 1,
+  minOccur = 2,
+  addExamples = TRUE
+)
+take_ca_icc %>% show_table()
+```
+
+## German: *nehmen*
+
+```{r nehmen_icc, echo=TRUE, message=FALSE}
+# Collocation analysis for "nehmen" with a noun (tt/p=NN) in the position directly before it.
+nehmen_ca_icc <- collocationAnalysis(
+  icc_con("ger"),
+  "focus([tt/p=NN] {[tt/l=nehmen]})",
+  leftContextSize = 1,
+  rightContextSize = 0,
+  minOccur = 2,
+  addExamples = TRUE
+)
+nehmen_ca_icc %>% show_table()
+```
+
+### For comparison: results based on the whole DeReKo
+
+```{r nehmen_dereko}
+nehmen_ca_dereko <- readRDS("../data/ca_nehmen_dereko_examples")  # stored result, no live query
+show_table(nehmen_ca_dereko)
+```
+
+## Norwegian: *ta*
+
+```{r ta_icc, echo=TRUE, message=FALSE, eval=FALSE}
+# Kept disabled via eval=FALSE (the fence used to be commented out); drop eval=FALSE to run it.
+ta_ca_icc <- collocationAnalysis(
+  icc_con("nor"),
+  "focus({[ud/l=ta]} [ud/p=NOUN])",
+  leftContextSize = 0,
+  rightContextSize = 1,
+  minOccur = 2,
+  addExamples = TRUE
+)
+ta_ca_icc %>% show_table()
+```
+