Add preliminary report generation
Change-Id: I9a0224649683b5f98a9738953d96214814d4b355
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 1cb7a22..90f1772 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -34,12 +34,13 @@
- end_section install_fonts
- start_section install_r_packages "Installing missing R packages"
- - R -e "install.packages(c('RKorAPClient', 'httr', 'tidytext', 'httpuv', 'scales', 'sp', 'raster', 'kableExtra', 'svglite'))"
+ - R -e "install.packages(c('RKorAPClient', 'httr', 'tidytext', 'httpuv', 'scales', 'sp', 'raster', 'kableExtra', 'DT', 'svglite'))"
- R -e 'devtools::install_git("https://korap.ids-mannheim.de/gerrit/IDS-Mannheim/idsThemeR")'
- end_section install_r_packages
script:
- start_section render "Running scripts"
- - R -f ./R/icc_stats.R
+ - R_CACHE_ROOTPATH=./cache R -f ./R/icc_stats.R
+ - R_CACHE_ROOTPATH=./cache R -e "require(rmarkdown); render('R/report.Rmd', output_format='html_document', output_dir='target')"
- end_section render
diff --git a/R/report.Rmd b/R/report.Rmd
new file mode 100644
index 0000000..5f28216
--- /dev/null
+++ b/R/report.Rmd
@@ -0,0 +1,137 @@
+---
+title: "ICC Written Launch"
+output:
+  html_document:
+    css: ../css/style.css # the stylesheet lives in css/ at the repository root, not next to this Rmd in R/
+    self_contained: yes
+date: "`r Sys.Date()`"
+---
+
+```{r setup, include=FALSE, echo=FALSE, warning=FALSE}
+knitr::opts_chunk$set(echo = FALSE, warning = FALSE) # the knitr option is `warning`; a misspelt `warnings` has no effect
+source("common.R") # presumably defines icc, icc_con(), show_table() etc. used by the chunks below - verify
+```
+
+# Actual composition of ICC parts
+
+## Composition by ICC genre
+
+```{r composition_by_genre, message = FALSE}
+# Token counts per language and ICC genre: one corpusStats() query per row.
+icc_genre <- icc %>%
+  expand_grid(genre) %>%
+  mutate(vc = paste0("iccGenre=", genre)) %>%
+  rowwise() %>%
+  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)
+
+icc_genre %>% ggplot(aes(x = lang, fill = genre, y = tokens)) +
+  geom_col() + scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
+  theme_ids() +
+  geom_text(aes(label = if_else(tokens > 0, as.character(tokens), ""), y = tokens), position = position_stack(reverse = FALSE, vjust = 0.5), color = "white", size = 3.2, family = "Fira Sans Condensed")
+```
+
+## Composition by date of publication
+
+
+```{r composition_by_pubdate}
+# Token counts per language and publication year (1988-2023), one corpusStats() query per row.
+year <- 1988:2023
+icc_year <- icc %>%
+  expand_grid(year) %>%
+  mutate(vc = paste0("pubDate in ", year)) %>%
+  rowwise() %>%
+  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)
+
+ggplot(icc_year, aes(x = year, fill = lang, color = lang, y = tokens)) +
+  geom_line() +
+  geom_point() +
+  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
+  theme_ids()
+```
+
+## Part-of-Speech proportions
+
+```{r pos_proportions}
+POS_tag <- c(
+  "ADJ", "ADP", # "PUNCT",
+  "ADV", "AUX", # "SYM",
+  "INTJ", "CCONJ", # "X",
+  "NOUN", "DET",
+  "PROPN", # "NUM",
+  "VERB", # "PART",
+  "PRON",
+  "SCONJ"
+)
+# One frequencyQuery() per language/tag pair; its `f` value is what gets stacked and labelled below.
+icc_by_pos_tag <- icc %>% expand_grid(POS = POS_tag) %>%
+  rowwise() %>%
+  mutate(f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f)
+
+icc_by_pos_tag %>% ggplot(aes(x = lang, fill = POS, y = f)) +
+  geom_col() + scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
+  theme_ids(base_size = 12) +
+  geom_text(aes(label = sprintf("%.2f%%", 100 * f), y = f), position = position_stack(reverse = FALSE, vjust = 0.5), color = "white", size = 3.2, family = "Fira Sans Condensed")
+```
+
+# Pilot study: Identification of Light Verb Constructions with *take*
+
+```{r prepare_ca, output=FALSE, message=FALSE}
+
+
+
+
+```
+
+## English: *take*
+
+```{r take_icc, echo=TRUE, message=FALSE}
+# Collocation analysis: lemma "take" followed by a noun, window of one token to the right.
+take_ca_icc <- collocationAnalysis(
+  icc_con("eng"),
+  "focus({[ud/l=take]} [ud/p=NOUN])",
+  leftContextSize = 0,
+  rightContextSize = 1,
+  minOccur = 2,
+  addExamples = TRUE
+)
+
+take_ca_icc %>% show_table()
+```
+
+## German: *nehmen*
+
+```{r nehmen_icc, echo=TRUE, message=FALSE}
+# Collocation analysis: noun followed by the lemma "nehmen", window of one token to the left.
+nehmen_ca_icc <- collocationAnalysis(
+  icc_con("ger"),
+  "focus([tt/p=NN] {[tt/l=nehmen]})",
+  leftContextSize = 1,
+  rightContextSize = 0,
+  minOccur = 2,
+  addExamples = TRUE
+)
+nehmen_ca_icc %>% show_table()
+```
+
+### For comparison: results based on the whole DeReKo
+
+```{r nehmen_dereko}
+nehmen_ca_dereko <- readRDS("../data/ca_nehmen_dereko_examples") # stored result loaded from disk rather than queried at render time
+nehmen_ca_dereko %>% show_table() # same table helper as used for the ICC results
+```
+
+## Norwegian: *ta*
+
+```{r ta_icc, echo=TRUE, message=FALSE, eval=FALSE}
+# Not evaluated for now (eval=FALSE); drop that option to run the Norwegian analysis.
+ta_ca_icc <- collocationAnalysis(
+  icc_con("nor"),
+  "focus({[ud/l=ta]} [ud/p=NOUN])",
+  leftContextSize = 0,
+  rightContextSize = 1,
+  minOccur = 2,
+  addExamples = TRUE
+)
+ta_ca_icc %>% show_table()
+```
+
diff --git a/css/style.css b/css/style.css
new file mode 100644
index 0000000..69c1ae7
--- /dev/null
+++ b/css/style.css
@@ -0,0 +1,87 @@
+@import url('https://code.cdn.mozilla.net/fonts/fira.css'); /* explicit https: a scheme-less URL depends on how the page or build fetches it */
+@import url('https://korap.ids-mannheim.de/font/libertinus.css');
+
+.main-container { /* wide page (up to 1680px) with no left/right margin */
+ max-width: 1680px !important;
+ margin-left: 0;
+ margin-right: 0;
+}
+
+h1, h2, h3, h4, h5, h6 { /* shared heading typography; sizes are set per level below */
+ font-family: 'Fira Sans',sans-serif;
+ line-height: 1.2;
+ font-weight: 500;
+ /*font-size: 2px;*/
+}
+
+h1 {
+ font-size: 21px;
+}
+
+h2 {
+ font-size: 18px;
+}
+
+h3 {
+ font-size: 15px;
+}
+
+.author, .date { /* same size as the body text (18px) */
+ font-size: 18px;
+}
+
+h1.title, h1.subtitle { /* title block in the accent colour that is also used for table headers */
+ /*text-transform: uppercase;*/
+ font-size: 24px !important;
+ color: rgb(246, 168, 0);
+ /* text-align: center;*/
+}
+
+.date, .author { /* placeholder rule: its only declaration is commented out */
+/* text-align: center;*/
+}
+
+body {
+ font-family: 'Fira Sans', sans-serif;
+ font-size: 18px;
+ font-weight: 400;
+ font-variant-ligatures: common-ligatures;
+ font-variant-numeric: tabular-nums; /* equal-width digits so numbers line up in columns */
+}
+
+th {
+ color: rgb(246, 168, 0);
+}
+
+p { /* justified paragraphs with automatic hyphenation */
+ hyphens: auto;
+ text-align: justify;
+ overflow-wrap: break-word;
+}
+
+.footnotes {
+ font-family: 'Fira Sans Condensed', sans-serif;
+ font-weight: 400;
+ font-size: 14px;
+ line-height: 1.5;
+}
+
+#TOC > ul {
+ font-family: 'Fira Sans', sans-serif;
+}
+
+.caption {
+ font-family: 'Fira Sans Condensed', sans-serif;
+ font-weight: 400;
+ font-size: 16px;
+ text-align: center;
+}
+
+/*
+table.display td { white-space: nowrap; }
+*/
+
+.dt-buttons, .dataTables_filter { /* presumably the DataTables button bar and search box - TODO confirm against the table helper */
+ margin-top: 10pt;
+}
+
diff --git a/data/ca_nehmen_dereko b/data/ca_nehmen_dereko
new file mode 100644
index 0000000..e754790
--- /dev/null
+++ b/data/ca_nehmen_dereko
Binary files differ
diff --git a/data/ca_nehmen_dereko_examples b/data/ca_nehmen_dereko_examples
new file mode 100644
index 0000000..09e2042
--- /dev/null
+++ b/data/ca_nehmen_dereko_examples
Binary files differ