Add preliminary report generation Change-Id: I9a0224649683b5f98a9738953d96214814d4b355

commit: 6e21b10d0d94209eac591ba6ea7719bfd75f10fe [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Fri Jun 02 18:04:04 2023 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Fri Jun 02 18:46:56 2023 +0200
tree: e99cabeb123addabd48f482c443183655758aecf
parent: 10e197d60227e9b05383294f0c8da5e35b7f978f [diff] [blame]
diff --git a/R/report.Rmd b/R/report.Rmd
new file mode 100644
index 0000000..5f28216
--- /dev/null
+++ b/R/report.Rmd

@@ -0,0 +1,137 @@
+---
+title: "ICC Written Launch"
+output:
+  html_document:
+    css: style.css
+    self_contained: yes
+date: "`r Sys.Date()`"
+---
+
+```{r setup, include=FALSE, echo=FALSE, warning=FALSE}
+# Global chunk defaults: hide source code and suppress warnings in the
+# rendered report. The knitr option is `warning` (singular); the previous
+# `warnings = FALSE` was not a recognised option and had no effect.
+knitr::opts_chunk$set(echo = FALSE, warning = FALSE)
+# Load shared definitions used by the chunks below.
+# NOTE(review): presumably provides icc, genre, icc_con(), theme_ids() and
+# show_table() -- confirm against common.R.
+source("common.R")
+```
+
+# Actual composition of ICC parts
+
+## Composition by ICC genre
+
+```{r composition_by_genre, message = FALSE}
+# Token counts per ICC language and genre: pair every row of `icc` with every
+# genre, build a virtual-corpus query per pair and look up its token count.
+# NOTE(review): `icc`, `genre` and icc_con() are not defined in this file --
+# presumably they come from common.R.
+icc_genre <- icc %>%
+  expand_grid(genre) %>%
+  mutate(vc = paste0("iccGenre=", genre)) %>%
+  rowwise() %>% # corpusStats() is called once per row
+  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens) %>%
+  ungroup() # drop the row-wise grouping once the per-row lookup is done
+
+# Stacked bar chart of tokens by language, one segment per genre; each
+# non-empty segment is labelled with its token count.
+icc_genre %>%
+  ggplot(aes(x = lang, fill = genre, y = tokens)) +
+  geom_col() +
+  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
+  theme_ids() +
+  geom_text(
+    aes(label = if_else(tokens > 0, as.character(tokens), ""), y = tokens),
+    position = position_stack(reverse = FALSE, vjust = 0.5),
+    color = "white",
+    size = 3.2,
+    family = "Fira Sans Condensed"
+  )
+
+```
+
+## Composition by date of publication
+
+
+```{r composition_by_pubdate}
+# Publication years covered by the plot.
+year <- 1988:2023
+
+# Token counts per language and publication year, looked up one row at a
+# time via a "pubDate in <year>" virtual-corpus query.
+icc_year <- icc %>%
+  expand_grid(year) %>%
+  mutate(vc = paste0("pubDate in ", year)) %>%
+  rowwise() %>%
+  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens) %>%
+  ungroup() # drop the row-wise grouping once the per-row lookup is done
+
+# Lines and point markers over the years, coloured by language.
+icc_year %>%
+  ggplot(aes(x = year, fill = lang, color = lang, y = tokens)) +
+  geom_line() +
+  geom_point() +
+  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
+  theme_ids()
+```
+
+## Part-of-Speech proportions
+
+```{r pos_proportions}
+# Part-of-speech tags included in the comparison. PUNCT, SYM, X, NUM and PART
+# are currently commented out of the list.
+POS_tag <- c(
+  "ADJ", "ADP", # "PUNCT",
+  "ADV", "AUX", # "SYM",
+  "INTJ", "CCONJ", # "X",
+  "NOUN", "DET",
+  "PROPN", # "NUM",
+  "VERB", # "PART",
+  "PRON",
+  "SCONJ"
+)
+
+# Frequency of each tag per language; one frequencyQuery() call per row.
+# NOTE(review): the labels below multiply `f` by 100, so `f` is presumably a
+# relative frequency (proportion) -- confirm against frequencyQuery().
+icc_by_pos_tag <- icc %>%
+  expand_grid(POS = POS_tag) %>%
+  rowwise() %>%
+  mutate(f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f) %>%
+  ungroup() # drop the row-wise grouping once the per-row lookup is done
+
+# Stacked bars per language, one segment per tag, labelled as a percentage.
+icc_by_pos_tag %>%
+  ggplot(aes(x = lang, fill = POS, y = f)) +
+  geom_col() +
+  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
+  theme_ids(base_size = 12) +
+  geom_text(
+    aes(label = sprintf("%.2f%%", 100 * f), y = f),
+    position = position_stack(reverse = FALSE, vjust = 0.5),
+    color = "white",
+    size = 3.2,
+    family = "Fira Sans Condensed"
+  )
+```
+
+# Pilot study: Identification of Light Verb Constructions with *take*
+
+```{r prepare_ca, include=FALSE, message=FALSE}
+
+
+
+
+```
+
+## English: *take*
+
+```{r take_icc, echo=TRUE, message=FALSE}
+# Collocation analysis on the English ICC part: the query focuses on the
+# lemma "take" followed by a noun (ud annotations), with a context window of
+# 0 tokens to the left and 1 token to the right.
+take_ca_icc <-
+  collocationAnalysis(
+    icc_con("eng"),
+    "focus({[ud/l=take]} [ud/p=NOUN])",
+    leftContextSize = 0,
+    rightContextSize = 1,
+    minOccur = 2,
+    addExamples = TRUE
+  )
+
+take_ca_icc %>% show_table()
+```
+
+## German: *nehmen*
+
+```{r nehmen_icc, echo=TRUE}
+# Collocation analysis on the German ICC part: the query focuses on the lemma
+# "nehmen" preceded by a noun (tt annotations, tag NN), with a context window
+# of 1 token to the left and 0 tokens to the right.
+nehmen_ca_icc <-
+  collocationAnalysis(
+    icc_con("ger"),
+    "focus([tt/p=NN] {[tt/l=nehmen]})",
+    leftContextSize = 1,
+    rightContextSize = 0,
+    minOccur = 2,
+    addExamples = TRUE
+  )
+nehmen_ca_icc %>% show_table()
+```
+
+### For comparison based on the whole DeReKo
+
+```{r nehmen_dereko}
+# Read the stored "nehmen" result for the whole DeReKo from disk and pass it
+# to the same table helper used for the ICC results.
+nehmen_ca_dereko <- readRDS("../data/ca_nehmen_dereko_examples")
+show_table(nehmen_ca_dereko)
+```
+
+## Norwegian: *ta*
+
+<!-- NOTE(review): the opening fence of this chunk was commented out with
+     "# ", which turned it into a heading and left the closing fence
+     unmatched. eval=FALSE keeps the analysis switched off while restoring
+     valid markup; drop it to run the Norwegian analysis. -->
+```{r ta_icc, echo=TRUE, message=FALSE, eval=FALSE}
+# Collocation analysis on the Norwegian ICC part: the query focuses on the
+# lemma "ta" followed by a noun (ud annotations), with a context window of
+# 0 tokens to the left and 1 token to the right.
+ta_ca_icc <-
+  collocationAnalysis(
+    icc_con("nor"),
+    "focus({[ud/l=ta]} [ud/p=NOUN])",
+    leftContextSize = 0,
+    rightContextSize = 1,
+    minOccur = 2,
+    addExamples = TRUE
+  )
+ta_ca_icc %>% show_table()
+```
+
commit	6e21b10d0d94209eac591ba6ea7719bfd75f10fe	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Fri Jun 02 18:04:04 2023 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Fri Jun 02 18:46:56 2023 +0200
tree	e99cabeb123addabd48f482c443183655758aecf
parent	10e197d60227e9b05383294f0c8da5e35b7f978f [diff] [blame]