R/report.Rmd - ICC/2023-07-20-ICC-ICLC10 - Gitiles

 ---
 title: "ICC Written Launch"
 output:
   html_document:
     css: style.css
     self_contained: yes
 date: "`r Sys.Date()`"
 ---

 ```{r setup, include=FALSE, echo=FALSE, warning=FALSE}
 # Global chunk defaults: hide the source code and suppress warnings in the
 # rendered report. The knitr option is `warning` (singular); the previously
 # used name `warnings` is not an option knitr consults, so warnings were
 # still being printed.
 knitr::opts_chunk$set(echo = FALSE, warning = FALSE)
 # Load the shared helpers and data used by the chunks below.
 # NOTE(review): presumably this defines `icc`, `genre`, `icc_con()`,
 # `show_table()` and the `*_ids()` theme/scale helpers -- confirm in common.R.
 source("common.R")
 ```

 # Actual composition of ICC parts

 ## Composition by ICC genre

 ```{r composition_by_genre, message = FALSE}
 # Token counts for every combination of an `icc` row and a genre. Each row is
 # queried separately (rowwise) with a virtual-corpus definition of the form
 # "iccGenre=<genre>".
 # NOTE(review): `icc`, `genre` and `icc_con()` are not defined in this file;
 # presumably they come from common.R -- confirm.
 icc_genre <- icc %>%
   expand_grid(genre) %>%
   mutate(vc = paste0("iccGenre=", genre)) %>%
   rowwise() %>%
   mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)

 # Stacked bars per language, filled by genre. Segments with a non-zero token
 # count are labelled with that count, centred vertically in the segment.
 icc_genre %>%
   ggplot(aes(x = lang, fill = genre, y = tokens)) +
   geom_col() +
   scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
   theme_ids() +
   scale_fill_ids() +
   geom_text(
     aes(label = if_else(tokens > 0, as.character(tokens), ""), y = tokens),
     position = position_stack(reverse = FALSE, vjust = 0.5),
     color = "black",
     size = 3.2,
     family = "Fira Sans Condensed"
   )

 ```

 ## Composition by date of publication


 ```{r composition_by_pubdate, message=F, warning=F}
 # Publication years to query, one virtual corpus per year.
 year <- 1986:2023

 # Token counts for every combination of an `icc` row and a year, queried row
 # by row with a virtual-corpus definition of the form "pubDate in <year>".
 icc_year <- icc %>%
   expand_grid(year) %>%
   mutate(vc = paste0("pubDate in ", year)) %>%
   rowwise() %>%
   mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)

 # Loess-smoothed token counts over time, drawn as one translucent area per
 # language. The lower y limit of 0 is set on the same scale that formats the
 # labels: a separate ylim() call would be replaced by the later
 # scale_y_continuous() and would therefore have no effect.
 icc_year %>%
   ggplot(aes(x = year, fill = lang, color = lang, y = tokens)) +
   # geom_smooth(se=F, span=0.25) +
   xlim(1990, 2023) +
   stat_smooth(
     geom = "area", method = "loess", span = 1 / 4,
     alpha = 0.1
   ) +
   # geom_area(alpha=0.1,  position = "identity") +
   scale_fill_ids() +
   scale_colour_ids() +
   scale_y_continuous(
     limits = c(0, NA),
     labels = label_number(scale_cut = cut_short_scale())
   ) +
   theme_ids()
 ```

 ## Part-of-Speech proportions

 ```{r pos_proportions}
 # Part-of-speech tags to include in the plot; the commented-out tags are
 # deliberately left out of the query.
 POS_tag <- c(
   "ADJ", "ADP", # "PUNCT",
   "ADV", "AUX", # "SYM",
   # "INTJ",
   "CCONJ", # "X",
   "NOUN", "DET",
   "PROPN", # "NUM",
   "VERB", # "PART",
   "PRON",
   "SCONJ"
 )

 # One frequency query of the form "[ud/p=<POS>]" per combination of an `icc`
 # row and a tag; the `f` column of the query result is kept.
 # NOTE(review): `f` is presumably a relative frequency, since the label below
 # multiplies it by 100 and prints it as a percentage -- confirm.
 icc_by_pos_tag <- icc %>%
   expand_grid(POS = POS_tag) %>%
   rowwise() %>%
   mutate(f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f)

 # Stacked bars per language, filled by tag, each segment labelled with its
 # percentage, centred vertically in the segment.
 icc_by_pos_tag %>%
   ggplot(aes(x = lang, fill = POS, y = f)) +
   geom_col() +
   scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
   scale_fill_ids() +
   scale_color_ids() +
   theme_ids(base_size = 12) +
   geom_text(
     aes(label = sprintf("%.2f%%", 100 * f), y = f),
     position = position_stack(reverse = FALSE, vjust = 0.5),
     color = "black",
     size = 3.2,
     family = "Fira Sans Condensed"
   )
 ```

 # Pilot study: Identification of Light Verb Constructions with *take*

 ```{r prepare_ca, output=FALSE, message=FALSE}


 ```

 ## English: *take*

 ```{r take_icc, echo=TRUE, message=FALSE}
 # Collocation analysis on the connection returned by icc_con("eng"): the
 # lemma "take" is the focus and the query requires a noun directly after it,
 # so only the one token to the right is used as context.
 take_ca_icc <-
   collocationAnalysis(
     icc_con("eng"),
     "focus({[ud/l=take]} [ud/p=NOUN])",
     leftContextSize = 0,
     rightContextSize = 1,
     minOccur = 2,
     addExamples = TRUE
   )

 take_ca_icc %>% show_table()
 ```

 ### For comparison based on English Wikipedia
 #### (Snapshot from 2015 with 2.4 billion words, see [here](https://www.ids-mannheim.de/digspra/kl/projekte/korpora/verfuegbarkeit/))

 ```{r take_wpe, echo=TRUE}
 # Comparison run against the KorAP instance at .../instance/english. The
 # analysis is only executed when no stored result exists at the path below.
 # NOTE(review): a freshly computed result is not written back to that path,
 # so the query is repeated on every knit until the file is provided --
 # confirm whether a saveRDS() call is wanted here.
 take_ca_wpe_path <- "../data/take_ca_wpe"

 if (file.exists(take_ca_wpe_path)) {
   take_ca_wpe_i <- readRDS(take_ca_wpe_path)
 } else {
   wpe <- new(
     "KorAPConnection",
     "https://korap.ids-mannheim.de/instance/english",
     verbose = TRUE
   )
   # Lemma "take" as focus, a noun (tt/p=NN) directly after it as collocate.
   take_ca_wpe_i <-
     collocationAnalysis(
       wpe,
       "focus({[tt/l=take]} [tt/p=NN])",
       leftContextSize = 0,
       rightContextSize = 1,
       ignoreCollocateCase = TRUE,
       minOccur = 5,
       addExamples = TRUE
     )
 }
 take_ca_wpe_i %>% show_table(max = 10000)
 ```

 ## German: *nehmen*

 ```{r nehmen_icc, echo=TRUE}
 # Collocation analysis on the connection returned by icc_con("ger"): the
 # lemma "nehmen" is the focus and the query requires a noun (tt/p=NN) directly
 # before it, so only the one token to the left is used as context.
 nehmen_ca_icc <-
   collocationAnalysis(
     icc_con("ger"),
     "focus([tt/p=NN] {[tt/l=nehmen]})",
     leftContextSize = 1,
     rightContextSize = 0,
     minOccur = 2,
     addExamples = TRUE
   )
 nehmen_ca_icc %>% show_table()
 ```

 ### For comparison based on the whole DeReKo

 ```{r nehmen_dereko}
 # Read the stored result from disk and render it.
 # NOTE(review): the other chunks call show_table(); the double underscore in
 # show__full_table() may be intentional or a typo -- confirm in common.R.
 nehmen_ca_dereko <- readRDS(file = "../data/ca_nehmen_dereko_examples")
 nehmen_ca_dereko %>%
   show__full_table()
 ```

 ## Norwegian: *ta*

 ```{r ta_icc, echo=T, message=FALSE}
 # Collocation analysis on the connection returned by icc_con("nor"): the
 # lemma "ta" is the focus and the query requires a noun directly after it,
 # so only the one token to the right is used as context.
 ta_ca_icc <-
   collocationAnalysis(
     icc_con("nor"),
     "focus({[ud/l=ta]} [ud/p=NOUN])",
     leftContextSize = 0,
     rightContextSize = 1,
     minOccur = 2,
     addExamples = TRUE
   )
 ta_ca_icc %>% show_table()
 ```
	---
	title: "ICC Written Launch"
	output:
	  html_document:
	    css: style.css
	    self_contained: yes
	date: "`r Sys.Date()`"
	---

	```{r setup, include=FALSE, echo=FALSE, warning=FALSE}
	# Global chunk defaults: hide the source code and suppress warnings in the
	# rendered report. The knitr option is `warning` (singular); the previously
	# used name `warnings` is not an option knitr consults, so warnings were
	# still being printed.
	knitr::opts_chunk$set(echo = FALSE, warning = FALSE)
	# Load the shared helpers and data used by the chunks below.
	# NOTE(review): presumably this defines `icc`, `genre`, `icc_con()`,
	# `show_table()` and the `*_ids()` theme/scale helpers -- confirm in common.R.
	source("common.R")
	```

	# Actual composition of ICC parts

	## Composition by ICC genre

	```{r composition_by_genre, message = FALSE}
	# Token counts for every combination of an `icc` row and a genre. Each row is
	# queried separately (rowwise) with a virtual-corpus definition of the form
	# "iccGenre=<genre>".
	# NOTE(review): `icc`, `genre` and `icc_con()` are not defined in this file;
	# presumably they come from common.R -- confirm.
	icc_genre <- icc %>%
	  expand_grid(genre) %>%
	  mutate(vc = paste0("iccGenre=", genre)) %>%
	  rowwise() %>%
	  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)

	# Stacked bars per language, filled by genre. Segments with a non-zero token
	# count are labelled with that count, centred vertically in the segment.
	icc_genre %>%
	  ggplot(aes(x = lang, fill = genre, y = tokens)) +
	  geom_col() +
	  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
	  theme_ids() +
	  scale_fill_ids() +
	  geom_text(
	    aes(label = if_else(tokens > 0, as.character(tokens), ""), y = tokens),
	    position = position_stack(reverse = FALSE, vjust = 0.5),
	    color = "black",
	    size = 3.2,
	    family = "Fira Sans Condensed"
	  )

	```

	## Composition by date of publication


	```{r composition_by_pubdate, message=F, warning=F}
	# Publication years to query, one virtual corpus per year.
	year <- 1986:2023

	# Token counts for every combination of an `icc` row and a year, queried row
	# by row with a virtual-corpus definition of the form "pubDate in <year>".
	icc_year <- icc %>%
	  expand_grid(year) %>%
	  mutate(vc = paste0("pubDate in ", year)) %>%
	  rowwise() %>%
	  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)

	# Loess-smoothed token counts over time, drawn as one translucent area per
	# language. The lower y limit of 0 is set on the same scale that formats the
	# labels: a separate ylim() call would be replaced by the later
	# scale_y_continuous() and would therefore have no effect.
	icc_year %>%
	  ggplot(aes(x = year, fill = lang, color = lang, y = tokens)) +
	  # geom_smooth(se=F, span=0.25) +
	  xlim(1990, 2023) +
	  stat_smooth(
	    geom = "area", method = "loess", span = 1 / 4,
	    alpha = 0.1
	  ) +
	  # geom_area(alpha=0.1, position = "identity") +
	  scale_fill_ids() +
	  scale_colour_ids() +
	  scale_y_continuous(
	    limits = c(0, NA),
	    labels = label_number(scale_cut = cut_short_scale())
	  ) +
	  theme_ids()
	```

	## Part-of-Speech proportions

	```{r pos_proportions}
	# Part-of-speech tags to include in the plot; the commented-out tags are
	# deliberately left out of the query.
	POS_tag <- c(
	  "ADJ", "ADP", # "PUNCT",
	  "ADV", "AUX", # "SYM",
	  # "INTJ",
	  "CCONJ", # "X",
	  "NOUN", "DET",
	  "PROPN", # "NUM",
	  "VERB", # "PART",
	  "PRON",
	  "SCONJ"
	)

	# One frequency query of the form "[ud/p=<POS>]" per combination of an `icc`
	# row and a tag; the `f` column of the query result is kept.
	# NOTE(review): `f` is presumably a relative frequency, since the label below
	# multiplies it by 100 and prints it as a percentage -- confirm.
	icc_by_pos_tag <- icc %>%
	  expand_grid(POS = POS_tag) %>%
	  rowwise() %>%
	  mutate(f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f)

	# Stacked bars per language, filled by tag, each segment labelled with its
	# percentage, centred vertically in the segment.
	icc_by_pos_tag %>%
	  ggplot(aes(x = lang, fill = POS, y = f)) +
	  geom_col() +
	  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
	  scale_fill_ids() +
	  scale_color_ids() +
	  theme_ids(base_size = 12) +
	  geom_text(
	    aes(label = sprintf("%.2f%%", 100 * f), y = f),
	    position = position_stack(reverse = FALSE, vjust = 0.5),
	    color = "black",
	    size = 3.2,
	    family = "Fira Sans Condensed"
	  )
	```

	# Pilot study: Identification of Light Verb Constructions with *take*

	```{r prepare_ca, output=FALSE, message=FALSE}




	```

	## English: *take*

	```{r take_icc, echo=TRUE, message=FALSE}
	# Collocation analysis on the connection returned by icc_con("eng"): the
	# lemma "take" is the focus and the query requires a noun directly after it,
	# so only the one token to the right is used as context.
	take_ca_icc <-
	  collocationAnalysis(
	    icc_con("eng"),
	    "focus({[ud/l=take]} [ud/p=NOUN])",
	    leftContextSize = 0,
	    rightContextSize = 1,
	    minOccur = 2,
	    addExamples = TRUE
	  )

	take_ca_icc %>% show_table()
	```

	### For comparison based on English Wikipedia
	#### (Snapshot from 2015 with 2.4 billion words, see [here](https://www.ids-mannheim.de/digspra/kl/projekte/korpora/verfuegbarkeit/))

	```{r take_wpe, echo=TRUE}
	# Comparison run against the KorAP instance at .../instance/english. The
	# analysis is only executed when no stored result exists at the path below.
	# NOTE(review): a freshly computed result is not written back to that path,
	# so the query is repeated on every knit until the file is provided --
	# confirm whether a saveRDS() call is wanted here.
	take_ca_wpe_path <- "../data/take_ca_wpe"

	if (file.exists(take_ca_wpe_path)) {
	  take_ca_wpe_i <- readRDS(take_ca_wpe_path)
	} else {
	  wpe <- new(
	    "KorAPConnection",
	    "https://korap.ids-mannheim.de/instance/english",
	    verbose = TRUE
	  )
	  # Lemma "take" as focus, a noun (tt/p=NN) directly after it as collocate.
	  take_ca_wpe_i <-
	    collocationAnalysis(
	      wpe,
	      "focus({[tt/l=take]} [tt/p=NN])",
	      leftContextSize = 0,
	      rightContextSize = 1,
	      ignoreCollocateCase = TRUE,
	      minOccur = 5,
	      addExamples = TRUE
	    )
	}
	take_ca_wpe_i %>% show_table(max = 10000)
	```

	## German: *nehmen*

	```{r nehmen_icc, echo=TRUE}
	# Collocation analysis on the connection returned by icc_con("ger"): the
	# lemma "nehmen" is the focus and the query requires a noun (tt/p=NN) directly
	# before it, so only the one token to the left is used as context.
	nehmen_ca_icc <-
	  collocationAnalysis(
	    icc_con("ger"),
	    "focus([tt/p=NN] {[tt/l=nehmen]})",
	    leftContextSize = 1,
	    rightContextSize = 0,
	    minOccur = 2,
	    addExamples = TRUE
	  )
	nehmen_ca_icc %>% show_table()
	```

	### For comparison based on the whole DeReKo

	```{r nehmen_dereko}
	# Read the stored result from disk and render it.
	# NOTE(review): the other chunks call show_table(); the double underscore in
	# show__full_table() may be intentional or a typo -- confirm in common.R.
	nehmen_ca_dereko <- readRDS(file = "../data/ca_nehmen_dereko_examples")
	nehmen_ca_dereko %>%
	  show__full_table()
	```

	## Norwegian: *ta*

	```{r ta_icc, echo=T, message=FALSE}
	# Collocation analysis on the connection returned by icc_con("nor"): the
	# lemma "ta" is the focus and the query requires a noun directly after it,
	# so only the one token to the right is used as context.
	ta_ca_icc <-
	  collocationAnalysis(
	    icc_con("nor"),
	    "focus({[ud/l=ta]} [ud/p=NOUN])",
	    leftContextSize = 0,
	    rightContextSize = 1,
	    minOccur = 2,
	    addExamples = TRUE
	  )
	ta_ca_icc %>% show_table()
	```