R/poster.Rmd - ICC/2023-07-20-ICC-ICLC10 - Gitiles

 ---
 title: "News from the International Comparable Corpus"
 subtitle: "First launch of ICC written"
 date: "`r Sys.Date()`"
 author:
     - name: Marc Kupietz
       affil: 1
     - name: Adrien Barbaresi
       affil: 2
     - name: Anna Čermáková
       affil: 3
     - name: Małgorzata Czachor
       affil: 4
     - name: Nils Diewald
       affil: 1
     - name: Jarle Ebeling
       affil: 5
     - name: Rafał L. Górski
       affil: 4
     - name: John Kirk
       affil: 6
     - name: Michal Křen
       affil: 3
     - name: Harald Lüngen
       affil: 1
     - name: Eliza Margaretha
       affil: 1
     - name: Signe Oksefjell Ebeling
       affil: 5
     - name: Mícheál Ó Meachair
       affil: 7
     - name: Ines Pisetta
       affil: 1
     - name: Elaine Uí Dhonnchadha
       affil: 8
     - name: Friedemann Vogel
       affil: 9
     - name: Rebecca Wilm
       affil: 1
     - name: Jiajin Xu
       affil: 10
     - name: Rameela Yaddehige
       affil: 1
 affiliation:
   - num: 1
     address: IDS Mannheim
   - num: 2
     address: BBAW Berlin
   - num: 3
     address: Charles University
   - num: 4
     address: Polish Academy of Sciences
   - num: 5
     address: University of Oslo
   - num: 6
     address: University of Vienna
   - num: 7
     address: Dublin City University
   - num: 8
     address: Trinity College Dublin
   - num: 9
     address: University of Siegen
   - num: 10
     address: Beijing Foreign Studies University


 logoleft_name: "../Figures/ICC_COL.svg"
 author_textsize: "32pt"

 contact:
   qrcode: icc_qrcode.svg

 output:
   posterdown::posterdown_ids:
         self_contained: false
         keep_md: true

 bibliography: ../tex/references.bib
 csl: ids.csl
 ---

 ```{r setup, include=FALSE, echo=FALSE, warning=FALSE}
 # Chunk setup: load the QR-code package, set knitr defaults, pull in shared code.
 library(qrcode)
 # The knitr chunk option is `warning` (singular). The former `warnings` key is
 # not a knitr option, so warnings were not actually suppressed by default.
 knitr::opts_chunk$set(dev = "svg", echo = FALSE, warning = FALSE)
 # presumably defines `icc`, `genre`, `icc_con()`, `theme_ids()`, ... that the
 # chunks below rely on -- confirm against common.R
 source("common.R")
 # Write the QR code for the ICC KorAP instance to the file referenced in the
 # YAML header (`contact: qrcode:`).
 generate_svg(
   qr_code("https://korap.ids-mannheim.de/instance/icc"),
   "icc_qrcode.svg"
 )
 ```
 # ICC aims & characteristics

 * open initiative [@cermakova_international_2021]
 * to improve the empirical basis for contrastive linguistics
 * by compiling comparable corpora for many languages
 * and making them as freely available as possible
 * also by providing tools to query and analyse them
 * mostly based on existing corpora
 * mimics the composition of ICE

 # Current alpha launch

 ## Composition of parts
 ### By ICC genre

 ```{r composition_by_genre, message = FALSE, fig.width=14, fig.height=10, out.width = "100%"}
 # Token counts per language and ICC genre: every row of `icc` is crossed with
 # every value of `genre`, and the genre restriction is passed to corpusStats()
 # as the `vc` argument.
 icc_genre <- icc %>%
   expand_grid(genre) %>%
   mutate(vc = paste0("iccGenre=", genre)) %>%
   # rowwise() so that corpusStats() is evaluated once per (language, genre) row
   rowwise() %>%
   mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)

 # Stacked bars per language; each non-empty segment is labelled with its
 # token count, centred inside the segment.
 icc_genre %>%
   ggplot(aes(x = lang, fill = genre, y = tokens)) +
   geom_col() +
   scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
   theme_ids(base_size = 24) +
   theme(
     axis.title.x = element_text(size = rel(1.5), face = "bold"),
     axis.title.y = element_text(size = rel(1.5), face = "bold"),
     axis.text = element_text(size = rel(0.70)),
     legend.title = element_text(size = rel(0.85), face = "bold"),
     legend.text = element_text(size = rel(1))
   ) +
   scale_fill_ids() +
   geom_text(
     # Empty label for zero-token segments so no "0" is printed on the plot.
     aes(label = if_else(tokens > 0, as.character(tokens), ""), y = tokens),
     # `FALSE` instead of `F`: `F` is an ordinary variable that can be reassigned.
     position = position_stack(reverse = FALSE, vjust = 0.5),
     color = "black",
     size = 6.2,
     family = "Fira Sans Condensed"
   )

 ```

 ### By date of publication


 ```{r composition_by_pubdate, message=F, warning=F, fig.width=14, fig.height=7, out.width = "100%"}
 # Publication years to query; `1986:2023` is already a vector, no c() needed.
 year <- 1986:2023

 # Token counts per language and publication year: every row of `icc` is
 # crossed with every year and the restriction is passed to corpusStats() as
 # the `vc` argument.
 icc_year <- icc %>%
   expand_grid(year) %>%
   mutate(vc = paste0("pubDate in ", year)) %>%
   # rowwise() so that corpusStats() is evaluated once per (language, year) row
   rowwise() %>%
   mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)

 # Loess-smoothed token counts over publication year, drawn as one translucent
 # area per language.
 icc_year %>%
   ggplot(aes(x = year, fill = lang, color = lang, y = tokens)) +
   # geom_smooth(se = FALSE, span = 0.25) +
   # NOTE(review): years 1986-1989 are queried above but lie outside these
   # x limits -- confirm whether querying them is intended.
   xlim(1990, 2023) +
   # The former `ylim(0, NA)` was removed: scale_y_continuous() below replaces
   # any y scale added earlier, so it never influenced the rendered plot.
   stat_smooth(
     geom = "area",
     method = "loess",
     span = 1 / 4,
     alpha = 0.1
   ) +
   # geom_area(alpha = 0.1, position = "identity") +
   scale_fill_ids() +
   scale_colour_ids() +
   scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
   theme_ids(base_size = 24) +
   theme(
     axis.title.x = element_text(size = rel(1.5), face = "bold"),
     axis.title.y = element_text(size = rel(1.5), face = "bold"),
     axis.text = element_text(size = rel(1)),
     legend.title = element_text(size = rel(1), face = "bold"),
     legend.text = element_text(size = rel(1))
   )
 ```

 ### Part-of-Speech proportions

 ```{r pos_proportions, fig.width=14, fig.height=10, out.width = "100%"}
 # POS tags to include in the plot; the commented-out tags are deliberately
 # left out.
 POS_tag <- c(
   "ADJ", "ADP", # "PUNCT",
   "ADV", "AUX", # "SYM",
   # "INTJ",
   "CCONJ", # "X",
   "NOUN", "DET",
   "PROPN", # "NUM",
   "VERB", # "PART",
   "PRON",
   "SCONJ"
 )

 # One frequency query per (language, POS tag) pair; `f` is the `f` column of
 # the frequencyQuery() result. It is printed as a percentage below, so it is
 # presumably a relative frequency -- confirm.
 icc_by_pos_tag <- icc %>%
   expand_grid(POS = POS_tag) %>%
   rowwise() %>%
   mutate(f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f)

 # Stacked bars per language, one segment per POS tag, each labelled with its
 # share formatted to two decimals.
 icc_by_pos_tag %>%
   ggplot(aes(x = lang, fill = POS, y = f)) +
   geom_col() +
   scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
   scale_fill_ids() +
   scale_color_ids() +
   theme_ids(base_size = 24) +
   theme(
     axis.title.x = element_text(size = rel(1.5), face = "bold"),
     axis.title.y = element_text(size = rel(1.5), face = "bold"),
     axis.text = element_text(size = rel(1)),
     legend.title = element_text(size = rel(1), face = "bold"),
     legend.text = element_text(size = rel(1))
   ) +
   geom_text(
     aes(label = sprintf("%.2f%%", 100 * f), y = f),
     # `FALSE` instead of `F`: `F` is an ordinary variable that can be reassigned.
     position = position_stack(reverse = FALSE, vjust = 0.5),
     color = "black",
     size = 6.2,
     family = "Fira Sans Condensed"
   )
 ```

 # Identification of Light Verb Constructions with *take*


 ## English: *take*

 ```{r take_icc, echo=TRUE, message=FALSE}
 # Collocates in the one-token window to the right of "take" + noun matches
 take_ca_icc <-
   collocationAnalysis(
     icc_con("eng"),
     "focus({[ud/l=take]} [ud/p=NOUN])",
     leftContextSize = 0,
     rightContextSize = 1,
     minOccur = 2,
     addExamples = TRUE
   )

 take_ca_icc %>% show_table()
 ```

 # References
	---
	title: "News from the International Comparable Corpus"
	subtitle: "First launch of ICC written"
	date: "`r Sys.Date()`"
	author:
	- name: Marc Kupietz
	affil: 1
	- name: Adrien Barbaresi
	affil: 2
	- name: Anna Čermáková
	affil: 3
	- name: Małgorzata Czachor
	affil: 4
	- name: Nils Diewald
	affil: 1
	- name: Jarle Ebeling
	affil: 5
	- name: Rafał L. Górski
	affil: 4
	- name: John Kirk
	affil: 6
	- name: Michal Křen
	affil: 3
	- name: Harald Lüngen
	affil: 1
	- name: Eliza Margaretha
	affil: 1
	- name: Signe Oksefjell Ebeling
	affil: 5
	- name: Mícheál Ó Meachair
	affil: 7
	- name: Ines Pisetta
	affil: 1
	- name: Elaine Uí Dhonnchadha
	affil: 8
	- name: Friedemann Vogel
	affil: 9
	- name: Rebecca Wilm
	affil: 1
	- name: Jiajin Xu
	affil: 10
	- name: Rameela Yaddehige
	affil: 1
	affiliation:
	- num: 1
	address: IDS Mannheim
	- num: 2
	address: BBAW Berlin
	- num: 3
	address: Charles University
	- num: 4
	address: Polish Academy of Sciences
	- num: 5
	address: University of Oslo
	- num: 6
	address: University of Vienna
	- num: 7
	address: Dublin City University
	- num: 8
	address: Trinity College Dublin
	- num: 9
	address: University of Siegen
	- num: 10
	address: Beijing Foreign Studies University


	logoleft_name: "../Figures/ICC_COL.svg"
	author_textsize: "32pt"

	contact:
	qrcode: icc_qrcode.svg

	output:
	posterdown::posterdown_ids:
	self_contained: false
	keep_md: true

	bibliography: ../tex/references.bib
	csl: ids.csl
	---

	```{r setup, include=FALSE, echo=FALSE, warning=FALSE}
	# Chunk setup: load the QR-code package, set knitr defaults, pull in shared code.
	library(qrcode)
	# The knitr chunk option is `warning` (singular). The former `warnings` key is
	# not a knitr option, so warnings were not actually suppressed by default.
	knitr::opts_chunk$set(dev = "svg", echo = FALSE, warning = FALSE)
	# presumably defines `icc`, `genre`, `icc_con()`, `theme_ids()`, ... that the
	# chunks below rely on -- confirm against common.R
	source("common.R")
	# Write the QR code for the ICC KorAP instance to the file referenced in the
	# YAML header (`contact: qrcode:`).
	generate_svg(
	  qr_code("https://korap.ids-mannheim.de/instance/icc"),
	  "icc_qrcode.svg"
	)
	```
	# ICC aims & characteristics

	* open initiative [@cermakova_international_2021]
	* to improve the empirical basis for contrastive linguistics
	* by compiling comparable corpora for many languages
	* and making them as freely available as possible
	* also by providing tools to query and analyse them
	* mostly based on existing corpora
	* mimics the composition of ICE

	# Current alpha launch

	## Composition of parts
	### By ICC genre

	```{r composition_by_genre, message = FALSE, fig.width=14, fig.height=10, out.width = "100%"}
	# Token counts per language and ICC genre: every row of `icc` is crossed with
	# every value of `genre`, and the genre restriction is passed to corpusStats()
	# as the `vc` argument.
	icc_genre <- icc %>%
	  expand_grid(genre) %>%
	  mutate(vc = paste0("iccGenre=", genre)) %>%
	  # rowwise() so that corpusStats() is evaluated once per (language, genre) row
	  rowwise() %>%
	  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)

	# Stacked bars per language; each non-empty segment is labelled with its
	# token count, centred inside the segment.
	icc_genre %>%
	  ggplot(aes(x = lang, fill = genre, y = tokens)) +
	  geom_col() +
	  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
	  theme_ids(base_size = 24) +
	  theme(
	    axis.title.x = element_text(size = rel(1.5), face = "bold"),
	    axis.title.y = element_text(size = rel(1.5), face = "bold"),
	    axis.text = element_text(size = rel(0.70)),
	    legend.title = element_text(size = rel(0.85), face = "bold"),
	    legend.text = element_text(size = rel(1))
	  ) +
	  scale_fill_ids() +
	  geom_text(
	    # Empty label for zero-token segments so no "0" is printed on the plot.
	    aes(label = if_else(tokens > 0, as.character(tokens), ""), y = tokens),
	    # `FALSE` instead of `F`: `F` is an ordinary variable that can be reassigned.
	    position = position_stack(reverse = FALSE, vjust = 0.5),
	    color = "black",
	    size = 6.2,
	    family = "Fira Sans Condensed"
	  )

	```

	### By date of publication


	```{r composition_by_pubdate, message=F, warning=F, fig.width=14, fig.height=7, out.width = "100%"}
	# Publication years to query; `1986:2023` is already a vector, no c() needed.
	year <- 1986:2023

	# Token counts per language and publication year: every row of `icc` is
	# crossed with every year and the restriction is passed to corpusStats() as
	# the `vc` argument.
	icc_year <- icc %>%
	  expand_grid(year) %>%
	  mutate(vc = paste0("pubDate in ", year)) %>%
	  # rowwise() so that corpusStats() is evaluated once per (language, year) row
	  rowwise() %>%
	  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)

	# Loess-smoothed token counts over publication year, drawn as one translucent
	# area per language.
	icc_year %>%
	  ggplot(aes(x = year, fill = lang, color = lang, y = tokens)) +
	  # geom_smooth(se = FALSE, span = 0.25) +
	  # NOTE(review): years 1986-1989 are queried above but lie outside these
	  # x limits -- confirm whether querying them is intended.
	  xlim(1990, 2023) +
	  # The former `ylim(0, NA)` was removed: scale_y_continuous() below replaces
	  # any y scale added earlier, so it never influenced the rendered plot.
	  stat_smooth(
	    geom = "area",
	    method = "loess",
	    span = 1 / 4,
	    alpha = 0.1
	  ) +
	  # geom_area(alpha = 0.1, position = "identity") +
	  scale_fill_ids() +
	  scale_colour_ids() +
	  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
	  theme_ids(base_size = 24) +
	  theme(
	    axis.title.x = element_text(size = rel(1.5), face = "bold"),
	    axis.title.y = element_text(size = rel(1.5), face = "bold"),
	    axis.text = element_text(size = rel(1)),
	    legend.title = element_text(size = rel(1), face = "bold"),
	    legend.text = element_text(size = rel(1))
	  )
	```

	### Part-of-Speech proportions

	```{r pos_proportions, fig.width=14, fig.height=10, out.width = "100%"}
	# POS tags to include in the plot; the commented-out tags are deliberately
	# left out.
	POS_tag <- c(
	  "ADJ", "ADP", # "PUNCT",
	  "ADV", "AUX", # "SYM",
	  # "INTJ",
	  "CCONJ", # "X",
	  "NOUN", "DET",
	  "PROPN", # "NUM",
	  "VERB", # "PART",
	  "PRON",
	  "SCONJ"
	)

	# One frequency query per (language, POS tag) pair; `f` is the `f` column of
	# the frequencyQuery() result. It is printed as a percentage below, so it is
	# presumably a relative frequency -- confirm.
	icc_by_pos_tag <- icc %>%
	  expand_grid(POS = POS_tag) %>%
	  rowwise() %>%
	  mutate(f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f)

	# Stacked bars per language, one segment per POS tag, each labelled with its
	# share formatted to two decimals.
	icc_by_pos_tag %>%
	  ggplot(aes(x = lang, fill = POS, y = f)) +
	  geom_col() +
	  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
	  scale_fill_ids() +
	  scale_color_ids() +
	  theme_ids(base_size = 24) +
	  theme(
	    axis.title.x = element_text(size = rel(1.5), face = "bold"),
	    axis.title.y = element_text(size = rel(1.5), face = "bold"),
	    axis.text = element_text(size = rel(1)),
	    legend.title = element_text(size = rel(1), face = "bold"),
	    legend.text = element_text(size = rel(1))
	  ) +
	  geom_text(
	    aes(label = sprintf("%.2f%%", 100 * f), y = f),
	    # `FALSE` instead of `F`: `F` is an ordinary variable that can be reassigned.
	    position = position_stack(reverse = FALSE, vjust = 0.5),
	    color = "black",
	    size = 6.2,
	    family = "Fira Sans Condensed"
	  )
	```

	# Identification of Light Verb Constructions with *take*


	## English: *take*

	```{r take_icc, echo=TRUE, message=FALSE}
	# Collocates in the one-token window to the right of "take" + noun matches
	take_ca_icc <-
	  collocationAnalysis(
	    icc_con("eng"),
	    "focus({[ud/l=take]} [ud/p=NOUN])",
	    leftContextSize = 0,
	    rightContextSize = 1,
	    minOccur = 2,
	    addExamples = TRUE
	  )

	take_ca_icc %>% show_table()
	```

	# References