Blame - R/report.Rmd - ICC/2023-07-20-ICC-ICLC10

blob: 5f28216498374c49e200eb14c81f61ecf8b8d59a [file] [log] [blame]

Marc Kupietz	6e21b10	2023-06-02 18:04:04 +0200	[diff] [blame^]	1	---
				2	title: "ICC Written Launch"
				3	output:
				4	  html_document:
				5	    css: style.css
				6	    self_contained: yes
				7	date: "`r Sys.Date()`"
				8	---
				9
				10	```{r setup, include=FALSE, echo=FALSE, warning=FALSE}
				11	knitr::opts_chunk$set(echo = FALSE, warning = FALSE) # knitr's option is `warning` (singular); `warnings` is not a chunk option and suppresses nothing
				12	source("common.R") # presumably supplies icc, icc_con(), theme_ids(), show_table() etc. used below — confirm
				13	```
				14
				15	# Actual composition of ICC parts
				16
				17	## Composition by ICC genre
				18
				19	```{r composition_by_genre, message = FALSE}
				20	icc_genre <- icc %>% # cross every row of icc with every value of genre
				21	  expand_grid(genre) %>%
				22	  mutate(vc = paste0("iccGenre=", genre)) %>% # per-row vc string passed to corpusStats()
				23	  rowwise() %>% # evaluate corpusStats() once per row
				24	  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)
				25	# Stacked bars: tokens per lang, filled by genre; rows with tokens == 0 get an empty label.
				26	icc_genre %>% ggplot(aes(x = lang, fill = genre, y = tokens)) +
				27	  geom_col() + scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
				28	  theme_ids() +
				29	  geom_text(aes(label = if_else(tokens > 0, as.character(tokens), ""), y = tokens), position = position_stack(reverse = FALSE, vjust = 0.5), color = "white", size = 3.2, family = "Fira Sans Condensed")
				30
				31	```
				32
				33	## Composition by date of publication
				34
				35
				36	```{r composition_by_pubdate}
				37	year <- 1988:2023
				38	# Token count for every (icc row, year) combination, plotted as one line per lang.
				39	icc_year <- icc %>%
				40	  expand_grid(year) %>%
				41	  mutate(vc = paste0("pubDate in ", year)) %>%
				42	  rowwise() %>%
				43	  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)
				44
				45	ggplot(icc_year, aes(x = year, y = tokens, colour = lang, fill = lang)) +
				46	  geom_line() +
				47	  geom_point() +
				48	  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
				49	  theme_ids()
				50	```
				51
				52	## Part-of-Speech proportions
				53
				54	```{r pos_proportions}
				55	POS_tag <- c( # tags to query; the commented-out tags are excluded
				56	  "ADJ", "ADP", # "PUNCT",
				57	  "ADV", "AUX", # "SYM",
				58	  "INTJ", "CCONJ", # "X",
				59	  "NOUN", "DET",
				60	  "PROPN", # "NUM",
				61	  "VERB", # "PART",
				62	  "PRON",
				63	  "SCONJ"
				64	)
				65	# One frequencyQuery() per (icc row, tag) combination; the result's f column is kept.
				66	icc_by_pos_tag <- icc %>% expand_grid(POS = POS_tag) %>%
				67	  rowwise() %>%
				68	  mutate(f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f)
				69	# Stacked bars per lang; each segment is labelled with 100 * f formatted as a percentage.
				70	icc_by_pos_tag %>% ggplot(aes(x = lang, fill = POS, y = f)) +
				71	  geom_col() + scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
				72	  theme_ids(base_size = 12) +
				73	  geom_text(aes(label = sprintf("%.2f%%", 100 * f), y = f), position = position_stack(reverse = FALSE, vjust = 0.5), color = "white", size = 3.2, family = "Fira Sans Condensed")
				74	```
				75
				76	# Pilot study: Identification of Light Verb Constructions with take
				77
				78	```{r prepare_ca, include=FALSE, message=FALSE}
				79	# Placeholder chunk for collocation-analysis preparation (currently empty).
				80	# `output` is not a documented knitr chunk option; `include=FALSE` keeps code and results out of the report.
				81
				82
				83	```
				84
				85	## English: take
				86
				87	```{r take_icc, echo=TRUE, message=FALSE}
				88	take_ca_icc <-
				89	  collocationAnalysis(
				90	    icc_con("eng"),
				91	    "focus({[ud/l=take]} [ud/p=NOUN])",
				92	    leftContextSize = 0, # context window: nothing to the left, one token to the right
				93	    rightContextSize = 1,
				94	    minOccur = 2,
				95	    addExamples = TRUE
				96	  )
				97
				98	take_ca_icc %>% show_table()
				99	```
				100
				101	## German: nehmen
				102
				103	```{r nehmen_icc, echo=TRUE, message=FALSE}
				104	nehmen_ca_icc <-
				105	  collocationAnalysis(
				106	    icc_con("ger"),
				107	    "focus([tt/p=NN] {[tt/l=nehmen]})",
				108	    leftContextSize = 1, # context window: one token to the left, nothing to the right
				109	    rightContextSize = 0,
				110	    minOccur = 2,
				111	    addExamples = TRUE
				112	  )
				113	nehmen_ca_icc %>% show_table()
				114	```
				115
				116	### For comparison based on the whole DeReKo
				117
				118	```{r nehmen_dereko}
				119	nehmen_ca_dereko <- readRDS(file = "../data/ca_nehmen_dereko_examples") # stored R object read from disk; no query is run in this chunk
				120	show_table(nehmen_ca_dereko)
				121	```
				122
				123	## Norwegian: ta
				124
				125	```{r ta_icc, echo=TRUE, message=FALSE, eval=FALSE}
				126	ta_ca_icc <- # NOTE(review): the fence had a stray "# " so this never ran; kept unevaluated via eval=FALSE — confirm before enabling
				127	  collocationAnalysis(
				128	    icc_con("nor"),
				129	    "focus({[ud/l=ta]} [ud/p=NOUN])",
				130	    leftContextSize = 0, # context window: nothing to the left, one token to the right
				131	    rightContextSize = 1,
				132	    minOccur = 2,
				133	    addExamples = TRUE
				134	  )
				135	ta_ca_icc %>% show_table()
				136	```
				137