Blame - R/report.Rmd - ICC/2023-07-20-ICC-ICLC10

blob: 48041150c2a9b19b486c2f6301398893c33283b2 [file] [log] [blame]

Marc Kupietz	6e21b10	2023-06-02 18:04:04 +0200	[diff] [blame]	1	---
				2	title: "ICC Written Launch"
				3	output:
				4	  html_document:
				5	    css: style.css
				6	    self_contained: yes
				7	date: "`r Sys.Date()`"
				8	---
				9
				10	```{r setup, include=FALSE, echo=FALSE, warning=FALSE}
				11	knitr::opts_chunk$set(echo = FALSE, warning = FALSE) # global chunk defaults; the knitr option is `warning` (the former `warnings` is not a knitr option, so warnings were not suppressed)
				12	source("common.R") # presumably provides icc, icc_con(), theme_ids(), show_table() etc. used below -- confirm
				13	```
				14
				15	# Actual composition of ICC parts
				16
				17	## Composition by ICC genre
				18
				19	```{r composition_by_genre, message = FALSE}
				20	icc_genre <- icc %>% # token counts for every combination of icc row and genre
				21	expand_grid(genre) %>%
				22	mutate(vc = paste0("iccGenre=", genre)) %>% # query string "iccGenre=<genre>" handed to corpusStats() as vc
				23	rowwise() %>% # evaluate the following mutate() once per row
				24	mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)
				25
				26	icc_genre %>% ggplot(aes(x = lang, fill = genre, y = tokens)) + # stacked bars: one bar per lang, segments by genre
				27	geom_col() + scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
				28	theme_ids() +
Marc Kupietz	07645fb	2023-06-07 11:31:07 +0200	[diff] [blame]	29	scale_fill_ids() +
				30	geom_text(aes(label = if_else(tokens > 0, as.character(tokens), ""), y = tokens), position = position_stack(reverse = FALSE, vjust = 0.5), color = "black", size = 3.2, family = "Fira Sans Condensed") # centred count labels; empty label where tokens is 0
Marc Kupietz	6e21b10	2023-06-02 18:04:04 +0200	[diff] [blame]	31
				32	```
				33
				34	## Composition by date of publication
				35
				36
Marc Kupietz	985e893	2023-06-07 17:48:59 +0200	[diff] [blame]	37	```{r composition_by_pubdate, message=FALSE, warning=FALSE}
				38	year <- 1986:2023 # publication years to query
Marc Kupietz	6e21b10	2023-06-02 18:04:04 +0200	[diff] [blame]	39
				40	icc_year <- icc %>% # token counts for every combination of icc row and year
				41	expand_grid(year) %>%
				42	mutate(vc = paste0("pubDate in ", year)) %>% # query string "pubDate in <year>" handed to corpusStats() as vc
				43	rowwise() %>% # evaluate the following mutate() once per row
				44	mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)
				45
				46	icc_year %>% ggplot(aes(x = year, fill = lang, color = lang, y = tokens)) +
Marc Kupietz	985e893	2023-06-07 17:48:59 +0200	[diff] [blame]	47	# geom_smooth(se=F, span=0.25) +
				48	xlim(1990, 2023) + # NOTE(review): years 1986-1989 are queried above but fall outside this x range -- confirm intended
				49	# the y lower bound of 0 is set via limits= in scale_y_continuous() below; a separate ylim(0, NA) here would be replaced by that later y scale
				50	stat_smooth(
				51	geom = 'area', method = 'loess', span = 1/4,
				52	alpha = 0.1) +
				53	# geom_area(alpha=0.1, position = "identity") +
				54	scale_fill_ids() + scale_colour_ids() +
Marc Kupietz	6e21b10	2023-06-02 18:04:04 +0200	[diff] [blame]	55	scale_y_continuous(limits = c(0, NA), labels = label_number(scale_cut = cut_short_scale())) +
				56	theme_ids()
				57	```
				58
				59	## Part-of-Speech proportions
				60
				61	```{r pos_proportions}
				62	POS_tag <- c( # tags to query; commented-out entries are left out of the plot
				63	"ADJ", "ADP",# "PUNCT",
				64	"ADV", "AUX", # "SYM",
Marc Kupietz	ddda028	2023-06-07 17:48:37 +0200	[diff] [blame]	65	# "INTJ",
				66	"CCONJ", # "X",
Marc Kupietz	6e21b10	2023-06-02 18:04:04 +0200	[diff] [blame]	67	"NOUN", "DET",
				68	"PROPN", #"NUM",
				69	"VERB", #"PART",
				70	"PRON",
				71	"SCONJ"
				72	)
				73
				74	icc_by_pos_tag <- icc %>% expand_grid(POS = POS_tag) %>% # one row per combination of icc row and tag
				75	rowwise() %>% # evaluate the following mutate() once per row
				76	mutate(f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f) # f column of the query result for "[ud/p=<tag>]"
				77
				78	icc_by_pos_tag %>% ggplot(aes(x = lang, fill = POS, y = f)) + # stacked bars: one bar per lang, segments by tag
				79	geom_col() + scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
Marc Kupietz	d5540e9	2023-06-07 17:48:01 +0200	[diff] [blame]	80	scale_fill_ids() + scale_color_ids() +
Marc Kupietz	6e21b10	2023-06-02 18:04:04 +0200	[diff] [blame]	81	theme_ids(base_size = 12) +
Marc Kupietz	d5540e9	2023-06-07 17:48:01 +0200	[diff] [blame]	82	geom_text(aes(label = sprintf("%.2f%%", 100 * f), y = f), position = position_stack(reverse = FALSE, vjust = 0.5), color = "black", size = 3.2, family = "Fira Sans Condensed") # centred labels showing 100*f with two decimals and a percent sign
Marc Kupietz	6e21b10	2023-06-02 18:04:04 +0200	[diff] [blame]	83	```
				84
				85	# Pilot study: Identification of Light Verb Constructions with take
				86
				87	```{r prepare_ca, output=FALSE, message=FALSE}
				88
				89
				90
				91
				92	```
				93
				94	## English: take
				95
				96	```{r take_icc, echo=TRUE, message=FALSE}
				97	take_ca_icc <-
				98	collocationAnalysis(
				99	icc_con("eng"),
				100	"focus({[ud/l=take]} [ud/p=NOUN])", # lemma "take" directly followed by a NOUN
				101	leftContextSize = 0,
				102	rightContextSize = 1,
				103	minOccur = 2,
				104	addExamples = TRUE
				105	)
				106
				107	take_ca_icc %>% show_table()
				108	```
				109
Marc Kupietz	fdac64b	2023-06-13 08:32:37 +0200	[diff] [blame]	110	### For comparison based on English Wikipedia
				111	#### (Snapshot from 2015 with 2.4 billion words, see [here](https://www.ids-mannheim.de/digspra/kl/projekte/korpora/verfuegbarkeit/))
				112
				113	```{r take_wpe, echo=TRUE}
				114	if (file.exists("../data/take_ca_wpe")) { # reuse the stored result when the file is present
Marc Kupietz	d52056a	2023-06-26 20:38:03 +0200	[diff] [blame]	115	take_ca_wpe_i <- readRDS("../data/take_ca_wpe")
Marc Kupietz	fdac64b	2023-06-13 08:32:37 +0200	[diff] [blame]	116	} else { # otherwise run the query; the result is not written back to ../data/take_ca_wpe here
				117	wpe <- new("KorAPConnection", "https://korap.ids-mannheim.de/instance/english", verbose = TRUE)
Marc Kupietz	d52056a	2023-06-26 20:38:03 +0200	[diff] [blame]	118	take_ca_wpe_i <-
Marc Kupietz	fdac64b	2023-06-13 08:32:37 +0200	[diff] [blame]	119	collocationAnalysis(
				120	wpe,
				121	"focus({[tt/l=take]} [tt/p=NN])",
				122	leftContextSize = 0,
				123	rightContextSize = 1,
Marc Kupietz	d52056a	2023-06-26 20:38:03 +0200	[diff] [blame]	124	ignoreCollocateCase = TRUE,
Marc Kupietz	fdac64b	2023-06-13 08:32:37 +0200	[diff] [blame]	125	minOccur = 5,
				126	addExamples = TRUE
				127	)
				128	}
Marc Kupietz	d52056a	2023-06-26 20:38:03 +0200	[diff] [blame]	129	take_ca_wpe_i %>% show_table(max=10000)
Marc Kupietz	fdac64b	2023-06-13 08:32:37 +0200	[diff] [blame]	130	```
				131
Marc Kupietz	6e21b10	2023-06-02 18:04:04 +0200	[diff] [blame]	132	## German: nehmen
				133
				134	```{r nehmen_icc, echo=TRUE}
				135	nehmen_ca_icc <-
				136	collocationAnalysis(
				137	icc_con("ger"),
				138	"focus([tt/p=NN] {[tt/l=nehmen]})", # NN directly followed by lemma "nehmen"
				139	leftContextSize = 1,
				140	rightContextSize = 0,
				141	minOccur = 2,
				142	addExamples = TRUE
				143	)
				144	nehmen_ca_icc %>% show_table()
				145	```
				146
				147	### For comparison based on the whole DeReKo
				148
				149	```{r nehmen_dereko}
				150	nehmen_ca_dereko <- readRDS("../data/ca_nehmen_dereko_examples") # load a stored result from disk; no query is run in this chunk
Marc Kupietz	d52056a	2023-06-26 20:38:03 +0200	[diff] [blame]	151	nehmen_ca_dereko %>% show__full_table() # NOTE(review): double underscore in show__full_table -- confirm it matches the helper's actual name (other chunks call show_table)
Marc Kupietz	6e21b10	2023-06-02 18:04:04 +0200	[diff] [blame]	152	```
				153
				154	## Norwegian: ta
				155
Marc Kupietz	d75a179	2023-06-07 17:47:23 +0200	[diff] [blame]	156	```{r ta_icc, echo=TRUE, message=FALSE}
Marc Kupietz	6e21b10	2023-06-02 18:04:04 +0200	[diff] [blame]	157	ta_ca_icc <-
				158	collocationAnalysis(
				159	icc_con("nor"),
				160	"focus({[ud/l=ta]} [ud/p=NOUN])", # lemma "ta" directly followed by a NOUN
				161	leftContextSize = 0,
				162	rightContextSize = 1,
				163	minOccur = 2,
				164	addExamples = TRUE
				165	)
				166	ta_ca_icc %>% show_table()
				167	```
				168