| --- |
| title: "ICC Written Launch" |
| output: |
| html_document: |
| css: style.css |
| self_contained: yes |
| date: "`r Sys.Date()`" |
| --- |
| |
| ```{r setup, include=FALSE, echo=FALSE, warning=FALSE} |
| # Chunk defaults for the whole report: hide source code and suppress warnings. |
| # The knitr option is spelled `warning` (singular); `warnings` is not a knitr |
| # option and was silently ignored, so warnings leaked into the output. |
| knitr::opts_chunk$set(echo = FALSE, warning = FALSE) |
| # Shared project code; presumably defines `icc`, `icc_con()`, `show_table()`, |
| # `theme_ids()` etc. used by the chunks below (not visible in this file). |
| source("common.R") |
| ``` |
| |
| # Actual composition of ICC parts |
| |
| ## Composition by ICC genre |
| |
| ```{r composition_by_genre, message = FALSE} |
| # One row per combination of an `icc` row and a genre, each with a virtual-corpus |
| # query ("iccGenre=<genre>") and the token count of that virtual corpus. |
| # NOTE(review): `icc`, `genre` and `icc_con()` are not defined in this file -- |
| # presumably they come from common.R. |
| icc_genre <- icc %>% |
|   expand_grid(genre) %>% |
|   mutate(vc = paste0("iccGenre=", genre)) %>% |
|   # corpusStats() is evaluated once per row, hence rowwise(). |
|   rowwise() %>% |
|   mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens) |
| |
| # Stacked bars: tokens per language, split by genre. Every non-empty segment is |
| # labelled with its token count, centred inside the segment. |
| icc_genre %>% |
|   ggplot(aes(x = lang, fill = genre, y = tokens)) + |
|   geom_col() + |
|   scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) + |
|   theme_ids() + |
|   scale_fill_ids() + |
|   geom_text( |
|     aes(label = if_else(tokens > 0, as.character(tokens), ""), y = tokens), |
|     # Spell out FALSE: `F` is an ordinary variable that can be reassigned. |
|     position = position_stack(reverse = FALSE, vjust = 0.5), |
|     color = "black", |
|     size = 3.2, |
|     family = "Fira Sans Condensed" |
|   ) |
| |
| ``` |
| |
| ## Composition by date of publication |
| |
| |
| ```{r composition_by_pubdate} |
| # Publication years to query, one virtual corpus per year. |
| year <- 1988:2023 |
| |
| # Cross every `icc` row with every year and look up the token count of the |
| # virtual corpus "pubDate in <year>"; corpusStats() runs once per row. |
| icc_year <- |
|   icc %>% |
|   expand_grid(year) %>% |
|   mutate(vc = paste0("pubDate in ", year)) %>% |
|   rowwise() %>% |
|   mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens) |
| |
| # Tokens per publication year, one line (with point markers) per language. |
| ggplot(icc_year, aes(x = year, fill = lang, color = lang, y = tokens)) + |
|   geom_line() + |
|   geom_point() + |
|   scale_y_continuous( |
|     labels = label_number(scale_cut = cut_short_scale()) |
|   ) + |
|   theme_ids() |
| ``` |
| |
| ## Part-of-Speech proportions |
| |
| ```{r pos_proportions} |
| # Part-of-speech tags to compare. The tags kept in trailing comments |
| # (PUNCT, SYM, X, NUM, PART) are currently excluded from the plot. |
| POS_tag <- c( |
|   "ADJ", "ADP", # "PUNCT", |
|   "ADV", "AUX", # "SYM", |
|   "INTJ", "CCONJ", # "X", |
|   "NOUN", "DET", |
|   "PROPN", # "NUM", |
|   "VERB", # "PART", |
|   "PRON", |
|   "SCONJ" |
| ) |
| |
| # One row per combination of an `icc` row and a POS tag; `f` is the `f` value |
| # returned by frequencyQuery() for the query "[ud/p=<POS>]", evaluated per row. |
| icc_by_pos_tag <- icc %>% |
|   expand_grid(POS = POS_tag) %>% |
|   rowwise() %>% |
|   mutate(f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f) |
| |
| # Stacked bars per language, one segment per POS tag; each segment is labelled |
| # with 100 * f formatted as a percentage. |
| icc_by_pos_tag %>% |
|   ggplot(aes(x = lang, fill = POS, y = f)) + |
|   geom_col() + |
|   scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) + |
|   theme_ids(base_size = 12) + |
|   geom_text( |
|     aes(label = sprintf("%.2f%%", 100 * f), y = f), |
|     # Spell out FALSE: `F` is an ordinary variable that can be reassigned. |
|     position = position_stack(reverse = FALSE, vjust = 0.5), |
|     color = "white", |
|     size = 3.2, |
|     family = "Fira Sans Condensed" |
|   ) |
| ``` |
| |
| # Pilot study: Identification of Light Verb Constructions with *take* |
| |
| ```{r prepare_ca, include=FALSE, message=FALSE} |
| |
| |
| |
| |
| ``` |
| |
| ## English: *take* |
| |
| ```{r take_icc, echo=TRUE, message=FALSE} |
| # Collocation analysis on the English ICC part: lemma "take" with the noun in |
| # the position directly to its right (context window: 0 left, 1 right). |
| take_ca_icc <- |
|   collocationAnalysis( |
|     icc_con("eng"), |
|     "focus({[ud/l=take]} [ud/p=NOUN])", |
|     leftContextSize = 0, |
|     rightContextSize = 1, |
|     minOccur = 2, |
|     addExamples = TRUE |
|   ) |
| |
| take_ca_icc %>% show_table() |
| ``` |
| |
| ## German: *nehmen* |
| |
| ```{r nehmen_icc, echo=TRUE} |
| # Collocation analysis on the German ICC part: lemma "nehmen" with the noun in |
| # the position directly to its left (context window: 1 left, 0 right). |
| nehmen_ca_icc <- |
|   collocationAnalysis( |
|     icc_con("ger"), |
|     "focus([tt/p=NN] {[tt/l=nehmen]})", |
|     leftContextSize = 1, |
|     rightContextSize = 0, |
|     minOccur = 2, |
|     addExamples = TRUE |
|   ) |
| |
| nehmen_ca_icc %>% show_table() |
| ``` |
| |
| ### For comparison based on the whole DeReKo |
| |
| ```{r nehmen_dereko} |
| # Read the stored "nehmen" result for the whole DeReKo from disk (it is not |
| # recomputed here) and render it with the same table helper as the ICC results. |
| nehmen_ca_dereko <- readRDS("../data/ca_nehmen_dereko_examples") |
| show_table(nehmen_ca_dereko) |
| ``` |
| |
| ## Norwegian: *ta* |
| |
| <!-- NOTE(review): this chunk had been disabled by prefixing its opening fence with "# ", which rendered the fence as a heading, the code as plain text, and left the closing fence dangling. The fence is restored and the chunk stays disabled via eval=FALSE; remove eval=FALSE to run the analysis. --> |
| ```{r ta_icc, echo=TRUE, message=FALSE, eval=FALSE} |
| # Collocation analysis on the Norwegian ICC part: lemma "ta" with the noun in |
| # the position directly to its right (context window: 0 left, 1 right). |
| ta_ca_icc <- |
|   collocationAnalysis( |
|     icc_con("nor"), |
|     "focus({[ud/l=ta]} [ud/p=NOUN])", |
|     leftContextSize = 0, |
|     rightContextSize = 1, |
|     minOccur = 2, |
|     addExamples = TRUE |
|   ) |
| |
| ta_ca_icc %>% show_table() |
| ``` |
| |