---
title: "ICC Written Launch"
output:
  html_document:
    css: style.css
    self_contained: yes
date: "`r Sys.Date()`"
---
| 9 | |
```{r setup, include=FALSE, echo=FALSE, warning=FALSE}
# Chunk defaults for the whole report: hide the source code and suppress
# warnings in the rendered output. The knitr option is spelled `warning`
# (singular); a `warnings` key is not an option knitr acts on.
knitr::opts_chunk$set(echo = FALSE, warning = FALSE)

# Load the shared project script. The chunks below use names that are not
# defined in this document (presumably they come from here -- TODO confirm).
source("common.R")
```
| 14 | |
| 15 | # Actual composition of ICC parts |
| 16 | |
| 17 | ## Composition by ICC genre |
| 18 | |
```{r composition_by_genre, message = FALSE}
# Token counts per corpus and genre: cross every row of `icc` with each value
# of `genre`, build one virtual-corpus query string per combination and read
# the `tokens` slot of the corpusStats() result for it.
# NOTE(review): `icc`, `genre` and `icc_con()` are not defined in this
# document; presumably they are provided by common.R -- confirm.
icc_genre <- icc %>%
  expand_grid(genre) %>%
  mutate(vc = paste0("iccGenre=", genre)) %>%
  rowwise() %>% # corpusStats() is called once per row
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)

# Stacked bars per language, one segment per genre. The token count is printed
# in the vertical middle of each segment; empty segments get no label.
icc_genre %>%
  ggplot(aes(x = lang, fill = genre, y = tokens)) +
  geom_col() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  theme_ids() +
  geom_text(
    aes(label = if_else(tokens > 0, as.character(tokens), ""), y = tokens),
    position = position_stack(reverse = FALSE, vjust = 0.5),
    color = "white",
    size = 3.2,
    family = "Fira Sans Condensed"
  )
```
| 32 | |
| 33 | ## Composition by date of publication |
| 34 | |
| 35 | |
```{r composition_by_pubdate}
# Publication years to query; each year becomes its own virtual corpus.
year <- 1988:2023

# One row per combination of an `icc` row and a year, with the token count
# that corpusStats() reports for the matching "pubDate in <year>" query.
icc_year <- icc %>%
  expand_grid(year) %>%
  mutate(vc = paste0("pubDate in ", year)) %>%
  rowwise() %>% # one corpusStats() request per row
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)

# Tokens per publication year, one line with point markers per language.
ggplot(
  icc_year,
  aes(x = year, fill = lang, color = lang, y = tokens)
) +
  geom_line() +
  geom_point() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  theme_ids()
```
| 51 | |
| 52 | ## Part-of-Speech proportions |
| 53 | |
```{r pos_proportions}
# Part-of-speech tags to query. The tags left in trailing comments are
# deliberately excluded from the plot.
POS_tag <- c(
  "ADJ", "ADP", # "PUNCT",
  "ADV", "AUX", # "SYM",
  "INTJ", "CCONJ", # "X",
  "NOUN", "DET",
  "PROPN", # "NUM",
  "VERB", # "PART",
  "PRON",
  "SCONJ"
)

# One frequency query per (corpus, tag) pair; `f` is taken from the `f` column
# of the frequencyQuery() result.
# NOTE(review): the label below multiplies `f` by 100 and appends "%", so `f`
# is presumably a relative frequency -- confirm against the RKorAP docs.
icc_by_pos_tag <- icc %>%
  expand_grid(POS = POS_tag) %>%
  rowwise() %>% # one frequencyQuery() request per row
  mutate(f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f)

# Stacked bars per language, one segment per tag, each labelled with its
# share formatted as a percentage.
icc_by_pos_tag %>%
  ggplot(aes(x = lang, fill = POS, y = f)) +
  geom_col() +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) +
  theme_ids(base_size = 12) +
  geom_text(
    aes(label = sprintf("%.2f%%", 100 * f), y = f),
    position = position_stack(reverse = FALSE, vjust = 0.5),
    color = "white",
    size = 3.2,
    family = "Fira Sans Condensed"
  )
```
| 75 | |
| 76 | # Pilot study: Identification of Light Verb Constructions with *take* |
| 77 | |
| 78 | ```{r prepare_ca, output=FALSE, message=FALSE} |
| 79 | |
| 80 | |
| 81 | |
| 82 | |
| 83 | ``` |
| 84 | |
| 85 | ## English: *take* |
| 86 | |
```{r take_icc, echo=TRUE, message=FALSE}
# Collocation analysis for the lemma "take" in the English corpus: the focus
# is the verb, the context window is the one token to its right, and that
# token must be tagged NOUN.
take_ca_icc <-
  collocationAnalysis(
    icc_con("eng"),
    "focus({[ud/l=take]} [ud/p=NOUN])",
    leftContextSize = 0,
    rightContextSize = 1,
    minOccur = 2,
    addExamples = TRUE
  )

take_ca_icc %>% show_table()
```
| 100 | |
| 101 | ## German: *nehmen* |
| 102 | |
```{r nehmen_icc, echo=TRUE}
# Collocation analysis for the lemma "nehmen" in the German corpus: the focus
# is the verb, the context window is the one token to its left, and that
# token must be tagged NN.
nehmen_ca_icc <-
  collocationAnalysis(
    icc_con("ger"),
    "focus([tt/p=NN] {[tt/l=nehmen]})",
    leftContextSize = 1,
    rightContextSize = 0,
    minOccur = 2,
    addExamples = TRUE
  )

nehmen_ca_icc %>% show_table()
```
| 115 | |
| 116 | ### For comparison based on the whole DeReKo |
| 117 | |
```{r nehmen_dereko}
# Load the stored collocation result for "nehmen" from disk instead of
# querying, then render it with the same table helper as the chunks above.
nehmen_ca_dereko <- readRDS("../data/ca_nehmen_dereko_examples")
show_table(nehmen_ca_dereko)
```
| 122 | |
| 123 | ## Norwegian: *ta* |
| 124 | |
```{r ta_icc, echo=TRUE, message=FALSE, eval=FALSE}
# Collocation analysis for the lemma "ta" in the Norwegian corpus: the focus
# is the verb, the context window is the one token to its right, and that
# token must be tagged NOUN.
# NOTE(review): eval=FALSE keeps this chunk from being executed when knitting
# while still displaying the code as a proper code block. Remove the option
# once the Norwegian query is meant to run -- confirm with the author.
ta_ca_icc <-
  collocationAnalysis(
    icc_con("nor"),
    "focus({[ud/l=ta]} [ud/p=NOUN])",
    leftContextSize = 0,
    rightContextSize = 1,
    minOccur = 2,
    addExamples = TRUE
  )

ta_ca_icc %>% show_table()
```
| 137 | |