Marc Kupietz | 6e21b10 | 2023-06-02 18:04:04 +0200 | [diff] [blame] | 1 | --- |
| 2 | title: "ICC Written Launch" |
| 3 | output: |
| 4 | html_document: |
| 5 | css: style.css |
| 6 | self_contained: yes |
| 7 | date: "`r Sys.Date()`" |
| 8 | --- |
| 9 | |
| 10 | ```{r setup, include=FALSE, echo=FALSE, warning=FALSE} |
# Global chunk defaults for the whole report: hide the source code and
# suppress warnings in the rendered output. The knitr chunk option is named
# `warning` (singular); `warnings = FALSE` is not a recognised option and
# therefore had no effect.
knitr::opts_chunk$set(echo = FALSE, warning = FALSE)
# Load the shared project helpers.
# NOTE(review): later chunks use names such as `icc`, `icc_con()` and
# `theme_ids()` that are presumably defined there -- confirm in common.R.
source("common.R")
| 13 | ``` |
| 14 | |
| 15 | # Actual composition of ICC parts |
| 16 | |
| 17 | ## Composition by ICC genre |
| 18 | |
| 19 | ```{r composition_by_genre, message = FALSE} |
# Token count per ICC part and genre: cross every row of `icc` with every
# genre, build the matching virtual-corpus query string, and read the
# `tokens` slot of the corpus statistics one row at a time.
icc_genre <- icc %>%
  expand_grid(genre) %>%
  mutate(vc = paste0("iccGenre=", genre)) %>%
  rowwise() %>%
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)

# Stacked bars, one per language, segmented by genre. Every non-empty segment
# is labelled with its token count, centred inside the segment.
ggplot(icc_genre, aes(x = lang, fill = genre, y = tokens)) +
  geom_col() +
  scale_y_continuous(
    labels = label_number(scale_cut = cut_short_scale())
  ) +
  theme_ids() +
  scale_fill_ids() +
  geom_text(
    aes(label = if_else(tokens > 0, as.character(tokens), ""), y = tokens),
    position = position_stack(reverse = FALSE, vjust = 0.5),
    color = "black",
    size = 3.2,
    family = "Fira Sans Condensed"
  )
Marc Kupietz | 6e21b10 | 2023-06-02 18:04:04 +0200 | [diff] [blame] | 31 | |
| 32 | ``` |
| 33 | |
| 34 | ## Composition by date of publication |
| 35 | |
| 36 | |
Marc Kupietz | 985e893 | 2023-06-07 17:48:59 +0200 | [diff] [blame] | 37 | ```{r composition_by_pubdate, message=F, warning=F} |
# Publication years queried for each ICC part.
# NOTE(review): the plot below restricts the x axis to 1990-2023, so the rows
# for 1986-1989 are queried but then dropped by `xlim()` -- confirm whether
# that is intended.
year <- 1986:2023

# Token count per ICC part and publication year: cross every row of `icc`
# with every year, build the matching virtual-corpus query string, and read
# the `tokens` slot of the corpus statistics one row at a time.
icc_year <- icc %>%
  expand_grid(year) %>%
  mutate(vc = paste0("pubDate in ", year)) %>%
  rowwise() %>%
  mutate(tokens = corpusStats(icc_con(lang, token), vc = vc)@tokens)

# One loess-smoothed, semi-transparent area per language over the years.
icc_year %>%
  ggplot(aes(x = year, fill = lang, color = lang, y = tokens)) +
  # Alternative layer: geom_smooth(se = FALSE, span = 0.25)
  xlim(1990, 2023) +
  stat_smooth(
    geom = "area", method = "loess", span = 1 / 4,
    alpha = 0.1
  ) +
  # Alternative layer: geom_area(alpha = 0.1, position = "identity")
  scale_fill_ids() +
  scale_colour_ids() +
  # ggplot2 keeps a single scale per aesthetic: a separate `ylim(0, NA)`
  # added before this call was replaced by it, so the lower limit of 0 was
  # lost. Setting the limit here keeps both the limit and the label format.
  scale_y_continuous(
    limits = c(0, NA),
    labels = label_number(scale_cut = cut_short_scale())
  ) +
  theme_ids()
| 57 | ``` |
| 58 | |
| 59 | ## Part-of-Speech proportions |
| 60 | |
| 61 | ```{r pos_proportions} |
# Part-of-speech tags queried on the `ud/p` layer. The tags PUNCT, SYM, INTJ,
# X, NUM and PART are excluded (they were commented out of this list).
POS_tag <- c(
  "ADJ", "ADP",
  "ADV", "AUX",
  "CCONJ",
  "NOUN", "DET",
  "PROPN",
  "VERB",
  "PRON",
  "SCONJ"
)

# One frequency query per (ICC part, POS tag) pair; `f` is the `f` column of
# the query result.
icc_by_pos_tag <- icc %>%
  expand_grid(POS = POS_tag) %>%
  rowwise() %>%
  mutate(
    f = frequencyQuery(icc_con(lang), sprintf("[ud/p=%s]", POS))$f
  )

# Stacked bars per language, segmented by POS tag; each segment is labelled
# with 100 * f formatted as a percentage with two decimals.
ggplot(icc_by_pos_tag, aes(x = lang, fill = POS, y = f)) +
  geom_col() +
  scale_y_continuous(
    labels = label_number(scale_cut = cut_short_scale())
  ) +
  scale_fill_ids() +
  scale_color_ids() +
  theme_ids(base_size = 12) +
  geom_text(
    aes(label = sprintf("%.2f%%", 100 * f), y = f),
    position = position_stack(reverse = FALSE, vjust = 0.5),
    color = "black",
    size = 3.2,
    family = "Fira Sans Condensed"
  )
Marc Kupietz | 6e21b10 | 2023-06-02 18:04:04 +0200 | [diff] [blame] | 83 | ``` |
| 84 | |
| 85 | # Pilot study: Identification of Light Verb Constructions with *take* |
| 86 | |
| 87 | ```{r prepare_ca, output=FALSE, message=FALSE} |
| 88 | |
| 89 | |
| 90 | |
| 91 | |
| 92 | ``` |
| 93 | |
| 94 | ## English: *take* |
| 95 | |
| 96 | ```{r take_icc, echo=TRUE, message=FALSE} |
# Collocation analysis for the lemma "take" followed by a noun, using the
# connection returned by icc_con("eng"); only the one token to the right of
# the focused verb is considered as context.
take_ca_icc <- collocationAnalysis(
  icc_con("eng"),
  "focus({[ud/l=take]} [ud/p=NOUN])",
  leftContextSize = 0,
  rightContextSize = 1,
  minOccur = 2,
  addExamples = TRUE
)

show_table(take_ca_icc)
| 108 | ``` |
| 109 | |
Marc Kupietz | fdac64b | 2023-06-13 08:32:37 +0200 | [diff] [blame] | 110 | ### For comparison based on English Wikipedia |
| 111 | #### (Snapshot from 2015 with 2.4 billion words, see [here](https://www.ids-mannheim.de/digspra/kl/projekte/korpora/verfuegbarkeit/)) |
| 112 | |
| 113 | ```{r take_wpe, echo=TRUE} |
# Use the stored collocation analysis when the file exists; otherwise run the
# analysis against the English KorAP instance.
# NOTE(review): a freshly computed result is not written back to
# "../data/take_ca_wpe", so it is recomputed on every run without that file.
take_ca_wpe_i <- if (file.exists("../data/take_ca_wpe")) {
  readRDS("../data/take_ca_wpe")
} else {
  wpe <- new(
    "KorAPConnection",
    "https://korap.ids-mannheim.de/instance/english",
    verbose = TRUE
  )
  collocationAnalysis(
    wpe,
    "focus({[tt/l=take]} [tt/p=NN])",
    leftContextSize = 0,
    rightContextSize = 1,
    ignoreCollocateCase = TRUE,
    minOccur = 5,
    addExamples = TRUE
  )
}

show_table(take_ca_wpe_i, max = 10000)
Marc Kupietz | fdac64b | 2023-06-13 08:32:37 +0200 | [diff] [blame] | 130 | ``` |
| 131 | |
Marc Kupietz | 6e21b10 | 2023-06-02 18:04:04 +0200 | [diff] [blame] | 132 | ## German: *nehmen* |
| 133 | |
| 134 | ```{r nehmen_icc, echo=TRUE} |
# Collocation analysis for the lemma "nehmen" preceded by a noun, using the
# connection returned by icc_con("ger"); only the one token to the left of
# the focused verb is considered as context.
nehmen_ca_icc <- collocationAnalysis(
  icc_con("ger"),
  "focus([tt/p=NN] {[tt/l=nehmen]})",
  leftContextSize = 1,
  rightContextSize = 0,
  minOccur = 2,
  addExamples = TRUE
)

show_table(nehmen_ca_icc)
| 145 | ``` |
| 146 | |
| 147 | ### For comparison based on the whole DeReKo |
| 148 | |
| 149 | ```{r nehmen_dereko} |
# Load the stored collocation analysis for "nehmen" and render it.
# NOTE(review): `show__full_table` is spelled with a double underscore;
# confirm that this matches the helper's actual name.
nehmen_ca_dereko <- readRDS("../data/ca_nehmen_dereko_examples")
show__full_table(nehmen_ca_dereko)
Marc Kupietz | 6e21b10 | 2023-06-02 18:04:04 +0200 | [diff] [blame] | 152 | ``` |
| 153 | |
| 154 | ## Norwegian: *ta* |
| 155 | |
Marc Kupietz | d75a179 | 2023-06-07 17:47:23 +0200 | [diff] [blame] | 156 | ```{r ta_icc, echo=T, message=FALSE} |
# Collocation analysis for the lemma "ta" followed by a noun, using the
# connection returned by icc_con("nor"); only the one token to the right of
# the focused verb is considered as context.
ta_ca_icc <- collocationAnalysis(
  icc_con("nor"),
  "focus({[ud/l=ta]} [ud/p=NOUN])",
  leftContextSize = 0,
  rightContextSize = 1,
  minOccur = 2,
  addExamples = TRUE
)

show_table(ta_ca_icc)
| 167 | ``` |
| 168 | |