Marc Kupietz | 6dd348c | 2023-05-03 08:26:58 +0200 | [diff] [blame^] | 1 | library(RKorAPClient) |
| 2 | library(tidyverse) |
| 3 | library(purrrlyr) |
| 4 | |
| 5 | # The challenge in searching gender variants with KorAP and DeReKo is that, |
| 6 | # firstly, some characters used for gender marking, especially punctuation marks, |
| 7 | # are interpreted and indexed as token boundaries and, secondly, punctuation |
| 8 | # marks are currently not indexed in KorAP. |
| 9 | # |
| 10 | # The former is intentional with regard to a majority of use cases and with |
| 11 | # regard to the reproducibility maxim (see Diewald/Kupietz/Lüngen 2022). |
| 12 | # The latter is a shortcoming in KorAP that will be remedied sooner or later |
| 13 | # and that can be solved provisionally in the meantime with the help of the KorAP API. |
| 14 | # |
| 15 | # The following unravelPunctuationGenderCases function, for example, takes the |
| 16 | # result of a frequencyQuery for two supposedly consecutive tokens and then looks more |
| 17 | # closely into the KWIC snippets to see which non-indexed strings actually do appear |
| 18 | # between these tokens and counts the frequencies of the variants that occur. |
| 19 | |
# Split frequency counts for two-token queries such as "Nutzer innen" into
# the variants that actually occur between the two tokens in the KWIC
# snippets (the non-indexed characters are recovered from the snippets).
#
# Arguments:
#   df      Data frame as returned by frequencyQuery(); the columns query,
#           vc, total, totalResults, f, conf.low and conf.high are used.
#   suffix  Second token of the two-token queries. Rows whose query contains
#           a blank followed by this suffix are the ones that get unravelled.
#   kco     KorAPConnection used for fetching the snippets.
#
# Returns a data frame in which every unravelled row is replaced by one row
# per variant found, completed with zero counts for every query/vc
# combination and with recomputed confidence intervals.
unravelPunctuationGenderCases <- function(df, suffix = "innen", kco = new("KorAPConnection", verbose=TRUE)) {
  if (nrow(df) > 1) {
    spaced_suffix <- paste0(" ", suffix)
    df %>%
      dplyr::filter(totalResults > 0 & str_detect(query, spaced_suffix)) %>%
      # Recurse row by row. suffix and kco must be forwarded explicitly;
      # otherwise every per-row call falls back to the defaults, i.e. ignores
      # a custom suffix and opens a new connection for each row.
      by_row(unravelPunctuationGenderCases, suffix = suffix, kco = kco,
             .collate = "rows", .labels = FALSE) %>%
      select(-.row) %>%
      # Re-attach the rows that did not need unravelling.
      bind_rows(df %>% dplyr::filter(totalResults == 0 | !str_detect(query, spaced_suffix))) %>%
      # Make sure every query is present for every vc, with zero hits if absent.
      tidyr::complete(query, nesting(vc, total), fill = list(totalResults = 0)) %>%
      # Frequencies and confidence intervals are stale after splitting;
      # drop and recompute them.
      select(-f, -conf.low, -conf.high) %>%
      RKorAPClient::ci() %>%
      # Strip query syntax (surrounding quotes, brackets, backslashes).
      mutate(query = str_replace_all(query, '(^"|"$|[\\[\\]\\\\])', '')) %>%
      # Close the parenthesis of variants like "Nutzer(innen".
      mutate(query = str_replace_all(query, paste0('\\(', suffix), paste0('(', suffix, ')'))) %>%
      dplyr::filter(!str_detect(query, paste0("\\w ", suffix))) # remove "Nutzer innen"
  } else {
    # Single row: fetch all matches including their snippets.
    q <- corpusQuery(kco, df$query, vc = df$vc, metadataOnly = FALSE) %>%
      fetchAll()
    # Reduce each snippet to the non-word characters directly preceding the
    # suffix inside the marked match and count the distinct variants.
    # NOTE(review): a snippet that does not match the pattern is kept
    # unchanged as its own "variant" -- confirm this cannot happen in practice.
    cases <- q@collectedMatches$snippet %>%
      str_replace_all(paste0(".*<mark>.*\\w(\\W+)", suffix, "</mark>.*"), "\\1") %>%
      as_tibble() %>%
      dplyr::count(value, name = "n")
    # One output row per variant: put the variant in place of the blank
    # before the suffix and use the variant's count as totalResults.
    df %>%
      uncount(nrow(cases)) %>%
      mutate(
        query = str_replace(query, paste0(" (?=", suffix, ")"), cases$value),
        totalResults = cases$n
      )
  }
}
| 45 | |
# Query the frequencies of the plural gender variants of `word` per year,
# unravel the two-token variants via unravelPunctuationGenderCases() and
# plot the result with hc_freq_by_year_ci().
#
# Arguments:
#   word             Base word the prefixes and suffixes are attached to.
#   years            Years; each one is appended to `vc` to form a virtual
#                    corpus definition.
#   as.alternatives  Passed on to frequencyQuery() and hc_freq_by_year_ci().
#   vc               Virtual corpus definition prefix that a year is pasted to.
#   suffixes         Query fragments appended to `word`, one per variant.
#   prefixes         Query fragments prepended to `word`, parallel to `suffixes`.
#   kco              KorAPConnection used for all requests.
#
# The resulting plot object is printed and also returned.
plotPluralGenderVariants <- function(word = "Nutzer",
                                     years = c(1995:2022),
                                     as.alternatives = FALSE,
                                     vc = "referTo ratskorpus-2023-1 & pubDate in",
                                     suffixes = c('Innen', '[\\*]innen"', '[_]innen"', ' innen'),
                                     prefixes = c('', '"', '"', ''),
                                     kco = new("KorAPConnection", verbose=TRUE) ) {
  variant_queries <- paste0(prefixes, word, suffixes)
  yearly_corpora <- paste(vc, years)

  raw_frequencies <- frequencyQuery(
    kco,
    variant_queries,
    yearly_corpora,
    as.alternatives = as.alternatives
  )
  variant_frequencies <- unravelPunctuationGenderCases(raw_frequencies, kco = kco)

  hc <- hc_freq_by_year_ci(variant_frequencies, as.alternatives)
  print(hc)
  hc
}
| 60 | |
| 61 | |
# Example run: plot the plural gender variants of "Nutzer" for 1995 to 2022.
hc <- plotPluralGenderVariants(
  word = "Nutzer",
  years = 1995:2022,
  as.alternatives = FALSE
)
# To save the widget to a file, define `fname` first and uncomment:
# htmlwidgets::saveWidget(hc, file=fname, selfcontained = TRUE)
| 64 | |
| 65 | # Diewald, Nils/Kupietz, Marc/Lüngen, Harald (2022): |
| 66 | # Tokenizing on scale. Preprocessing large text corpora on the lexical and sentence level. |
| 67 | # In: Klosa-Kückelhaus, Annette/Engelberg, Stefan/Möhrs, Christine/Storjohann, Petra (eds): |
| 68 | # Dictionaries and Society. Proceedings of the XX EURALEX International Congress, 12-16 July 2022. |
| 69 | # Mannheim: IDS-Verlag, 2022: 208-221. |
| 70 | # <https://doi.org/10.14618/ids-pub-11146> |
| 71 | |