blob: 2b06d952d78aa1807d88ee7d05ed393f961852d7 [file] [log] [blame]
Marc Kupietz6dd348c2023-05-03 08:26:58 +02001library(RKorAPClient)
2library(tidyverse)
3library(purrrlyr)
4
5# The challenge in searching gender variants with KorAP and DeReKo is that,
6# firstly, some characters used for gender marking, especially punctuation marks,
7# are interpreted and indexed as token boundaries and, secondly, punctuation
8# marks are currently not indexed in KorAP.
9#
10# The former is intentional with regard to a majority of use cases and with
11# regard to the reproducibility maxim (see Diewald/Kupietz/Lüngen 2022).
12# The latter is a shortcoming in KorAP that will be remedied sooner or later
13# and that can be solved provisionally in the meantime with the help of the KorAP API.
14#
15# The following unravelPunctuationGenderCases function, for example, takes the
16# result of a frequencyQuery for two supposedly consecutive tokens and then looks more
17# closely into the KWIC snippets to see which non-indexed strings actually do appear
18# between these tokens and counts the frequencies of the variants that occur.
19
# Resolve "<word> <suffix>" frequency rows into the variants that really occur.
#
# A query such as "Nutzer innen" matches two supposedly consecutive tokens; the
# characters between them are not indexed. For every such row with hits, the
# KWIC snippets are fetched and the non-word strings found directly before the
# suffix are counted, yielding one row per variant.
#
# Arguments:
#   df      Data frame in the shape produced by frequencyQuery(); the columns
#           query, vc, total, totalResults, f, conf.low and conf.high are used.
#   suffix  Suffix token; it is pasted into regular expressions unescaped.
#   kco     KorAPConnection used to fetch the snippets.
#
# Returns:
#   For more than one input row: the unravelled rows plus the untouched rows,
#   completed to all query / (vc, total) combinations (missing totalResults
#   filled with 0), with f and the confidence bounds recomputed by
#   RKorAPClient::ci() and the query strings stripped of quoting characters.
#   For a single row (the per-row recursion step): that row replicated once
#   per variant found, with query and totalResults set per variant.
unravelPunctuationGenderCases <- function(df, suffix = "innen", kco = new("KorAPConnection", verbose=TRUE)) {
  if (nrow(df) > 1) {
    spaced_suffix <- paste0(" ", suffix)
    df %>%
      dplyr::filter(totalResults > 0 & str_detect(query, spaced_suffix)) %>%
      # Forward suffix and kco explicitly: by_row() only hands the row slice to
      # the function, so without this every per-row call would fall back to the
      # defaults (suffix "innen" and a brand-new default connection per row).
      by_row(unravelPunctuationGenderCases, suffix = suffix, kco = kco,
             .collate = "rows", .labels = FALSE) %>%
      select(-.row) %>%
      bind_rows(df %>% dplyr::filter(totalResults == 0 | !str_detect(query, spaced_suffix))) %>%
      tidyr::complete(query, nesting(vc, total), fill = list(totalResults = 0)) %>%
      # The old relative frequencies and confidence bounds no longer fit the
      # changed totalResults; drop them and let ci() compute them again.
      select(-f, -conf.low, -conf.high) %>%
      RKorAPClient::ci() %>%
      # Strip a leading/trailing double quote, square brackets and backslashes.
      mutate(query = str_replace_all(query, '(^"|"$|[\\[\\]\\\\])', '')) %>%
      # Close an opening parenthesis in front of the suffix: "(innen" -> "(innen)".
      mutate(query = str_replace_all(query, paste0('\\(', suffix), paste0('(', suffix, ')'))) %>%
      dplyr::filter(!str_detect(query, paste0("\\w ", suffix))) # remove "Nutzer innen"
  } else {
    # metadataOnly = FALSE so that the snippets are part of the collected matches.
    q <- corpusQuery(kco, df$query, vc = df$vc, metadataOnly = FALSE) %>%
      fetchAll()
    # Reduce each snippet to the non-word characters standing between the last
    # word character and the suffix at the end of the marked match, then count
    # how often each of these strings occurs.
    cases <- q@collectedMatches$snippet %>%
      str_replace_all(paste0(".*<mark>.*\\w(\\W+)", suffix, "</mark>.*"), "\\1") %>%
      as_tibble() %>%
      group_by(value) %>%
      summarise(n = n())
    # One output row per variant: the blank before the suffix in the query is
    # replaced by the string actually found, totalResults by its count.
    df %>%
      uncount(nrow(cases)) %>%
      mutate(query = str_replace(query, paste0(" (?=", suffix, ")"), cases$value),
             totalResults = cases$n)
  }
}
45
# Plot the yearly frequencies of the plural gender variants of a word.
#
# Arguments:
#   word             Base word the prefixes and suffixes are pasted around.
#   years            Years; each one is appended to vc to form one virtual corpus.
#   as.alternatives  Passed on to frequencyQuery() and hc_freq_by_year_ci().
#   vc               Virtual corpus definition that is completed by a year.
#   suffixes         Query fragments pasted after the word, one per variant.
#   prefixes         Query fragments pasted before the word, parallel to suffixes.
#   kco              KorAPConnection used for all queries.
#
# Returns the object produced by hc_freq_by_year_ci(), after printing it.
plotPluralGenderVariants <- function(word = "Nutzer",
                                     years = c(1995:2022),
                                     as.alternatives = FALSE,
                                     vc = "referTo ratskorpus-2023-1 & pubDate in",
                                     suffixes = c('Innen', '[\\*]innen"', '[_]innen"', ' innen'),
                                     prefixes = c('', '"', '"', ''),
                                     kco = new("KorAPConnection", verbose=TRUE) ) {
  # One query string per variant and one virtual corpus per year.
  variant_queries <- paste0(prefixes, word, suffixes)
  yearly_corpora <- paste(vc, years)

  frequencies <- frequencyQuery(kco, variant_queries, yearly_corpora,
                                as.alternatives = as.alternatives)
  # Split the "<word> innen" rows into the punctuation variants that occur.
  unravelled <- unravelPunctuationGenderCases(frequencies, kco = kco)
  chart <- hc_freq_by_year_ci(unravelled, as.alternatives)

  print(chart)
  chart
}
60
61
# Example run: plural gender variants of "Nutzer" for the years 1995 to 2022.
hc <- plotPluralGenderVariants(
  word = "Nutzer",
  years = c(1995:2022),
  as.alternatives = FALSE
)
# htmlwidgets::saveWidget(hc, file=fname, selfcontained = TRUE)
64
65# Diewald, Nils/Kupietz, Marc/Lüngen, Harald (2022):
66# Tokenizing on scale. Preprocessing large text corpora on the lexical and sentence level.
67# In: Klosa-Kückelhaus, Annette/Engelberg, Stefan/Möhrs, Christine/Storjohann, Petra (eds):
68# Dictionaries and Society. Proceedings of the XX EURALEX International Congress, 12-16 July 2022.
69# Mannheim: IDS-Verlag, 2022: 208-221.
70# <https://doi.org/10.14618/ids-pub-11146>
71