Add demo that plots plural gender variant frequencies over time
Change-Id: If20630ed7190f5a7744c165e50196efd48c83090
diff --git a/demo/pluralGenderVariants.R b/demo/pluralGenderVariants.R
new file mode 100644
index 0000000..2b06d95
--- /dev/null
+++ b/demo/pluralGenderVariants.R
@@ -0,0 +1,71 @@
+library(RKorAPClient)
+library(tidyverse)
+library(purrrlyr)
+
+# The challenge in searching gender variants with KorAP and DeReKo is that,
+# firstly, some characters used for gender marking, especially punctuation marks,
+# are interpreted and indexed as token boundaries and, secondly, punctuation
+# marks are currently not indexed in KorAP.
+#
+# The former is intentional with regard to a majority of use cases and with
+# regard to the reproducibility maxim (see Diewald/Kupietz/Lüngen 2022).
+# The latter is a shortcoming in KorAP that will be remedied sooner or later
+# and that can be solved provisionally in the meantime with the help of the KorAP API.
+#
+# The following unravelPunctuationGenderCases function, for example, takes the
+# result of a frequencyQuery for two supposedly consecutive tokens and then looks more
+# closely into the KWIC snippets to see which non-indexed strings actually do appear
+# between these tokens and counts the frequencies of the variants that occur.
+
+unravelPunctuationGenderCases <- function(df, suffix = "innen", kco = new("KorAPConnection", verbose=TRUE)) {
+ if ( nrow(df) > 1) {
+ df %>%
+ dplyr::filter(totalResults > 0 & str_detect(query, paste0(" ", suffix))) %>%
+ by_row(unravelPunctuationGenderCases, .collate = "rows", .labels=FALSE) %>%
+ select(-.row) %>%
+ bind_rows(df %>% dplyr::filter(totalResults == 0 | ! str_detect(query, paste0(" ", suffix)))) %>%
+ tidyr::complete(query, nesting(vc, total), fill = list(totalResults = 0)) %>%
+ select(-f, -conf.low, -conf.high) %>%
+ RKorAPClient::ci() %>%
+ mutate(query = str_replace_all(query, '(^"|"$|[\\[\\]\\\\])', '')) %>%
+ mutate(query = str_replace_all(query, paste0('\\(', suffix), paste0('(', suffix, ')'))) %>%
+ filter(!str_detect(query, paste0("\\w ", suffix))) # remove "Nutzer innen"
+ } else {
+ q <- corpusQuery(kco, df$query, vc=df$vc, metadataOnly = FALSE) %>%
+ fetchAll()
+ cases <- q@collectedMatches$snippet %>%
+ str_replace_all(paste0(".*<mark>.*\\w(\\W+)", suffix, "</mark>.*"), "\\1") %>%
+ as_tibble() %>%
+ group_by(value) %>%
+ summarise(n = n())
+ df %>% uncount(nrow(cases)) %>%
+ mutate(query = str_replace(query, paste0(" (?=", suffix, ")"), cases$value), totalResults = cases$n)
+ }
+}
+
+plotPluralGenderVariants <- function(word = "Nutzer",
+ years = c(1995:2022),
+ as.alternatives = FALSE,
+ vc = "referTo ratskorpus-2023-1 & pubDate in",
+ suffixes = c('Innen', '[\\*]innen"', '[_]innen"', ' innen'),
+ prefixes = c('', '"', '"', ''),
+ kco = new("KorAPConnection", verbose=TRUE) ) {
+ hc <-
+ frequencyQuery(kco, paste0(prefixes, word, suffixes), paste(vc, years), as.alternatives=as.alternatives) %>%
+ unravelPunctuationGenderCases(kco = kco) %>%
+ hc_freq_by_year_ci(as.alternatives)
+ print(hc)
+ hc
+}
+
+
+hc <- plotPluralGenderVariants("Nutzer", c(1995:2022), as.alternatives = FALSE)
+# htmlwidgets::saveWidget(hc, file=fname, selfcontained = TRUE)
+
+# Diewald, Nils/Kupietz, Marc/Lüngen, Harald (2022):
+# Tokenizing on scale. Preprocessing large text corpora on the lexical and sentence level.
+# In: Klosa-Kückelhaus, Annette/Engelberg, Stefan/Möhrs, Christine/Storjohann, Petra (eds):
+# Dictionaries and Society. Proceedings of the XX EURALEX International Congress, 12-16 July 2022.
+# Mannheim: IDS-Verlag, 2022: 208-221.
+# <https://doi.org/10.14618/ids-pub-11146>
+