Add demo that plots plural gender variant frequencies over time Change-Id: If20630ed7190f5a7744c165e50196efd48c83090

commit: 6dd348cea71b871a8e4791b838492e91b2db0f53 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Wed May 03 08:26:58 2023 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Wed May 10 14:55:59 2023 +0200
tree: cea692a0cdcafc0180fdf4e23105dca9278f935a
parent: ef0d172ec8c1df67109ee10e5fe1258bd9f2c98c [diff] [blame]
diff --git a/demo/pluralGenderVariants.R b/demo/pluralGenderVariants.R
new file mode 100644
index 0000000..2b06d95
--- /dev/null
+++ b/demo/pluralGenderVariants.R

@@ -0,0 +1,71 @@
+library(RKorAPClient)
+library(tidyverse)
+library(purrrlyr)
+
+# The challenge in searching gender variants with KorAP and DeReKo is that,
+# firstly, some characters used for gender marking, especially punctuation marks,
+# are interpreted and indexed as token boundaries and, secondly, punctuation
+# marks are currently not indexed in KorAP.
+#
+# The former is intentional with regard to a majority of use cases and with
+# regard to the reproducibility maxim (see Diewald/Kupietz/Lüngen 2022).
+# The latter is a shortcoming in KorAP that will be remedied sooner or later
+# and that can be solved provisionally in the meantime with the help of the KorAP API.
+#
+# The following unravelPunctuationGenderCases function, for example, takes the
+# result of a frequencyQuery for two supposedly consecutive tokens and then looks more
+# closely into the KWIC snippets to see which non-indexed strings actually do appear
+# between these tokens and counts the frequencies of the variants that occur.
+
+# Splits frequency-query rows of the form "<word> <suffix>" into the variants
+# that actually occur between the word and the suffix in the KWIC snippets.
+#
+# Arguments:
+#   df      result of a frequencyQuery; the code below reads the columns
+#           query, vc, total, totalResults, f, conf.low and conf.high.
+#   suffix  suffix expected after the non-indexed gender marker.
+#   kco     KorAPConnection used to fetch the snippets.
+#
+# With more than one row, every row that has hits and whose query contains
+# " <suffix>" is processed individually (recursive single-row call); the other
+# rows are kept as they are, missing query/vc combinations are filled with
+# totalResults = 0 and the frequency columns are recomputed via ci().
+# With a single row, the matches are fetched and the row is replaced by one
+# row per distinct string found before the suffix, with its count.
+unravelPunctuationGenderCases <- function(df, suffix = "innen", kco = new("KorAPConnection", verbose=TRUE)) {
+  if (nrow(df) > 1) {
+    spaced_suffix <- paste0(" ", suffix)
+    df %>%
+      dplyr::filter(totalResults > 0 & str_detect(query, spaced_suffix)) %>%
+      # Forward suffix and kco explicitly: without them every per-row call
+      # falls back to the defaults, i.e. ignores a caller-supplied suffix and
+      # opens a new KorAPConnection instead of reusing the given one.
+      by_row(
+        unravelPunctuationGenderCases,
+        suffix = suffix,
+        kco = kco,
+        .collate = "rows",
+        .labels = FALSE
+      ) %>%
+      # any_of() keeps this step from failing when no row qualified above and
+      # the .row index column is therefore absent.
+      select(-any_of(".row")) %>%
+      bind_rows(df %>% dplyr::filter(totalResults == 0 | ! str_detect(query, spaced_suffix))) %>%
+      tidyr::complete(query, nesting(vc, total), fill = list(totalResults = 0))  %>%
+      # Drop the stale frequency columns; ci() is called to compute them anew.
+      select(-f, -conf.low, -conf.high) %>%
+      RKorAPClient::ci() %>%
+      # Strip surrounding quotes, square brackets and backslashes from the queries.
+      mutate(query = str_replace_all(query, '(^"|"$|[\\[\\]\\\\])', '')) %>%
+      # Close an opening parenthesis variant: "(innen" becomes "(innen)".
+      mutate(query = str_replace_all(query, paste0('\\(', suffix), paste0('(', suffix, ')'))) %>%
+      dplyr::filter(!str_detect(query, paste0("\\w ", suffix))) # remove "Nutzer innen"
+  } else {
+    q <- corpusQuery(kco, df$query, vc=df$vc, metadataOnly = FALSE) %>%
+      fetchAll()
+    # Reduce every snippet to the non-word characters that directly precede
+    # the suffix inside the marked match, then count each distinct string.
+    cases <- q@collectedMatches$snippet %>%
+      str_replace_all(paste0(".*<mark>.*\\w(\\W+)", suffix, "</mark>.*"), "\\1") %>%
+      as_tibble() %>%
+      group_by(value) %>%
+      summarise(n = n())
+    # One output row per variant: the space before the suffix in the query is
+    # replaced by the observed string, totalResults by its count.
+    df %>% uncount(nrow(cases)) %>%
+      mutate(query = str_replace(query, paste0(" (?=", suffix, ")"), cases$value), totalResults = cases$n)
+  }
+}
+
+# Queries the frequencies of the plural gender variants of `word` per year,
+# unravels the punctuation-marked variants and charts the result.
+#
+# The i-th query is prefixes[i] + word + suffixes[i]; the virtual corpus for
+# each year is `vc` followed by that year. The chart object returned by
+# hc_freq_by_year_ci() is printed and also returned.
+plotPluralGenderVariants <- function(word = "Nutzer",
+                          years = c(1995:2022),
+                          as.alternatives = FALSE,
+                          vc = "referTo ratskorpus-2023-1 & pubDate in",
+                          suffixes = c('Innen', '[\\*]innen"', '[_]innen"', ' innen'),
+                          prefixes = c('',      '"',            '"',        ''),
+                          kco = new("KorAPConnection", verbose=TRUE) ) {
+  variant_queries <- paste0(prefixes, word, suffixes)
+  yearly_vcs <- paste(vc, years)
+  frequencies <- frequencyQuery(kco, variant_queries, yearly_vcs, as.alternatives=as.alternatives)
+  unravelled <- unravelPunctuationGenderCases(frequencies, kco = kco)
+  hc <- hc_freq_by_year_ci(unravelled, as.alternatives)
+  print(hc)
+  hc
+}
+
+
+# Run the demo for "Nutzer" over the years 1995-2022; the returned chart
+# object is kept in hc so that it can be reused below.
+hc <- plotPluralGenderVariants("Nutzer", c(1995:2022), as.alternatives = FALSE)
+# Optional: save the chart as an HTML file. Note that fname is not defined in
+# this script and has to be set before uncommenting the next line.
+# htmlwidgets::saveWidget(hc, file=fname, selfcontained = TRUE)
+
+# Diewald, Nils/Kupietz, Marc/Lüngen, Harald (2022):
+# Tokenizing on scale. Preprocessing large text corpora on the lexical and sentence level.
+# In: Klosa-Kückelhaus, Annette/Engelberg, Stefan/Möhrs, Christine/Storjohann, Petra (eds):
+# Dictionaries and Society. Proceedings of the XX EURALEX International Congress, 12-16 July 2022.
+# Mannheim: IDS-Verlag, 2022: 208-221.
+# <https://doi.org/10.14618/ids-pub-11146>
+
commit	6dd348cea71b871a8e4791b838492e91b2db0f53	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Wed May 03 08:26:58 2023 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Wed May 10 14:55:59 2023 +0200
tree	cea692a0cdcafc0180fdf4e23105dca9278f935a
parent	ef0d172ec8c1df67109ee10e5fe1258bd9f2c98c [diff] [blame]