# demo/pluralGenderVariants.R - KorAP/RKorAPClient - Gitiles

 library(RKorAPClient)
 library(tidyverse)
 library(purrrlyr)
 library(httr2)
 library(httpuv)
 library(RMariaDB)
 library(tidyfst)

 # Client id of the demo application; it is the default `app_id` of
 # oauthorizeDemo().
 demo_kor_app_id <- "773NHGM76N7P9b6rLfmpM4"

 # Evaluate the dispersion helper script directly from its URL (presumably the
 # origin of dispersions2(), which sourceDispersions() calls).
 # NOTE(review): this executes remote code on every run of the script.
 source(file = "https://www.stgries.info/research/dispersion/dispersions.r")

 # The challenge in searching gender variants with KorAP and DeReKo is that,
 # firstly, some characters used for gender marking, especially punctuation marks,
 # are interpreted and indexed as token boundaries and, secondly, punctuation
 # marks are currently not indexed in KorAP.
 #
 # The former is intentional with regard to a majority of use cases and with
 # regard to the reproducibility maxim (see Diewald/Kupietz/L\u00FCngen 2022).
 # The latter is a shortcoming in KorAP that will be remedied sooner or later
 # and that can be solved provisionally in the meantime with the help of the KorAP API.
 #
 # The following unravelPunctuationGenderCases function, for example, takes the
 # result of a frequencyQuery for two supposedly consecutive tokens and then looks more
 # closely into the KWIC snippets to see which non-indexed strings actually do appear
 # between these tokens and counts the frequencies of the variants that occur.

 # Split "<word> <suffix>" hits into the gender-marked spellings actually found.
 #
 # df:     frequency table (as produced by frequencyQuery() in this script); the
 #         code relies on the columns query, vc, total, totalResults, f,
 #         conf.low and conf.high.
 # suffix: the token expected right after the word stem (default "innen").
 # kco:    connection object handed to corpusQuery().
 #
 # With more than one row, every row that has hits and whose query contains
 # " <suffix>" is processed on its own by the single-row branch; all other rows
 # are kept unchanged. Missing query/vc combinations are then filled with
 # totalResults = 0, the columns f/conf.low/conf.high are dropped and the table
 # is passed to RKorAPClient::ci(), and the query strings are cleaned up.
 #
 # Otherwise (a single row), the matches are fetched and the non-word characters
 # found between the stem and the suffix inside the <mark>ed snippet are
 # counted; the row is repeated once per distinct separator, with the separator
 # inserted into the query and totalResults set to its count.
 unravelPunctuationGenderCases <- function(df, suffix = "innen", kco) {
   spaced_suffix <- paste0(" ", suffix)
   if (nrow(df) > 1) {
     df %>%
       dplyr::filter(totalResults > 0 & str_detect(query, spaced_suffix)) %>%
       # `suffix` has to be forwarded explicitly; otherwise the per-row calls
       # would silently fall back to the default "innen".
       by_row(unravelPunctuationGenderCases, suffix = suffix, kco = kco,
              .collate = "rows", .labels = FALSE) %>%
       select(-.row) %>%
       bind_rows(df %>% dplyr::filter(totalResults == 0 | !str_detect(query, spaced_suffix))) %>%
       tidyr::complete(query, nesting(vc, total), fill = list(totalResults = 0)) %>%
       select(-f, -conf.low, -conf.high) %>%
       RKorAPClient::ci() %>%
       # strip surrounding quotes, square brackets and backslashes from the query
       mutate(query = str_replace_all(query, '(^"|"$|[\\[\\]\\\\])', '')) %>%
       # close an opening parenthesis variant: "(innen" becomes "(innen)"
       mutate(query = str_replace_all(query, paste0('\\(', suffix), paste0('(', suffix, ')'))) %>%
       dplyr::filter(!str_detect(query, paste0("\\w ", suffix))) # remove "Nutzer innen"
   } else {
     q <- corpusQuery(kco, df$query, vc = df$vc, metadataOnly = FALSE) %>%
       fetchAll()
     # Reduce every snippet to the non-word characters that precede the suffix
     # inside the marked match, then count each distinct separator.
     cases <- q@collectedMatches$snippet %>%
       str_replace_all(paste0(".*<mark>.*\\w(\\W+)", suffix, "</mark>.*"), "\\1") %>%
       as_tibble() %>%
       group_by(value) %>%
       summarise(n = n())
     # One output row per separator: put it in place of the blank before the suffix.
     df %>% uncount(nrow(cases)) %>%
       mutate(query = str_replace(query, paste0(" (?=", suffix, ")"), cases$value), totalResults = cases$n)
   }
 }

 # Make sure the connection carries an access token.
 #
 # kco:    connection object with the slots accessToken, welcome, apiUrl and
 #         KorAPUrl.
 # app_id: OAuth client id (defaults to the demo application id).
 #
 # When both `accessToken` and `welcome` are non-NULL the connection is returned
 # untouched. Otherwise an authorization-code flow is started and the resulting
 # access token is stored in the returned connection.
 oauthorizeDemo <- function(kco, app_id = demo_kor_app_id) {
   token_usable <- !is.null(kco@accessToken) && !is.null(kco@welcome)
   if (token_usable) {
     return(kco)
   }

   # access token is not set or invalid: request one for the demo application
   demo_client <- oauth_client(
     id = app_id,
     token_url = paste0(kco@apiUrl, "oauth2/token")
   )
   granted <- oauth_flow_auth_code(
     demo_client,
     scope = "search match_info",
     auth_url = paste0(kco@KorAPUrl, "settings/oauth/authorize")
   )
   kco@accessToken <- granted$access_token
   kco
 }

 # Query the frequencies of the plural gender variants of `word` per year and
 # chart them.
 #
 # word:            word stem that prefixes/suffixes are pasted around.
 # years:           years appended (space separated) to `vc`, one virtual
 #                  corpus per year.
 # as.alternatives: passed on to frequencyQuery() and hc_freq_by_year_ci().
 # vc:              virtual corpus definition prefix.
 # suffixes/prefixes: parallel vectors forming one query per variant.
 # kco:             connection; by default a new, authorized KorAPConnection.
 #
 # Prints the chart object and also returns it.
 plotPluralGenderVariants <- function(word = "Nutzer",
                           years = c(1995:2022),
                           as.alternatives = FALSE,
                           vc = "referTo ratskorpus-2023-1 & pubDate in",
                           suffixes = c('Innen', '[\\*]innen"', '[_]innen"', ' innen'),
                           prefixes = c('',      '"',            '"',        ''),
                           kco = new("KorAPConnection", verbose=TRUE) %>% oauthorizeDemo()) {
   variant_queries <- paste0(prefixes, word, suffixes)
   yearly_vcs <- paste(vc, years)

   frequencies <- frequencyQuery(kco, variant_queries, yearly_vcs, as.alternatives = as.alternatives)
   resolved <- unravelPunctuationGenderCases(frequencies, kco = kco)
   hc <- hc_freq_by_year_ci(resolved, as.alternatives)

   print(hc)
   hc
 }


 # Run the demo for "Nutzer" over 1995-2022; the chart object is printed by the
 # function and kept in `hc`.
 hc <- plotPluralGenderVariants(word = "Nutzer", years = c(1995:2022), as.alternatives = FALSE)

 # Fetch the source titles from the `basename` table of the `corpora` database.
 #
 # Returns the `title` column of all rows selected by `WHERE basename.rsr`.
 # The connection and the result set are released even when a query fails.
 getOKKSourceTitles <- function() {
   db <- dbConnect(MariaDB(), host="klinux10.ids-mannheim.de", user="viewer", dbname="corpora")
   # registered right after connecting so that no later error can leak the connection
   on.exit(dbDisconnect(db), add = TRUE)
   dbExecute(db, "SET NAMES 'utf8'")
   rs <- dbSendQuery(db, "SELECT title from basename WHERE basename.rsr")
   # clear the result set whether or not fetching succeeds
   corpus_parts <- tryCatch(dbFetch(rs), finally = dbClearResult(rs))
   corpus_parts$title
 }

 # Compute per-source frequencies of the gender variants of `word` and a table
 # of dispersion measures derived from them.
 #
 # word:            word stem that prefixes/suffixes are pasted around.
 # sourceTitles:    corpus titles; each one is wrapped as `<vc><title>"` to form
 #                  one virtual corpus per source.
 # as.alternatives: passed on to frequencyQuery().
 # vc:              virtual corpus definition prefix ending in an opening quote.
 # suffixes/prefixes: parallel vectors forming one query per variant.
 # kco:             connection; by default a new, authorized KorAPConnection.
 #
 # Returns an unnamed list of two elements:
 #   [[1]] the frequency table (columns renamed/added: Variante, Quelle),
 #         restricted to sources with total > 0;
 #   [[2]] one row per dispersion measure (column `measure`) plus an
 #         "Avg. Rank" row, with one column per variant.
 sourceDispersions <- function(word = "Bürger",
                               sourceTitles = getOKKSourceTitles(),
                               as.alternatives = FALSE,
                               vc = 'referTo ratskorpus-2023-1 & corpusTitle="',
                               suffixes = c('Innen', '[\\*]innen"', '[_]innen"', ' innen'),
                               prefixes = c('',      '"',            '"',        ''),
                               kco = new("KorAPConnection", verbose=TRUE) %>% oauthorizeDemo()) {
   df <-
     frequencyQuery(kco, paste0(prefixes, word, suffixes), paste0(vc, sourceTitles, '"'), as.alternatives=as.alternatives) %>%
     unravelPunctuationGenderCases(kco = kco) %>%
     # source name = what is left of vc after removing everything up to `="` and the closing quote
     mutate(Quelle=str_replace_all(.$vc, '(.*="|"$)', '')) %>%
     ipm() %>%
     filter(total > 0) %>%
     rename(Variante=query)

   dispersions <- df %>%
     group_by(Variante) %>%
     # relative size of each source within the variant's group
     mutate(total_size=sum(total), rel_size=total/total_size) %>%
     # dispersions2() is not defined in this file (presumably from the sourced dispersions.r)
     group_modify(~ as_tibble(dispersions2(.x$totalResults, .x$rel_size))) %>%
     pivot_longer(cols= -1) %>%
     filter(! str_detect(name, " equally")) %>%
     # drop parenthesised remarks from the measure names
     mutate(name=str_replace(name, " ?\\(.*\\)", '')) %>%
     mutate_when(str_detect(name, "DPnorm"), name = "1-DP_norm", value = 1 -value) %>%
     # The original wrote `mutate(across(where(is.double)), value = ...)`: the
     # function-less across() was a deprecated no-op, only `value` was rounded.
     mutate(value = round(value, 2)) %>%
     mutate_when(str_detect(name, "(corpus|range)"), value = as.integer(value)) %>%
     group_by(name) %>%
     # rank variants within each measure: ascending for "Kullback", descending otherwise
     mutate(rank = if_else(str_detect(name, "Kullback"), rank(value), rank(-value))) %>%
     group_by(Variante) %>%
     #    mutate(rank = mean(rank, na.rm = FALSE)) %>%
     bind_rows(summarise(., name="Avg. Rank", value = mean(rank, na.rm = FALSE))) %>%
     select(-rank) %>%
     pivot_wider(names_from = Variante) %>%
     rename(measure=name)

   list(df, dispersions)
 }

 # Run the dispersion analysis for "Nutzer" and open the measures table (second
 # list element) in the data viewer.
 df_dispersions <- sourceDispersions(word = "Nutzer")
 View(df_dispersions[[2]])

 # htmlwidgets::saveWidget(hc, file=fname, selfcontained = TRUE)

 # Diewald, Nils/Kupietz, Marc/L\u00FCngen, Harald (2022):
 # Tokenizing on scale. Preprocessing large text corpora on the lexical and sentence level.
 # In: Klosa-K\u00FCckelhaus, Annette/Engelberg, Stefan/M\u00F6hrs, Christine/Storjohann, Petra (eds):
 # Dictionaries and Society. Proceedings of the XX EURALEX International Congress, 12-16 July 2022.
 # Mannheim: IDS-Verlag, 2022: 208-221.
 # <https://doi.org/10.14618/ids-pub-11146>
	library(RKorAPClient)
	library(tidyverse)
	library(purrrlyr)
	library(httr2)
	library(httpuv)
	library(RMariaDB)
	library(tidyfst)

	# Client id of the demo application; it is the default `app_id` of
	# oauthorizeDemo().
	demo_kor_app_id <- "773NHGM76N7P9b6rLfmpM4"

	# Evaluate the dispersion helper script directly from its URL (presumably the
	# origin of dispersions2(), which sourceDispersions() calls).
	# NOTE(review): this executes remote code on every run of the script.
	source(file = "https://www.stgries.info/research/dispersion/dispersions.r")

	# The challenge in searching gender variants with KorAP and DeReKo is that,
	# firstly, some characters used for gender marking, especially punctuation marks,
	# are interpreted and indexed as token boundaries and, secondly, punctuation
	# marks are currently not indexed in KorAP.
	#
	# The former is intentional with regard to a majority of use cases and with
	# regard to the reproducibility maxim (see Diewald/Kupietz/L\u00FCngen 2022).
	# The latter is a shortcoming in KorAP that will be remedied sooner or later
	# and that can be solved provisionally in the meantime with the help of the KorAP API.
	#
	# The following unravelPunctuationGenderCases function, for example, takes the
	# result of a frequencyQuery for two supposedly consecutive tokens and then looks more
	# closely into the KWIC snippets to see which non-indexed strings actually do appear
	# between these tokens and counts the frequencies of the variants that occur.

	# Split "<word> <suffix>" hits into the gender-marked spellings actually found.
	#
	# df:     frequency table (as produced by frequencyQuery() in this script); the
	#         code relies on the columns query, vc, total, totalResults, f,
	#         conf.low and conf.high.
	# suffix: the token expected right after the word stem (default "innen").
	# kco:    connection object handed to corpusQuery().
	#
	# With more than one row, every row that has hits and whose query contains
	# " <suffix>" is processed on its own by the single-row branch; all other rows
	# are kept unchanged. Missing query/vc combinations are then filled with
	# totalResults = 0, the columns f/conf.low/conf.high are dropped and the table
	# is passed to RKorAPClient::ci(), and the query strings are cleaned up.
	#
	# Otherwise (a single row), the matches are fetched and the non-word characters
	# found between the stem and the suffix inside the <mark>ed snippet are
	# counted; the row is repeated once per distinct separator, with the separator
	# inserted into the query and totalResults set to its count.
	unravelPunctuationGenderCases <- function(df, suffix = "innen", kco) {
	  spaced_suffix <- paste0(" ", suffix)
	  if (nrow(df) > 1) {
	    df %>%
	      dplyr::filter(totalResults > 0 & str_detect(query, spaced_suffix)) %>%
	      # `suffix` has to be forwarded explicitly; otherwise the per-row calls
	      # would silently fall back to the default "innen".
	      by_row(unravelPunctuationGenderCases, suffix = suffix, kco = kco,
	             .collate = "rows", .labels = FALSE) %>%
	      select(-.row) %>%
	      bind_rows(df %>% dplyr::filter(totalResults == 0 | !str_detect(query, spaced_suffix))) %>%
	      tidyr::complete(query, nesting(vc, total), fill = list(totalResults = 0)) %>%
	      select(-f, -conf.low, -conf.high) %>%
	      RKorAPClient::ci() %>%
	      # strip surrounding quotes, square brackets and backslashes from the query
	      mutate(query = str_replace_all(query, '(^"|"$|[\\[\\]\\\\])', '')) %>%
	      # close an opening parenthesis variant: "(innen" becomes "(innen)"
	      mutate(query = str_replace_all(query, paste0('\\(', suffix), paste0('(', suffix, ')'))) %>%
	      dplyr::filter(!str_detect(query, paste0("\\w ", suffix))) # remove "Nutzer innen"
	  } else {
	    q <- corpusQuery(kco, df$query, vc = df$vc, metadataOnly = FALSE) %>%
	      fetchAll()
	    # Reduce every snippet to the non-word characters that precede the suffix
	    # inside the marked match (the `.*` parts consume the rest of the
	    # snippet), then count each distinct separator.
	    cases <- q@collectedMatches$snippet %>%
	      str_replace_all(paste0(".*<mark>.*\\w(\\W+)", suffix, "</mark>.*"), "\\1") %>%
	      as_tibble() %>%
	      group_by(value) %>%
	      summarise(n = n())
	    # One output row per separator: put it in place of the blank before the suffix.
	    df %>% uncount(nrow(cases)) %>%
	      mutate(query = str_replace(query, paste0(" (?=", suffix, ")"), cases$value), totalResults = cases$n)
	  }
	}

	# Make sure the connection carries an access token.
	#
	# kco:    connection object with the slots accessToken, welcome, apiUrl and
	#         KorAPUrl.
	# app_id: OAuth client id (defaults to the demo application id).
	#
	# When both `accessToken` and `welcome` are non-NULL the connection is returned
	# untouched. Otherwise an authorization-code flow is started and the resulting
	# access token is stored in the returned connection.
	oauthorizeDemo <- function(kco, app_id = demo_kor_app_id) {
	  if (is.null(kco@accessToken) || is.null(kco@welcome)) { # if access token is not set or invalid
	    kco@accessToken <- ( # request one
	      oauth_client(
	        id = app_id, # for the demo application
	        token_url = paste0(kco@apiUrl, "oauth2/token")
	      ) %>%
	        oauth_flow_auth_code(
	          scope = "search match_info",
	          auth_url = paste0(kco@KorAPUrl, "settings/oauth/authorize")
	        )
	    )$access_token
	  }
	  kco
	}

	# Query the frequencies of the plural gender variants of `word` per year and
	# chart them.
	#
	# word:            word stem that prefixes/suffixes are pasted around.
	# years:           years appended (space separated) to `vc`, one virtual
	#                  corpus per year.
	# as.alternatives: passed on to frequencyQuery() and hc_freq_by_year_ci().
	# vc:              virtual corpus definition prefix.
	# suffixes/prefixes: parallel vectors forming one query per variant.
	# kco:             connection; by default a new, authorized KorAPConnection.
	#
	# Prints the chart object and also returns it.
	plotPluralGenderVariants <- function(word = "Nutzer",
	                                     years = c(1995:2022),
	                                     as.alternatives = FALSE,
	                                     vc = "referTo ratskorpus-2023-1 & pubDate in",
	                                     suffixes = c('Innen', '[\\*]innen"', '[_]innen"', ' innen'),
	                                     prefixes = c('', '"', '"', ''),
	                                     kco = new("KorAPConnection", verbose=TRUE) %>% oauthorizeDemo()) {
	  variant_queries <- paste0(prefixes, word, suffixes)
	  yearly_vcs <- paste(vc, years)

	  frequencies <- frequencyQuery(kco, variant_queries, yearly_vcs, as.alternatives = as.alternatives)
	  resolved <- unravelPunctuationGenderCases(frequencies, kco = kco)
	  hc <- hc_freq_by_year_ci(resolved, as.alternatives)

	  print(hc)
	  hc
	}


	# Run the demo for "Nutzer" over 1995-2022; the chart object is printed by the
	# function and kept in `hc`.
	hc <- plotPluralGenderVariants(word = "Nutzer", years = c(1995:2022), as.alternatives = FALSE)

	# Fetch the source titles from the `basename` table of the `corpora` database.
	#
	# Returns the `title` column of all rows selected by `WHERE basename.rsr`.
	# The connection and the result set are released even when a query fails.
	getOKKSourceTitles <- function() {
	  db <- dbConnect(MariaDB(), host="klinux10.ids-mannheim.de", user="viewer", dbname="corpora")
	  # registered right after connecting so that no later error can leak the connection
	  on.exit(dbDisconnect(db), add = TRUE)
	  dbExecute(db, "SET NAMES 'utf8'")
	  rs <- dbSendQuery(db, "SELECT title from basename WHERE basename.rsr")
	  # clear the result set whether or not fetching succeeds
	  corpus_parts <- tryCatch(dbFetch(rs), finally = dbClearResult(rs))
	  corpus_parts$title
	}

	# Compute per-source frequencies of the gender variants of `word` and a table
	# of dispersion measures derived from them.
	#
	# word:            word stem that prefixes/suffixes are pasted around.
	# sourceTitles:    corpus titles; each one is wrapped as `<vc><title>"` to form
	#                  one virtual corpus per source.
	# as.alternatives: passed on to frequencyQuery().
	# vc:              virtual corpus definition prefix ending in an opening quote.
	# suffixes/prefixes: parallel vectors forming one query per variant.
	# kco:             connection; by default a new, authorized KorAPConnection.
	#
	# Returns an unnamed list of two elements:
	#   [[1]] the frequency table (columns renamed/added: Variante, Quelle),
	#         restricted to sources with total > 0;
	#   [[2]] one row per dispersion measure (column `measure`) plus an
	#         "Avg. Rank" row, with one column per variant.
	sourceDispersions <- function(word = "Bürger",
	                              sourceTitles = getOKKSourceTitles(),
	                              as.alternatives = FALSE,
	                              vc = 'referTo ratskorpus-2023-1 & corpusTitle="',
	                              suffixes = c('Innen', '[\\*]innen"', '[_]innen"', ' innen'),
	                              prefixes = c('', '"', '"', ''),
	                              kco = new("KorAPConnection", verbose=TRUE) %>% oauthorizeDemo()) {
	  df <-
	    frequencyQuery(kco, paste0(prefixes, word, suffixes), paste0(vc, sourceTitles, '"'), as.alternatives=as.alternatives) %>%
	    unravelPunctuationGenderCases(kco = kco) %>%
	    # source name = what is left of vc after removing everything up to `="` and the closing quote
	    mutate(Quelle=str_replace_all(.$vc, '(.*="|"$)', '')) %>%
	    ipm() %>%
	    filter(total > 0) %>%
	    rename(Variante=query)

	  dispersions <- df %>%
	    group_by(Variante) %>%
	    # relative size of each source within the variant's group
	    mutate(total_size=sum(total), rel_size=total/total_size) %>%
	    # dispersions2() is not defined in this file (presumably from the sourced dispersions.r)
	    group_modify(~ as_tibble(dispersions2(.x$totalResults, .x$rel_size))) %>%
	    pivot_longer(cols= -1) %>%
	    filter(! str_detect(name, " equally")) %>%
	    # drop parenthesised remarks from the measure names
	    mutate(name=str_replace(name, " ?\\(.*\\)", '')) %>%
	    mutate_when(str_detect(name, "DPnorm"), name = "1-DP_norm", value = 1 -value) %>%
	    # The original wrote `mutate(across(where(is.double)), value = ...)`: the
	    # function-less across() was a deprecated no-op, only `value` was rounded.
	    mutate(value = round(value, 2)) %>%
	    mutate_when(str_detect(name, "(corpus|range)"), value = as.integer(value)) %>%
	    group_by(name) %>%
	    # rank variants within each measure: ascending for "Kullback", descending otherwise
	    mutate(rank = if_else(str_detect(name, "Kullback"), rank(value), rank(-value))) %>%
	    group_by(Variante) %>%
	    # mutate(rank = mean(rank, na.rm = FALSE)) %>%
	    bind_rows(summarise(., name="Avg. Rank", value = mean(rank, na.rm = FALSE))) %>%
	    select(-rank) %>%
	    pivot_wider(names_from = Variante) %>%
	    rename(measure=name)

	  list(df, dispersions)
	}

	# Run the dispersion analysis for "Nutzer" and open the measures table (second
	# list element) in the data viewer.
	df_dispersions <- sourceDispersions(word = "Nutzer")
	View(df_dispersions[[2]])

	# htmlwidgets::saveWidget(hc, file=fname, selfcontained = TRUE)

	# Diewald, Nils/Kupietz, Marc/L\u00FCngen, Harald (2022):
	# Tokenizing on scale. Preprocessing large text corpora on the lexical and sentence level.
	# In: Klosa-K\u00FCckelhaus, Annette/Engelberg, Stefan/M\u00F6hrs, Christine/Storjohann, Petra (eds):
	# Dictionaries and Society. Proceedings of the XX EURALEX International Congress, 12-16 July 2022.
	# Mannheim: IDS-Verlag, 2022: 208-221.
	# <https://doi.org/10.14618/ids-pub-11146>