Add script to generate useful subcorpora
diff --git a/scripts/useful_deliko_subcorpora.R b/scripts/useful_deliko_subcorpora.R
new file mode 100644
index 0000000..cf72f5a
--- /dev/null
+++ b/scripts/useful_deliko_subcorpora.R
@@ -0,0 +1,251 @@
+#!/usr/bin/env Rscript
+
+# Generate static HTML tables with curated DeLiKo@DNB virtual subcorpora
+# Creates one German and one English HTML page for embedding (e.g., via iframe)
+
+suppressPackageStartupMessages({
+ library(tidyverse)
+ library(DT)
+ library(glue)
+ library(RKorAPClient)
+ library(htmlwidgets)
+})
+
+korap_url <- "https://korap.dnb.de/"
+
+kco <- KorAPConnection(KorAPUrl = korap_url, verbose = TRUE)
+
+# Base genre definitions adapted from deliko-xl-composition.R
+genres_regex <- c(
+ "Krimi",
+ "(Erotik|Gay)",
+ "Western",
+ "Arzt",
+ "Liebes",
+ "Heimat",
+ "(Horror|Grusel|Vampir)",
+ "Historisch",
+ "Fantasy",
+ "Science",
+ "Jugend",
+ "Mystery"
+)
+
+genres_de <- c(
+ "Krimi",
+ "Erotik",
+ "Western",
+ "Arztroman",
+ "Liebesroman",
+ "Heimatroman",
+ "Horror",
+ "Historischer Roman",
+ "Fantasy",
+ "Science Fiction",
+ "Jugendbuch",
+ "Mystery",
+ "Sonstiges"
+)
+
+genres_en <- c(
+ "crime",
+ "erotic",
+ "western",
+ "doctor novel",
+ "romance",
+ "homeland",
+ "horror",
+ "historic",
+ "fantasy",
+ "science fiction",
+ "young adult",
+ "mystery",
+ "other"
+)
+
+other_query <- glue("textType!=/.*({paste0(genres_regex, collapse = '|')}).*/")
+
+genre_definitions <- tibble(
+ regex = c(genres_regex, NA_character_),
+ de_title = genres_de,
+ en_title = genres_en
+) %>%
+ mutate(
+ corpus_query = if_else(
+ is.na(regex),
+ other_query,
+ glue("textType=/.*{regex}.*/")
+ ),
+ id = str_replace_all(str_to_lower(en_title), "[^a-z0-9]+", "_") %>%
+ str_replace_all("_+", "_") %>%
+ str_remove("^_") %>%
+ str_remove("_$")
+ ) %>%
+ mutate(
+ en_description = if_else(
+ en_title == "other",
+ "Novels that do not match the predefined genre heuristics.",
+ glue("Genre {en_title}.")
+ ),
+ de_description = if_else(
+ de_title == "Sonstiges",
+ "Romane, die keiner der vordefinierten Genre-Heuristiken entsprechen.",
+ glue("Genre {de_title}.")
+ )
+ )
+
+genre_definitions <- genre_definitions %>% select(-regex)
+
+# Additional manually curated virtual corpora; edit this list as needed
+extra_corpora <- tribble(
+ ~id, ~de_title, ~en_title, ~corpus_query, ~de_description, ~en_description,
+ "award_winners", "Buchpreisträger", "Award winners", "award=buchpreis",
+ "Buchpreisträger.", "Award winners."
+)
+
+vc_definitions <- bind_rows(genre_definitions, extra_corpora)
+
+stats <- map_dfr(vc_definitions$corpus_query, function(vc) {
+ corpusStats(kco, vc, as.df = TRUE) %>%
+ select(-vc)
+})
+
+vc_definitions <- bind_cols(vc_definitions, stats)
+
+# Remove corpora without tokens, but leave a note in the console
+zero_token <- vc_definitions %>% filter(tokens == 0)
+if (nrow(zero_token) > 0) {
+ warning("Found corpora with 0 tokens (omitted from output):\n", paste0(" - ", zero_token$en_title, collapse = "\n"))
+ vc_definitions <- vc_definitions %>% filter(tokens > 0)
+}
+
+vc_definitions <- vc_definitions %>% arrange(
+ str_remove(en_description, "\\.$"),
+ str_remove(de_description, "\\.$"),
+ de_title
+)
+
+encode_query <- function(q) {
+ utils::URLencode(q, reserved = TRUE)
+}
+
+make_copy_icon <- function(text) {
+ glue(
+ " <span class='copy-icon' style='cursor: pointer; color: #6c757d; margin-left: 5px; font-size: 0.9em;' title='Click to copy to clipboard' data-copy='{text}'>📋</span>"
+ )
+}
+
+korap_link <- function(title, query) {
+ href <- glue("{korap_url}?cq={encode_query(query)}&ql=poliqarp&cutoff=1")
+ glue("<a title='Open KorAP with {title} as virtual corpus' target='_blank' href='{href}'>{title}</a>")
+}
+
+copy_js <- JS(
+ "function(el) {",
+ " document.body.style.padding = '0';",
+ " $(el).on('click', '.copy-icon', function(e) {",
+ " e.preventDefault();",
+ " var textToCopy = $(this).data('copy');",
+ " if (navigator.clipboard && window.isSecureContext) {",
+ " navigator.clipboard.writeText(textToCopy).then(function() {",
+ " var originalIcon = $(e.target).text();",
+ " $(e.target).text('✓').css('color', '#28a745');",
+ " setTimeout(function() {",
+ " $(e.target).text(originalIcon).css('color', '#6c757d');",
+ " }, 1000);",
+ " }).catch(function(err) {",
+ " console.error('Failed to copy text: ', err);",
+ " fallbackCopyTextToClipboard(textToCopy, e.target);",
+ " });",
+ " } else {",
+ " fallbackCopyTextToClipboard(textToCopy, e.target);",
+ " }",
+ " });",
+ " function fallbackCopyTextToClipboard(text, target) {",
+ " var textArea = document.createElement('textarea');",
+ " textArea.value = text;",
+ " textArea.style.position = 'fixed';",
+ " textArea.style.left = '-999999px';",
+ " textArea.style.top = '-999999px';",
+ " document.body.appendChild(textArea);",
+ " textArea.focus();",
+ " textArea.select();",
+ " try {",
+ " var successful = document.execCommand('copy');",
+ " if (successful) {",
+ " var originalIcon = $(target).text();",
+ " $(target).text('✓').css('color', '#28a745');",
+ " setTimeout(function() {",
+ " $(target).text(originalIcon).css('color', '#6c757d');",
+ " }, 1000);",
+ " }",
+ " } catch (err) {",
+ " console.error('Fallback: unable to copy', err);",
+ " }",
+ " document.body.removeChild(textArea);",
+ " }",
+ "}"
+)
+
+format_number_de <- function(x) format(x, big.mark = ".", decimal.mark = ",", scientific = FALSE, trim = TRUE)
+format_number_en <- function(x) format(x, big.mark = ",", decimal.mark = ".", scientific = FALSE, trim = TRUE)
+
+make_table <- function(data, locale = c("de", "en")) {
+ locale <- match.arg(locale)
+ if (locale == "de") {
+ link_text <- str_remove(data$de_description, "\\.$")
+ tokens <- format_number_de(data$tokens)
+ documents <- format_number_de(data$documents)
+ vc_header <- "VC-Definition (für Client-Bibliotheken)"
+ title_header <- "Titel / Link"
+ tokens_header <- "Tokens"
+ documents_header <- "Dokumente"
+ language_url <- "//cdn.datatables.net/plug-ins/1.10.11/i18n/German.json"
+ } else {
+ link_text <- str_remove(data$en_description, "\\.$")
+ tokens <- format_number_en(data$tokens)
+ documents <- format_number_en(data$documents)
+ vc_header <- "VC definition (for client libraries)"
+ title_header <- "Title / Link"
+ tokens_header <- "Tokens"
+ documents_header <- "Documents"
+ language_url <- "//cdn.datatables.net/plug-ins/1.10.11/i18n/English.json"
+ }
+
+ table_data <- tibble(
+ `{{title_header}}` := map2_chr(link_text, data$corpus_query, korap_link),
+ `{{vc_header}}` := paste0(data$corpus_query, make_copy_icon(data$corpus_query)),
+ `{{tokens_header}}` := tokens,
+ `{{documents_header}}` := documents
+ )
+
+ names(table_data) <- c(title_header, vc_header, tokens_header, documents_header)
+
+ datatable(
+ table_data,
+ escape = FALSE,
+ options = list(
+ paging = FALSE,
+ dom = "tp",
+ language = list(url = language_url),
+ initComplete = JS(
+ "function(settings, json) {",
+ " $(this.api().table().header()).css({'font-family': 'Fira Sans'});",
+ " $(this.api().table().body()).css({'font-family': 'Fira Sans'});",
+ "}"
+ )
+ ),
+ filter = "bottom",
+ rownames = FALSE
+ ) %>%
+ formatStyle(colnames(table_data), fontFamily = "Fira Sans, Lato, sans-serif") %>%
+ htmlwidgets::onRender(copy_js)
+}
+
+widget_de <- make_table(vc_definitions, "de")
+widget_en <- make_table(vc_definitions, "en")
+
+saveWidget(widget_de, file = "useful_deliko_subcorpora_de.html", selfcontained = TRUE)
+saveWidget(widget_en, file = "useful_deliko_subcorpora_en.html", selfcontained = TRUE)
+
+message("Generated useful_deliko_subcorpora_de.html and useful_deliko_subcorpora_en.html")