Add script to generate useful subcorpora
diff --git a/scripts/useful_deliko_subcorpora.R b/scripts/useful_deliko_subcorpora.R
new file mode 100644
index 0000000..cf72f5a
--- /dev/null
+++ b/scripts/useful_deliko_subcorpora.R
@@ -0,0 +1,251 @@
+#!/usr/bin/env Rscript
+
+# Generate static HTML tables with curated DeLiKo@DNB virtual subcorpora
+# Creates one German and one English HTML page for embedding (e.g., via iframe)
+
+suppressPackageStartupMessages({
+  library(tidyverse)
+  library(DT)
+  library(glue)
+  library(RKorAPClient)
+  library(htmlwidgets)
+})
+
+korap_url <- "https://korap.dnb.de/"
+
+kco <- KorAPConnection(KorAPUrl = korap_url, verbose = TRUE)
+
+# Base genre definitions adapted from deliko-xl-composition.R
+genres_regex <- c(
+  "Krimi",
+  "(Erotik|Gay)",
+  "Western",
+  "Arzt",
+  "Liebes",
+  "Heimat",
+  "(Horror|Grusel|Vampir)",
+  "Historisch",
+  "Fantasy",
+  "Science",
+  "Jugend",
+  "Mystery"
+)
+
+genres_de <- c(
+  "Krimi",
+  "Erotik",
+  "Western",
+  "Arztroman",
+  "Liebesroman",
+  "Heimatroman",
+  "Horror",
+  "Historischer Roman",
+  "Fantasy",
+  "Science Fiction",
+  "Jugendbuch",
+  "Mystery",
+  "Sonstiges"
+)
+
+genres_en <- c(
+  "crime",
+  "erotic",
+  "western",
+  "doctor novel",
+  "romance",
+  "homeland",
+  "horror",
+  "historic",
+  "fantasy",
+  "science fiction",
+  "young adult",
+  "mystery",
+  "other"
+)
+
+other_query <- glue("textType!=/.*({paste0(genres_regex, collapse = '|')}).*/")
+
+genre_definitions <- tibble(
+  regex = c(genres_regex, NA_character_),
+  de_title = genres_de,
+  en_title = genres_en
+) %>%
+  mutate(
+    corpus_query = if_else(
+      is.na(regex),
+      other_query,
+      glue("textType=/.*{regex}.*/")
+    ),
+    id = str_replace_all(str_to_lower(en_title), "[^a-z0-9]+", "_") %>%
+      str_replace_all("_+", "_") %>%
+      str_remove("^_") %>%
+      str_remove("_$")
+  ) %>%
+  mutate(
+    en_description = if_else(
+      en_title == "other",
+      "Novels that do not match the predefined genre heuristics.",
+      glue("Genre {en_title}.")
+    ),
+    de_description = if_else(
+      de_title == "Sonstiges",
+      "Romane, die keiner der vordefinierten Genre-Heuristiken entsprechen.",
+      glue("Genre {de_title}.")
+    )
+  )
+
+genre_definitions <- genre_definitions %>% select(-regex)
+
+# Additional manually curated virtual corpora; edit this list as needed
+extra_corpora <- tribble(
+  ~id, ~de_title, ~en_title, ~corpus_query, ~de_description, ~en_description,
+  "award_winners", "Buchpreisträger", "Award winners", "award=buchpreis",
+  "Buchpreisträger.", "Award winners."
+)
+
+vc_definitions <- bind_rows(genre_definitions, extra_corpora)
+
+stats <- map_dfr(vc_definitions$corpus_query, function(vc) {
+  corpusStats(kco, vc, as.df = TRUE) %>%
+    select(-vc)
+})
+
+vc_definitions <- bind_cols(vc_definitions, stats)
+
+# Remove corpora without tokens, but leave a note in the console
+zero_token <- vc_definitions %>% filter(tokens == 0)
+if (nrow(zero_token) > 0) {
+  warning("Found corpora with 0 tokens (omitted from output):\n", paste0(" - ", zero_token$en_title, collapse = "\n"))
+  vc_definitions <- vc_definitions %>% filter(tokens > 0)
+}
+
+vc_definitions <- vc_definitions %>% arrange(
+  str_remove(en_description, "\\.$"),
+  str_remove(de_description, "\\.$"),
+  de_title
+)
+
+encode_query <- function(q) {
+  utils::URLencode(q, reserved = TRUE)
+}
+
+make_copy_icon <- function(text) {
+  glue(
+    " <span class='copy-icon' style='cursor: pointer; color: #6c757d; margin-left: 5px; font-size: 0.9em;' title='Click to copy to clipboard' data-copy='{text}'>📋</span>"
+  )
+}
+
+korap_link <- function(title, query) {
+  href <- glue("{korap_url}?cq={encode_query(query)}&ql=poliqarp&cutoff=1")
+  glue("<a title='Open KorAP with {title} as virtual corpus' target='_blank' href='{href}'>{title}</a>")
+}
+
+copy_js <- JS(
+  "function(el) {",
+  "  document.body.style.padding = '0';",
+  "  $(el).on('click', '.copy-icon', function(e) {",
+  "    e.preventDefault();",
+  "    var textToCopy = $(this).data('copy');",
+  "    if (navigator.clipboard && window.isSecureContext) {",
+  "      navigator.clipboard.writeText(textToCopy).then(function() {",
+  "        var originalIcon = $(e.target).text();",
+  "        $(e.target).text('✓').css('color', '#28a745');",
+  "        setTimeout(function() {",
+  "          $(e.target).text(originalIcon).css('color', '#6c757d');",
+  "        }, 1000);",
+  "      }).catch(function(err) {",
+  "        console.error('Failed to copy text: ', err);",
+  "        fallbackCopyTextToClipboard(textToCopy, e.target);",
+  "      });",
+  "    } else {",
+  "      fallbackCopyTextToClipboard(textToCopy, e.target);",
+  "    }",
+  "  });",
+  "  function fallbackCopyTextToClipboard(text, target) {",
+  "    var textArea = document.createElement('textarea');",
+  "    textArea.value = text;",
+  "    textArea.style.position = 'fixed';",
+  "    textArea.style.left = '-999999px';",
+  "    textArea.style.top = '-999999px';",
+  "    document.body.appendChild(textArea);",
+  "    textArea.focus();",
+  "    textArea.select();",
+  "    try {",
+  "      var successful = document.execCommand('copy');",
+  "      if (successful) {",
+  "        var originalIcon = $(target).text();",
+  "        $(target).text('✓').css('color', '#28a745');",
+  "        setTimeout(function() {",
+  "          $(target).text(originalIcon).css('color', '#6c757d');",
+  "        }, 1000);",
+  "      }",
+  "    } catch (err) {",
+  "      console.error('Fallback: unable to copy', err);",
+  "    }",
+  "    document.body.removeChild(textArea);",
+  "  }",
+  "}"
+)
+
+format_number_de <- function(x) format(x, big.mark = ".", decimal.mark = ",", scientific = FALSE, trim = TRUE)
+format_number_en <- function(x) format(x, big.mark = ",", decimal.mark = ".", scientific = FALSE, trim = TRUE)
+
+make_table <- function(data, locale = c("de", "en")) {
+  locale <- match.arg(locale)
+  if (locale == "de") {
+    link_text <- str_remove(data$de_description, "\\.$")
+    tokens <- format_number_de(data$tokens)
+    documents <- format_number_de(data$documents)
+    vc_header <- "VC-Definition (für Client-Bibliotheken)"
+    title_header <- "Titel / Link"
+    tokens_header <- "Tokens"
+    documents_header <- "Dokumente"
+    language_url <- "//cdn.datatables.net/plug-ins/1.10.11/i18n/German.json"
+  } else {
+    link_text <- str_remove(data$en_description, "\\.$")
+    tokens <- format_number_en(data$tokens)
+    documents <- format_number_en(data$documents)
+    vc_header <- "VC definition (for client libraries)"
+    title_header <- "Title / Link"
+    tokens_header <- "Tokens"
+    documents_header <- "Documents"
+    language_url <- "//cdn.datatables.net/plug-ins/1.10.11/i18n/English.json"
+  }
+
+  table_data <- tibble(
+    `{{title_header}}` := map2_chr(link_text, data$corpus_query, korap_link),
+    `{{vc_header}}` := paste0(data$corpus_query, make_copy_icon(data$corpus_query)),
+    `{{tokens_header}}` := tokens,
+    `{{documents_header}}` := documents
+  )
+
+  names(table_data) <- c(title_header, vc_header, tokens_header, documents_header)
+
+  datatable(
+    table_data,
+    escape = FALSE,
+    options = list(
+      paging = FALSE,
+      dom = "tp",
+      language = list(url = language_url),
+      initComplete = JS(
+        "function(settings, json) {",
+        "  $(this.api().table().header()).css({'font-family': 'Fira Sans'});",
+        "  $(this.api().table().body()).css({'font-family': 'Fira Sans'});",
+        "}"
+      )
+    ),
+    filter = "bottom",
+    rownames = FALSE
+  ) %>%
+    formatStyle(colnames(table_data), fontFamily = "Fira Sans, Lato, sans-serif") %>%
+    htmlwidgets::onRender(copy_js)
+}
+
+widget_de <- make_table(vc_definitions, "de")
+widget_en <- make_table(vc_definitions, "en")
+
+saveWidget(widget_de, file = "useful_deliko_subcorpora_de.html", selfcontained = TRUE)
+saveWidget(widget_en, file = "useful_deliko_subcorpora_en.html", selfcontained = TRUE)
+
+message("Generated useful_deliko_subcorpora_de.html and useful_deliko_subcorpora_en.html")