Add a script that guesses origial title and pub date
diff --git a/scripts/originTraceR.R b/scripts/originTraceR.R
new file mode 100644
index 0000000..f064fa5
--- /dev/null
+++ b/scripts/originTraceR.R
@@ -0,0 +1,317 @@
+library(RKorAPClient)
+library(tidyverse)
+library(urltools)
+library(tidyllm)
+
+VC <- "author=/Dickens.*/"
+deliko <- KorAPConnection("https://korap.dnb.de", verbose = TRUE)
+query <- corpusQuery(
+ deliko,
+ "<base/s=t>",
+ # this finds each text once
+ vc = VC,
+ fields = c(
+ "textSigle",
+ "title",
+ "subTitle",
+ "author",
+ "translator",
+ "textType",
+ "textTypeRef",
+ "publisher",
+ "pubDate",
+ "pubPlace",
+ "ISBN",
+ "URN"
+ )
+) %>%
+ fetchNext(maxFetch = 2, randomizePageOrder = TRUE)
+
+df <- query@collectedMatches
+
+df <- df %>%
+ mutate(
+ publisher = str_replace_all(publisher, "data:,", ""),
+ URN = URLdecode(str_replace_all(URN, ".*,", ""))
+ ) %>%
+ select(-c("matchStart", "matchEnd"))
+
+# write_tsv(df, "deliko_metadata.tsv")
+
+# Build a compact prompt requesting strict JSON for translation/original info
+build_translation_prompt <- function(
+ title,
+ author = NA_character_,
+ subTitle = NA_character_,
+ translator = NA_character_,
+ publisher = NA_character_,
+ pubDate = NA_character_,
+ ISBN = NA_character_) {
+ fields <- c(
+ sprintf("Titel: %s", dplyr::coalesce(title, "")),
+ sprintf("Untertitel: %s", dplyr::coalesce(subTitle, "")),
+ sprintf("Autor/in: %s", dplyr::coalesce(author, "")),
+ sprintf("Übersetzer/in: %s", dplyr::coalesce(translator, "")),
+ sprintf("Verlag: %s", dplyr::coalesce(publisher, "")),
+ sprintf("Erscheinungsjahr (de): %s", dplyr::coalesce(pubDate, "")),
+ sprintf("ISBN: %s", dplyr::coalesce(ISBN, ""))
+ )
+ context <- paste(fields, collapse = "\n")
+ paste0(
+ "Du bist ein bibliographischer Assistent. Bestimme für das folgende deutschsprachige Buch:",
+ "\n1) Ob es eine Übersetzung ist.",
+ "\n2) Falls ja: den Originaltitel und das Jahr der Erstveröffentlichung.",
+ "\n\nGib ausschließlich ein einzelnes JSON-Objekt zurück – ohne Begleittext – mit exakt diesen Schlüsseln:",
+ "\n{\n \"is_translation\": <boolean>,\n \"original_title\": <string|null>,\n \"original_publication_year\": <integer|null>,\n \"confidence\": <number zwischen 0 und 1>\n}",
+ "\nNutze null für Unbekanntes. Antworte nur mit JSON.",
+ "\n\nMetadaten:\n",
+ context
+ )
+}
+
+# Resolve tidyllm provider from model prefix
+resolve_provider_for_model <- function(model) {
+ if (grepl("^gpt-", model, ignore.case = TRUE)) {
+ return(tidyllm::openai())
+ } else if (grepl("^claude-", model, ignore.case = TRUE)) {
+ return(tidyllm::claude())
+ } else if (grepl("^gemini-", model, ignore.case = TRUE)) {
+ return(tidyllm::gemini())
+ } else if (grepl("^deepseek", model, ignore.case = TRUE)) {
+ return(tidyllm::deepseek())
+ }
+ # Default fallback
+ tidyllm::deepseek()
+}
+
+# Friendly API key checks per provider family
+ensure_api_key_for_model <- function(model) {
+ if (grepl("^gpt-", model, ignore.case = TRUE)) {
+ if (!nzchar(Sys.getenv("OPENAI_API_KEY"))) {
+ stop("OPENAI_API_KEY is not set. Set it via Sys.setenv('OPENAI_API_KEY'='...') or source your shell env.")
+ }
+ } else if (grepl("^claude-", model, ignore.case = TRUE)) {
+ if (!nzchar(Sys.getenv("ANTHROPIC_API_KEY"))) {
+ stop("ANTHROPIC_API_KEY is not set. Set it via Sys.setenv('ANTHROPIC_API_KEY'='...') or source your shell env.")
+ }
+ } else if (grepl("^gemini-", model, ignore.case = TRUE)) {
+ if (!nzchar(Sys.getenv("GOOGLE_API_KEY"))) {
+ stop("GOOGLE_API_KEY is not set. Set it via Sys.setenv('GOOGLE_API_KEY'='...') or source your shell env.")
+ }
+ } else if (grepl("^deepseek", model, ignore.case = TRUE)) {
+ if (!nzchar(Sys.getenv("DEEPSEEK_API_KEY"))) {
+ stop("DEEPSEEK_API_KEY is not set. Set it via Sys.setenv('DEEPSEEK_API_KEY'='...') or source your shell env.")
+ }
+ }
+ invisible(TRUE)
+}
+
+# Infer original title and year for a single record via LLM (DeepSeek by default)
+infer_original_single <- function(
+ title,
+ author = NA_character_,
+ subTitle = NA_character_,
+ translator = NA_character_,
+ publisher = NA_character_,
+ pubDate = NA_character_,
+ ISBN = NA_character_,
+ provider = NULL,
+ model = "deepseek-chat",
+ temperature = 0.1,
+ max_tries = 3,
+ timeout = 60) {
+ prompt <- build_translation_prompt(
+ title = title,
+ author = author,
+ subTitle = subTitle,
+ translator = translator,
+ publisher = publisher,
+ pubDate = pubDate,
+ ISBN = ISBN
+ )
+
+ # Resolve provider if not supplied
+ if (is.null(provider)) {
+ provider <- resolve_provider_for_model(model)
+ }
+
+ result <- try(
+ {
+ tidyllm::llm_message(prompt) |>
+ tidyllm::chat(
+ .provider = provider,
+ .model = model,
+ .temperature = temperature,
+ .timeout = timeout,
+ .max_tries = max_tries
+ )
+ },
+ silent = TRUE
+ )
+
+ if (inherits(result, "try-error")) {
+ return(tibble::tibble(
+ original_title = NA_character_,
+ original_publication_year = as.integer(NA),
+ orig_confidence = as.numeric(NA)
+ ))
+ }
+
+ # Prefer JSON parsing via tidyllm helper; falls back to NA on failure
+ parsed <- NULL
+ suppressWarnings({
+ parsed <- tidyllm::get_reply_data(result)
+ })
+
+ if (is.null(parsed)) {
+ # Try to extract JSON manually as a very last resort
+ raw <- tidyllm::get_reply(result)
+ json_txt <- stringr::str_extract(raw, "\\{[\\s\\S]*\\}")
+ if (!is.na(json_txt)) {
+ parsed <- try(jsonlite::fromJSON(json_txt), silent = TRUE)
+ if (inherits(parsed, "try-error")) parsed <- NULL
+ }
+ }
+
+ if (is.null(parsed)) {
+ return(tibble::tibble(
+ original_title = NA_character_,
+ original_publication_year = as.integer(NA),
+ orig_confidence = as.numeric(NA)
+ ))
+ }
+
+ ot <- parsed[["original_title"]]
+ oy <- suppressWarnings(as.integer(parsed[["original_publication_year"]]))
+ cf <- suppressWarnings(as.numeric(parsed[["confidence"]]))
+
+ if (!is.na(cf)) cf <- max(0, min(1, cf))
+
+ tibble::tibble(
+ original_title = dplyr::if_else(is.na(ot) | is.null(ot), NA_character_, as.character(ot)),
+ original_publication_year = dplyr::if_else(is.na(oy), as.integer(NA), as.integer(oy)),
+ orig_confidence = dplyr::if_else(is.na(cf), as.numeric(NA), as.numeric(cf))
+ )
+}
+
+# Public API: augment an existing metadata df with original title/year (+ optional confidence)
+augment_metadata_with_original <- function(
+ df,
+ provider = NULL,
+ model = "deepseek-chat",
+ include_confidence = TRUE,
+ temperature = 0.1,
+ max_tries = 3,
+ timeout = 60,
+ pause_seconds = 0,
+ check_api_key = TRUE,
+ show_progress = TRUE,
+ progress_style = 3) {
+ stopifnot(is.data.frame(df))
+
+ needed <- c("title", "author", "subTitle", "translator", "publisher", "pubDate", "ISBN")
+ missing <- setdiff(needed, names(df))
+ if (length(missing) > 0) {
+ stop(sprintf("Missing required columns: %s", paste(missing, collapse = ", ")))
+ }
+
+ # Friendly pre-check for required API key depending on the chosen model/provider
+ if (isTRUE(check_api_key)) {
+ ensure_api_key_for_model(model)
+ }
+
+ # Resolve provider if not supplied
+ if (is.null(provider)) {
+ provider <- resolve_provider_for_model(model)
+ }
+
+ # Prepare inputs and iterate using purrr::pmap_dfr to avoid NSE warnings
+ input_cols <- tibble::tibble(
+ title = df[["title"]],
+ author = df[["author"]],
+ subTitle = df[["subTitle"]],
+ translator = df[["translator"]],
+ publisher = df[["publisher"]],
+ pubDate = df[["pubDate"]],
+ ISBN = df[["ISBN"]]
+ )
+
+ infer_wrapper <- function(title, author, subTitle, translator, publisher, pubDate, ISBN) {
+ res <- infer_original_single(
+ title = title,
+ author = author,
+ subTitle = subTitle,
+ translator = translator,
+ publisher = publisher,
+ pubDate = pubDate,
+ ISBN = ISBN,
+ provider = provider,
+ model = model,
+ temperature = temperature,
+ max_tries = max_tries,
+ timeout = timeout
+ )
+ if (pause_seconds > 0) Sys.sleep(pause_seconds)
+ res
+ }
+
+ n <- nrow(input_cols)
+ results_list <- vector("list", n)
+ empty_res <- tibble::tibble(
+ original_title = NA_character_,
+ original_publication_year = as.integer(NA),
+ orig_confidence = as.numeric(NA)
+ )
+ pb <- NULL
+ if (isTRUE(show_progress) && n > 0) {
+ pb <- utils::txtProgressBar(min = 0, max = n, style = progress_style)
+ }
+ for (i in seq_len(n)) {
+ res <- tryCatch(
+ infer_wrapper(
+ title = input_cols$title[i],
+ author = input_cols$author[i],
+ subTitle = input_cols$subTitle[i],
+ translator = input_cols$translator[i],
+ publisher = input_cols$publisher[i],
+ pubDate = input_cols$pubDate[i],
+ ISBN = input_cols$ISBN[i]
+ ),
+ error = function(e) empty_res
+ )
+ if (is.null(res) || !is.data.frame(res) || nrow(res) < 1) {
+ res <- empty_res
+ } else if (nrow(res) > 1) {
+ res <- res[1, , drop = FALSE]
+ }
+ results_list[[i]] <- res
+ if (!is.null(pb)) utils::setTxtProgressBar(pb, i)
+ }
+ if (!is.null(pb)) close(pb)
+ results_tbl <- dplyr::bind_rows(results_list)
+ if (nrow(results_tbl) != n) {
+ # Ensure row count aligns with df to avoid recycling errors
+ if (nrow(results_tbl) < n) {
+ pad <- dplyr::bind_rows(rep(list(empty_res), n - nrow(results_tbl)))
+ results_tbl <- dplyr::bind_rows(results_tbl, pad)
+ } else if (nrow(results_tbl) > n) {
+ results_tbl <- dplyr::slice(results_tbl, seq_len(n))
+ }
+ }
+
+ out <- dplyr::bind_cols(df, results_tbl)
+ if (!include_confidence) out <- dplyr::select(out, -"orig_confidence")
+ out
+}
+
+# df_aug <- augment_metadata_with_original(df, model = "gemini-2.5-pro")
+df_aug <- augment_metadata_with_original(df, model = "deepseek")
+
+# # OpenAI
+# df_aug <- augment_metadata_with_original(df, model = "gpt-4o-mini")
+
+# # Claude
+#df_aug <- augment_metadata_with_original(df, model = "claude-3-5-sonnet-latest")
+
+# # Gemini
+# df_aug <- augment_metadata_with_original(df, model = "gemini-2.5-pro")