Add a script that guesses origial title and pub date

commit: 43bb93ff2bee7ff43428f99bb64decd843bd44f7 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Thu Sep 25 20:59:14 2025 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Thu Sep 25 20:59:14 2025 +0200
tree: 373d0a45c1a2815f6cee086346e0882f03dc0bff
parent: 0eb3749521174478b6f093e2b08d84226a68a6fb [diff] [blame]
diff --git a/scripts/originTraceR.R b/scripts/originTraceR.R
new file mode 100644
index 0000000..f064fa5
--- /dev/null
+++ b/scripts/originTraceR.R

@@ -0,0 +1,317 @@
+library(RKorAPClient)
+library(tidyverse)
+library(urltools)
+library(tidyllm)
+
+VC <- "author=/Dickens.*/"
+deliko <- KorAPConnection("https://korap.dnb.de", verbose = TRUE)
+query <- corpusQuery(
+  deliko,
+  "<base/s=t>",
+  # this finds each text once
+  vc = VC,
+  fields = c(
+    "textSigle",
+    "title",
+    "subTitle",
+    "author",
+    "translator",
+    "textType",
+    "textTypeRef",
+    "publisher",
+    "pubDate",
+    "pubPlace",
+    "ISBN",
+    "URN"
+  )
+) %>%
+  fetchNext(maxFetch = 2, randomizePageOrder = TRUE)
+
+df <- query@collectedMatches
+
+df <- df %>%
+  mutate(
+    publisher = str_replace_all(publisher, "data:,", ""),
+    URN = URLdecode(str_replace_all(URN, ".*,", ""))
+  ) %>%
+  select(-c("matchStart", "matchEnd"))
+
+# write_tsv(df, "deliko_metadata.tsv")
+
+# Build a compact prompt requesting strict JSON for translation/original info
+build_translation_prompt <- function(
+    title,
+    author = NA_character_,
+    subTitle = NA_character_,
+    translator = NA_character_,
+    publisher = NA_character_,
+    pubDate = NA_character_,
+    ISBN = NA_character_) {
+  fields <- c(
+    sprintf("Titel: %s", dplyr::coalesce(title, "")),
+    sprintf("Untertitel: %s", dplyr::coalesce(subTitle, "")),
+    sprintf("Autor/in: %s", dplyr::coalesce(author, "")),
+    sprintf("Übersetzer/in: %s", dplyr::coalesce(translator, "")),
+    sprintf("Verlag: %s", dplyr::coalesce(publisher, "")),
+    sprintf("Erscheinungsjahr (de): %s", dplyr::coalesce(pubDate, "")),
+    sprintf("ISBN: %s", dplyr::coalesce(ISBN, ""))
+  )
+  context <- paste(fields, collapse = "\n")
+  paste0(
+    "Du bist ein bibliographischer Assistent. Bestimme für das folgende deutschsprachige Buch:",
+    "\n1) Ob es eine Übersetzung ist.",
+    "\n2) Falls ja: den Originaltitel und das Jahr der Erstveröffentlichung.",
+    "\n\nGib ausschließlich ein einzelnes JSON-Objekt zurück – ohne Begleittext – mit exakt diesen Schlüsseln:",
+    "\n{\n  \"is_translation\": <boolean>,\n  \"original_title\": <string|null>,\n  \"original_publication_year\": <integer|null>,\n  \"confidence\": <number zwischen 0 und 1>\n}",
+    "\nNutze null für Unbekanntes. Antworte nur mit JSON.",
+    "\n\nMetadaten:\n",
+    context
+  )
+}
+
+# Resolve tidyllm provider from model prefix
+resolve_provider_for_model <- function(model) {
+  if (grepl("^gpt-", model, ignore.case = TRUE)) {
+    return(tidyllm::openai())
+  } else if (grepl("^claude-", model, ignore.case = TRUE)) {
+    return(tidyllm::claude())
+  } else if (grepl("^gemini-", model, ignore.case = TRUE)) {
+    return(tidyllm::gemini())
+  } else if (grepl("^deepseek", model, ignore.case = TRUE)) {
+    return(tidyllm::deepseek())
+  }
+  # Default fallback
+  tidyllm::deepseek()
+}
+
+# Friendly API key checks per provider family
+ensure_api_key_for_model <- function(model) {
+  if (grepl("^gpt-", model, ignore.case = TRUE)) {
+    if (!nzchar(Sys.getenv("OPENAI_API_KEY"))) {
+      stop("OPENAI_API_KEY is not set. Set it via Sys.setenv('OPENAI_API_KEY'='...') or source your shell env.")
+    }
+  } else if (grepl("^claude-", model, ignore.case = TRUE)) {
+    if (!nzchar(Sys.getenv("ANTHROPIC_API_KEY"))) {
+      stop("ANTHROPIC_API_KEY is not set. Set it via Sys.setenv('ANTHROPIC_API_KEY'='...') or source your shell env.")
+    }
+  } else if (grepl("^gemini-", model, ignore.case = TRUE)) {
+    if (!nzchar(Sys.getenv("GOOGLE_API_KEY"))) {
+      stop("GOOGLE_API_KEY is not set. Set it via Sys.setenv('GOOGLE_API_KEY'='...') or source your shell env.")
+    }
+  } else if (grepl("^deepseek", model, ignore.case = TRUE)) {
+    if (!nzchar(Sys.getenv("DEEPSEEK_API_KEY"))) {
+      stop("DEEPSEEK_API_KEY is not set. Set it via Sys.setenv('DEEPSEEK_API_KEY'='...') or source your shell env.")
+    }
+  }
+  invisible(TRUE)
+}
+
+# Infer original title and year for a single record via LLM (DeepSeek by default)
+infer_original_single <- function(
+    title,
+    author = NA_character_,
+    subTitle = NA_character_,
+    translator = NA_character_,
+    publisher = NA_character_,
+    pubDate = NA_character_,
+    ISBN = NA_character_,
+    provider = NULL,
+    model = "deepseek-chat",
+    temperature = 0.1,
+    max_tries = 3,
+    timeout = 60) {
+  prompt <- build_translation_prompt(
+    title = title,
+    author = author,
+    subTitle = subTitle,
+    translator = translator,
+    publisher = publisher,
+    pubDate = pubDate,
+    ISBN = ISBN
+  )
+
+  # Resolve provider if not supplied
+  if (is.null(provider)) {
+    provider <- resolve_provider_for_model(model)
+  }
+
+  result <- try(
+    {
+      tidyllm::llm_message(prompt) |>
+        tidyllm::chat(
+          .provider = provider,
+          .model = model,
+          .temperature = temperature,
+          .timeout = timeout,
+          .max_tries = max_tries
+        )
+    },
+    silent = TRUE
+  )
+
+  if (inherits(result, "try-error")) {
+    return(tibble::tibble(
+      original_title = NA_character_,
+      original_publication_year = as.integer(NA),
+      orig_confidence = as.numeric(NA)
+    ))
+  }
+
+  # Prefer JSON parsing via tidyllm helper; falls back to NA on failure
+  parsed <- NULL
+  suppressWarnings({
+    parsed <- tidyllm::get_reply_data(result)
+  })
+
+  if (is.null(parsed)) {
+    # Try to extract JSON manually as a very last resort
+    raw <- tidyllm::get_reply(result)
+    json_txt <- stringr::str_extract(raw, "\\{[\\s\\S]*\\}")
+    if (!is.na(json_txt)) {
+      parsed <- try(jsonlite::fromJSON(json_txt), silent = TRUE)
+      if (inherits(parsed, "try-error")) parsed <- NULL
+    }
+  }
+
+  if (is.null(parsed)) {
+    return(tibble::tibble(
+      original_title = NA_character_,
+      original_publication_year = as.integer(NA),
+      orig_confidence = as.numeric(NA)
+    ))
+  }
+
+  ot <- parsed[["original_title"]]
+  oy <- suppressWarnings(as.integer(parsed[["original_publication_year"]]))
+  cf <- suppressWarnings(as.numeric(parsed[["confidence"]]))
+
+  if (!is.na(cf)) cf <- max(0, min(1, cf))
+
+  tibble::tibble(
+    original_title = dplyr::if_else(is.na(ot) | is.null(ot), NA_character_, as.character(ot)),
+    original_publication_year = dplyr::if_else(is.na(oy), as.integer(NA), as.integer(oy)),
+    orig_confidence = dplyr::if_else(is.na(cf), as.numeric(NA), as.numeric(cf))
+  )
+}
+
+# Public API: augment an existing metadata df with original title/year (+ optional confidence)
+augment_metadata_with_original <- function(
+    df,
+    provider = NULL,
+    model = "deepseek-chat",
+    include_confidence = TRUE,
+    temperature = 0.1,
+    max_tries = 3,
+    timeout = 60,
+    pause_seconds = 0,
+    check_api_key = TRUE,
+    show_progress = TRUE,
+    progress_style = 3) {
+  stopifnot(is.data.frame(df))
+
+  needed <- c("title", "author", "subTitle", "translator", "publisher", "pubDate", "ISBN")
+  missing <- setdiff(needed, names(df))
+  if (length(missing) > 0) {
+    stop(sprintf("Missing required columns: %s", paste(missing, collapse = ", ")))
+  }
+
+  # Friendly pre-check for required API key depending on the chosen model/provider
+  if (isTRUE(check_api_key)) {
+    ensure_api_key_for_model(model)
+  }
+
+  # Resolve provider if not supplied
+  if (is.null(provider)) {
+    provider <- resolve_provider_for_model(model)
+  }
+
+  # Prepare inputs and iterate using purrr::pmap_dfr to avoid NSE warnings
+  input_cols <- tibble::tibble(
+    title = df[["title"]],
+    author = df[["author"]],
+    subTitle = df[["subTitle"]],
+    translator = df[["translator"]],
+    publisher = df[["publisher"]],
+    pubDate = df[["pubDate"]],
+    ISBN = df[["ISBN"]]
+  )
+
+  infer_wrapper <- function(title, author, subTitle, translator, publisher, pubDate, ISBN) {
+    res <- infer_original_single(
+      title = title,
+      author = author,
+      subTitle = subTitle,
+      translator = translator,
+      publisher = publisher,
+      pubDate = pubDate,
+      ISBN = ISBN,
+      provider = provider,
+      model = model,
+      temperature = temperature,
+      max_tries = max_tries,
+      timeout = timeout
+    )
+    if (pause_seconds > 0) Sys.sleep(pause_seconds)
+    res
+  }
+
+  n <- nrow(input_cols)
+  results_list <- vector("list", n)
+  empty_res <- tibble::tibble(
+    original_title = NA_character_,
+    original_publication_year = as.integer(NA),
+    orig_confidence = as.numeric(NA)
+  )
+  pb <- NULL
+  if (isTRUE(show_progress) && n > 0) {
+    pb <- utils::txtProgressBar(min = 0, max = n, style = progress_style)
+  }
+  for (i in seq_len(n)) {
+    res <- tryCatch(
+      infer_wrapper(
+        title = input_cols$title[i],
+        author = input_cols$author[i],
+        subTitle = input_cols$subTitle[i],
+        translator = input_cols$translator[i],
+        publisher = input_cols$publisher[i],
+        pubDate = input_cols$pubDate[i],
+        ISBN = input_cols$ISBN[i]
+      ),
+      error = function(e) empty_res
+    )
+    if (is.null(res) || !is.data.frame(res) || nrow(res) < 1) {
+      res <- empty_res
+    } else if (nrow(res) > 1) {
+      res <- res[1, , drop = FALSE]
+    }
+    results_list[[i]] <- res
+    if (!is.null(pb)) utils::setTxtProgressBar(pb, i)
+  }
+  if (!is.null(pb)) close(pb)
+  results_tbl <- dplyr::bind_rows(results_list)
+  if (nrow(results_tbl) != n) {
+    # Ensure row count aligns with df to avoid recycling errors
+    if (nrow(results_tbl) < n) {
+      pad <- dplyr::bind_rows(rep(list(empty_res), n - nrow(results_tbl)))
+      results_tbl <- dplyr::bind_rows(results_tbl, pad)
+    } else if (nrow(results_tbl) > n) {
+      results_tbl <- dplyr::slice(results_tbl, seq_len(n))
+    }
+  }
+
+  out <- dplyr::bind_cols(df, results_tbl)
+  if (!include_confidence) out <- dplyr::select(out, -"orig_confidence")
+  out
+}
+
+# df_aug <- augment_metadata_with_original(df, model = "gemini-2.5-pro")
+df_aug <- augment_metadata_with_original(df, model = "deepseek")
+
+#  # OpenAI
+# df_aug <- augment_metadata_with_original(df, model = "gpt-4o-mini")
+
+#  # Claude
+#df_aug <- augment_metadata_with_original(df, model = "claude-3-5-sonnet-latest")
+
+#  # Gemini
+#  df_aug <- augment_metadata_with_original(df, model = "gemini-2.5-pro")
commit	43bb93ff2bee7ff43428f99bb64decd843bd44f7	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Sep 25 20:59:14 2025 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Sep 25 20:59:14 2025 +0200
tree	373d0a45c1a2815f6cee086346e0882f03dc0bff
parent	0eb3749521174478b6f093e2b08d84226a68a6fb [diff] [blame]