originTraceR: default to English prompt and add error messages

commit: 1bfd8304d6da39511bb73cbd57b9d1e04ae2a5cf [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Thu Sep 25 21:14:45 2025 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Fri Sep 26 09:13:50 2025 +0200
tree: d23bacf7fdfecdd54d8034f240a0183db05e1517
parent: 43bb93ff2bee7ff43428f99bb64decd843bd44f7 [diff]
diff --git a/scripts/originTraceR.R b/scripts/originTraceR.R
index f064fa5..06ca031 100644
--- a/scripts/originTraceR.R
+++ b/scripts/originTraceR.R

@@ -46,27 +46,52 @@
     translator = NA_character_,
     publisher = NA_character_,
     pubDate = NA_character_,
-    ISBN = NA_character_) {
-  fields <- c(
-    sprintf("Titel: %s", dplyr::coalesce(title, "")),
-    sprintf("Untertitel: %s", dplyr::coalesce(subTitle, "")),
-    sprintf("Autor/in: %s", dplyr::coalesce(author, "")),
-    sprintf("Übersetzer/in: %s", dplyr::coalesce(translator, "")),
-    sprintf("Verlag: %s", dplyr::coalesce(publisher, "")),
-    sprintf("Erscheinungsjahr (de): %s", dplyr::coalesce(pubDate, "")),
-    sprintf("ISBN: %s", dplyr::coalesce(ISBN, ""))
-  )
-  context <- paste(fields, collapse = "\n")
-  paste0(
-    "Du bist ein bibliographischer Assistent. Bestimme für das folgende deutschsprachige Buch:",
-    "\n1) Ob es eine Übersetzung ist.",
-    "\n2) Falls ja: den Originaltitel und das Jahr der Erstveröffentlichung.",
-    "\n\nGib ausschließlich ein einzelnes JSON-Objekt zurück – ohne Begleittext – mit exakt diesen Schlüsseln:",
-    "\n{\n  \"is_translation\": <boolean>,\n  \"original_title\": <string|null>,\n  \"original_publication_year\": <integer|null>,\n  \"confidence\": <number zwischen 0 und 1>\n}",
-    "\nNutze null für Unbekanntes. Antworte nur mit JSON.",
-    "\n\nMetadaten:\n",
-    context
-  )
+    ISBN = NA_character_,
+    prompt_language = c("en", "de")) {
+  prompt_language <- match.arg(prompt_language)
+  if (prompt_language == "de") {
+    fields <- c(
+      sprintf("Titel: %s", dplyr::coalesce(title, "")),
+      sprintf("Untertitel: %s", dplyr::coalesce(subTitle, "")),
+      sprintf("Autor/in: %s", dplyr::coalesce(author, "")),
+      sprintf("Übersetzer/in: %s", dplyr::coalesce(translator, "")),
+      sprintf("Verlag: %s", dplyr::coalesce(publisher, "")),
+      sprintf("Erscheinungsjahr (de): %s", dplyr::coalesce(pubDate, "")),
+      sprintf("ISBN: %s", dplyr::coalesce(ISBN, ""))
+    )
+    context <- paste(fields, collapse = "\n")
+    paste0(
+      "Du bist ein bibliographischer Assistent. Bestimme für das folgende deutschsprachige Buch:",
+      "\n1) Ob es eine Übersetzung ist.",
+      "\n2) Falls ja: den Originaltitel und das Jahr der Erstveröffentlichung.",
+      "\n\nGib ausschließlich ein einzelnes JSON-Objekt zurück – ohne Begleittext – mit exakt diesen Schlüsseln:",
+      "\n{\n  \"is_translation\": <boolean>,\n  \"original_title\": <string|null>,\n  \"original_publication_year\": <integer|null>,\n  \"confidence\": <number zwischen 0 und 1>\n}",
+      "\nNutze null für Unbekanntes. Antworte nur mit JSON.",
+      "\n\nMetadaten:\n",
+      context
+    )
+  } else {
+    fields <- c(
+      sprintf("Title: %s", dplyr::coalesce(title, "")),
+      sprintf("Subtitle: %s", dplyr::coalesce(subTitle, "")),
+      sprintf("Author: %s", dplyr::coalesce(author, "")),
+      sprintf("Translator: %s", dplyr::coalesce(translator, "")),
+      sprintf("Publisher: %s", dplyr::coalesce(publisher, "")),
+      sprintf("Publication year (German edition): %s", dplyr::coalesce(pubDate, "")),
+      sprintf("ISBN: %s", dplyr::coalesce(ISBN, ""))
+    )
+    context <- paste(fields, collapse = "\n")
+    paste0(
+      "You are a bibliographic assistant. For the following German-language book, determine:",
+      "\n1) Whether it is a translation.",
+      "\n2) If yes: the original title and the year of first publication.",
+      "\n\nReturn a single JSON object only — no prose — with exactly these keys:",
+      "\n{\n  \"is_translation\": <boolean>,\n  \"original_title\": <string|null>,\n  \"original_publication_year\": <integer|null>,\n  \"confidence\": <number between 0 and 1>\n}",
+      "\nUse null for unknown. Output JSON only.",
+      "\n\nMetadata:\n",
+      context
+    )
+  }
 }
 
 # Resolve tidyllm provider from model prefix
@@ -127,7 +152,8 @@
     translator = translator,
     publisher = publisher,
     pubDate = pubDate,
-    ISBN = ISBN
+    ISBN = ISBN,
+    prompt_language = "en"
   )
 
   # Resolve provider if not supplied
@@ -135,9 +161,11 @@
     provider <- resolve_provider_for_model(model)
   }
 
-  result <- try(
+  result <- NULL
+  chat_err <- NULL
+  tryCatch(
     {
-      tidyllm::llm_message(prompt) |>
+      result <- tidyllm::llm_message(prompt) |>
         tidyllm::chat(
           .provider = provider,
           .model = model,
@@ -146,10 +174,16 @@
           .max_tries = max_tries
         )
     },
-    silent = TRUE
+    error = function(e) {
+      chat_err <<- conditionMessage(e)
+    }
   )
 
-  if (inherits(result, "try-error")) {
+  if (!is.null(chat_err)) {
+    message(sprintf(
+      "LLM request failed (model=%s, title=\"%s\", author=\"%s\"): %s",
+      model, as.character(title), as.character(author), chat_err
+    ))
     return(tibble::tibble(
       original_title = NA_character_,
       original_publication_year = as.integer(NA),
@@ -198,7 +232,7 @@
 augment_metadata_with_original <- function(
     df,
     provider = NULL,
-    model = "deepseek-chat",
+    model = "gemini-2.5-pro",
     include_confidence = TRUE,
     temperature = 0.1,
     max_tries = 3,
@@ -277,7 +311,13 @@
         pubDate = input_cols$pubDate[i],
         ISBN = input_cols$ISBN[i]
       ),
-      error = function(e) empty_res
+      error = function(e) {
+        message(sprintf(
+          "Row %d failed (model=%s, title=\"%s\", author=\"%s\"): %s",
+          i, model, as.character(input_cols$title[i]), as.character(input_cols$author[i]), conditionMessage(e)
+        ))
+        empty_res
+      }
     )
     if (is.null(res) || !is.data.frame(res) || nrow(res) < 1) {
       res <- empty_res
@@ -305,13 +345,13 @@
 }
 
 # df_aug <- augment_metadata_with_original(df, model = "gemini-2.5-pro")
-df_aug <- augment_metadata_with_original(df, model = "deepseek")
+df_aug <- augment_metadata_with_original(df, model = "deepseek-chat")
 
 #  # OpenAI
 # df_aug <- augment_metadata_with_original(df, model = "gpt-4o-mini")
 
 #  # Claude
-#df_aug <- augment_metadata_with_original(df, model = "claude-3-5-sonnet-latest")
+# df_aug <- augment_metadata_with_original(df, model = "claude-3-5-sonnet-latest")
 
 #  # Gemini
 #  df_aug <- augment_metadata_with_original(df, model = "gemini-2.5-pro")
commit	1bfd8304d6da39511bb73cbd57b9d1e04ae2a5cf	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Sep 25 21:14:45 2025 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Fri Sep 26 09:13:50 2025 +0200
tree	d23bacf7fdfecdd54d8034f240a0183db05e1517
parent	43bb93ff2bee7ff43428f99bb64decd843bd44f7 [diff]