scripts/originTraceR.R - KorAP/deliko - Gitiles

 library(RKorAPClient)
 library(tidyverse)
 library(urltools)
 library(tidyllm)

 VC <- "author=/Dickens.*/"
 deliko <- KorAPConnection("https://korap.dnb.de", verbose = TRUE)
 query <- corpusQuery(
   deliko,
   "<base/s=t>",
   # this finds each text once
   vc = VC,
   fields = c(
     "textSigle",
     "title",
     "subTitle",
     "author",
     "translator",
     "textType",
     "textTypeRef",
     "publisher",
     "pubDate",
     "pubPlace",
     "ISBN",
     "URN"
   )
 ) %>%
   fetchNext(maxFetch = 2, randomizePageOrder = TRUE)

 df <- query@collectedMatches

 df <- df %>%
   mutate(
     publisher = str_replace_all(publisher, "data:,", ""),
     URN = URLdecode(str_replace_all(URN, ".*,", ""))
   ) %>%
   select(-c("matchStart", "matchEnd"))

 # write_tsv(df, "deliko_metadata.tsv")

 # Build a compact prompt requesting strict JSON for translation/original info
 build_translation_prompt <- function(
     title,
     author = NA_character_,
     subTitle = NA_character_,
     translator = NA_character_,
     publisher = NA_character_,
     pubDate = NA_character_,
     ISBN = NA_character_) {
   fields <- c(
     sprintf("Titel: %s", dplyr::coalesce(title, "")),
     sprintf("Untertitel: %s", dplyr::coalesce(subTitle, "")),
     sprintf("Autor/in: %s", dplyr::coalesce(author, "")),
     sprintf("Übersetzer/in: %s", dplyr::coalesce(translator, "")),
     sprintf("Verlag: %s", dplyr::coalesce(publisher, "")),
     sprintf("Erscheinungsjahr (de): %s", dplyr::coalesce(pubDate, "")),
     sprintf("ISBN: %s", dplyr::coalesce(ISBN, ""))
   )
   context <- paste(fields, collapse = "\n")
   paste0(
     "Du bist ein bibliographischer Assistent. Bestimme für das folgende deutschsprachige Buch:",
     "\n1) Ob es eine Übersetzung ist.",
     "\n2) Falls ja: den Originaltitel und das Jahr der Erstveröffentlichung.",
     "\n\nGib ausschließlich ein einzelnes JSON-Objekt zurück – ohne Begleittext – mit exakt diesen Schlüsseln:",
     "\n{\n  \"is_translation\": <boolean>,\n  \"original_title\": <string|null>,\n  \"original_publication_year\": <integer|null>,\n  \"confidence\": <number zwischen 0 und 1>\n}",
     "\nNutze null für Unbekanntes. Antworte nur mit JSON.",
     "\n\nMetadaten:\n",
     context
   )
 }

 # Resolve tidyllm provider from model prefix
 resolve_provider_for_model <- function(model) {
   if (grepl("^gpt-", model, ignore.case = TRUE)) {
     return(tidyllm::openai())
   } else if (grepl("^claude-", model, ignore.case = TRUE)) {
     return(tidyllm::claude())
   } else if (grepl("^gemini-", model, ignore.case = TRUE)) {
     return(tidyllm::gemini())
   } else if (grepl("^deepseek", model, ignore.case = TRUE)) {
     return(tidyllm::deepseek())
   }
   # Default fallback
   tidyllm::deepseek()
 }

 # Friendly API key checks per provider family
 ensure_api_key_for_model <- function(model) {
   if (grepl("^gpt-", model, ignore.case = TRUE)) {
     if (!nzchar(Sys.getenv("OPENAI_API_KEY"))) {
       stop("OPENAI_API_KEY is not set. Set it via Sys.setenv('OPENAI_API_KEY'='...') or source your shell env.")
     }
   } else if (grepl("^claude-", model, ignore.case = TRUE)) {
     if (!nzchar(Sys.getenv("ANTHROPIC_API_KEY"))) {
       stop("ANTHROPIC_API_KEY is not set. Set it via Sys.setenv('ANTHROPIC_API_KEY'='...') or source your shell env.")
     }
   } else if (grepl("^gemini-", model, ignore.case = TRUE)) {
     if (!nzchar(Sys.getenv("GOOGLE_API_KEY"))) {
       stop("GOOGLE_API_KEY is not set. Set it via Sys.setenv('GOOGLE_API_KEY'='...') or source your shell env.")
     }
   } else if (grepl("^deepseek", model, ignore.case = TRUE)) {
     if (!nzchar(Sys.getenv("DEEPSEEK_API_KEY"))) {
       stop("DEEPSEEK_API_KEY is not set. Set it via Sys.setenv('DEEPSEEK_API_KEY'='...') or source your shell env.")
     }
   }
   invisible(TRUE)
 }

 # Infer original title and year for a single record via LLM (DeepSeek by default)
 infer_original_single <- function(
     title,
     author = NA_character_,
     subTitle = NA_character_,
     translator = NA_character_,
     publisher = NA_character_,
     pubDate = NA_character_,
     ISBN = NA_character_,
     provider = NULL,
     model = "deepseek-chat",
     temperature = 0.1,
     max_tries = 3,
     timeout = 60) {
   prompt <- build_translation_prompt(
     title = title,
     author = author,
     subTitle = subTitle,
     translator = translator,
     publisher = publisher,
     pubDate = pubDate,
     ISBN = ISBN
   )

   # Resolve provider if not supplied
   if (is.null(provider)) {
     provider <- resolve_provider_for_model(model)
   }

   result <- try(
     {
       tidyllm::llm_message(prompt) |>
         tidyllm::chat(
           .provider = provider,
           .model = model,
           .temperature = temperature,
           .timeout = timeout,
           .max_tries = max_tries
         )
     },
     silent = TRUE
   )

   if (inherits(result, "try-error")) {
     return(tibble::tibble(
       original_title = NA_character_,
       original_publication_year = as.integer(NA),
       orig_confidence = as.numeric(NA)
     ))
   }

   # Prefer JSON parsing via tidyllm helper; falls back to NA on failure
   parsed <- NULL
   suppressWarnings({
     parsed <- tidyllm::get_reply_data(result)
   })

   if (is.null(parsed)) {
     # Try to extract JSON manually as a very last resort
     raw <- tidyllm::get_reply(result)
     json_txt <- stringr::str_extract(raw, "\\{[\\s\\S]*\\}")
     if (!is.na(json_txt)) {
       parsed <- try(jsonlite::fromJSON(json_txt), silent = TRUE)
       if (inherits(parsed, "try-error")) parsed <- NULL
     }
   }

   if (is.null(parsed)) {
     return(tibble::tibble(
       original_title = NA_character_,
       original_publication_year = as.integer(NA),
       orig_confidence = as.numeric(NA)
     ))
   }

   ot <- parsed[["original_title"]]
   oy <- suppressWarnings(as.integer(parsed[["original_publication_year"]]))
   cf <- suppressWarnings(as.numeric(parsed[["confidence"]]))

   if (!is.na(cf)) cf <- max(0, min(1, cf))

   tibble::tibble(
     original_title = dplyr::if_else(is.na(ot) | is.null(ot), NA_character_, as.character(ot)),
     original_publication_year = dplyr::if_else(is.na(oy), as.integer(NA), as.integer(oy)),
     orig_confidence = dplyr::if_else(is.na(cf), as.numeric(NA), as.numeric(cf))
   )
 }

 # Public API: augment an existing metadata df with original title/year (+ optional confidence)
 augment_metadata_with_original <- function(
     df,
     provider = NULL,
     model = "deepseek-chat",
     include_confidence = TRUE,
     temperature = 0.1,
     max_tries = 3,
     timeout = 60,
     pause_seconds = 0,
     check_api_key = TRUE,
     show_progress = TRUE,
     progress_style = 3) {
   stopifnot(is.data.frame(df))

   needed <- c("title", "author", "subTitle", "translator", "publisher", "pubDate", "ISBN")
   missing <- setdiff(needed, names(df))
   if (length(missing) > 0) {
     stop(sprintf("Missing required columns: %s", paste(missing, collapse = ", ")))
   }

   # Friendly pre-check for required API key depending on the chosen model/provider
   if (isTRUE(check_api_key)) {
     ensure_api_key_for_model(model)
   }

   # Resolve provider if not supplied
   if (is.null(provider)) {
     provider <- resolve_provider_for_model(model)
   }

   # Prepare inputs and iterate using purrr::pmap_dfr to avoid NSE warnings
   input_cols <- tibble::tibble(
     title = df[["title"]],
     author = df[["author"]],
     subTitle = df[["subTitle"]],
     translator = df[["translator"]],
     publisher = df[["publisher"]],
     pubDate = df[["pubDate"]],
     ISBN = df[["ISBN"]]
   )

   infer_wrapper <- function(title, author, subTitle, translator, publisher, pubDate, ISBN) {
     res <- infer_original_single(
       title = title,
       author = author,
       subTitle = subTitle,
       translator = translator,
       publisher = publisher,
       pubDate = pubDate,
       ISBN = ISBN,
       provider = provider,
       model = model,
       temperature = temperature,
       max_tries = max_tries,
       timeout = timeout
     )
     if (pause_seconds > 0) Sys.sleep(pause_seconds)
     res
   }

   n <- nrow(input_cols)
   results_list <- vector("list", n)
   empty_res <- tibble::tibble(
     original_title = NA_character_,
     original_publication_year = as.integer(NA),
     orig_confidence = as.numeric(NA)
   )
   pb <- NULL
   if (isTRUE(show_progress) && n > 0) {
     pb <- utils::txtProgressBar(min = 0, max = n, style = progress_style)
   }
   for (i in seq_len(n)) {
     res <- tryCatch(
       infer_wrapper(
         title = input_cols$title[i],
         author = input_cols$author[i],
         subTitle = input_cols$subTitle[i],
         translator = input_cols$translator[i],
         publisher = input_cols$publisher[i],
         pubDate = input_cols$pubDate[i],
         ISBN = input_cols$ISBN[i]
       ),
       error = function(e) empty_res
     )
     if (is.null(res) || !is.data.frame(res) || nrow(res) < 1) {
       res <- empty_res
     } else if (nrow(res) > 1) {
       res <- res[1, , drop = FALSE]
     }
     results_list[[i]] <- res
     if (!is.null(pb)) utils::setTxtProgressBar(pb, i)
   }
   if (!is.null(pb)) close(pb)
   results_tbl <- dplyr::bind_rows(results_list)
   if (nrow(results_tbl) != n) {
     # Ensure row count aligns with df to avoid recycling errors
     if (nrow(results_tbl) < n) {
       pad <- dplyr::bind_rows(rep(list(empty_res), n - nrow(results_tbl)))
       results_tbl <- dplyr::bind_rows(results_tbl, pad)
     } else if (nrow(results_tbl) > n) {
       results_tbl <- dplyr::slice(results_tbl, seq_len(n))
     }
   }

   out <- dplyr::bind_cols(df, results_tbl)
   if (!include_confidence) out <- dplyr::select(out, -"orig_confidence")
   out
 }

 # df_aug <- augment_metadata_with_original(df, model = "gemini-2.5-pro")
 df_aug <- augment_metadata_with_original(df, model = "deepseek")

 #  # OpenAI
 # df_aug <- augment_metadata_with_original(df, model = "gpt-4o-mini")

 #  # Claude
 #df_aug <- augment_metadata_with_original(df, model = "claude-3-5-sonnet-latest")

 #  # Gemini
 #  df_aug <- augment_metadata_with_original(df, model = "gemini-2.5-pro")
	library(RKorAPClient)
	library(tidyverse)
	library(urltools)
	library(tidyllm)

	VC <- "author=/Dickens.*/"
	deliko <- KorAPConnection("https://korap.dnb.de", verbose = TRUE)
	query <- corpusQuery(
	deliko,
	"<base/s=t>",
	# this finds each text once
	vc = VC,
	fields = c(
	"textSigle",
	"title",
	"subTitle",
	"author",
	"translator",
	"textType",
	"textTypeRef",
	"publisher",
	"pubDate",
	"pubPlace",
	"ISBN",
	"URN"
	)
	) %>%
	fetchNext(maxFetch = 2, randomizePageOrder = TRUE)

	df <- query@collectedMatches

	df <- df %>%
	mutate(
	publisher = str_replace_all(publisher, "data:,", ""),
	URN = URLdecode(str_replace_all(URN, ".*,", ""))
	) %>%
	select(-c("matchStart", "matchEnd"))

	# write_tsv(df, "deliko_metadata.tsv")

	# Build a compact prompt requesting strict JSON for translation/original info
	build_translation_prompt <- function(
	title,
	author = NA_character_,
	subTitle = NA_character_,
	translator = NA_character_,
	publisher = NA_character_,
	pubDate = NA_character_,
	ISBN = NA_character_) {
	fields <- c(
	sprintf("Titel: %s", dplyr::coalesce(title, "")),
	sprintf("Untertitel: %s", dplyr::coalesce(subTitle, "")),
	sprintf("Autor/in: %s", dplyr::coalesce(author, "")),
	sprintf("Übersetzer/in: %s", dplyr::coalesce(translator, "")),
	sprintf("Verlag: %s", dplyr::coalesce(publisher, "")),
	sprintf("Erscheinungsjahr (de): %s", dplyr::coalesce(pubDate, "")),
	sprintf("ISBN: %s", dplyr::coalesce(ISBN, ""))
	)
	context <- paste(fields, collapse = "\n")
	paste0(
	"Du bist ein bibliographischer Assistent. Bestimme für das folgende deutschsprachige Buch:",
	"\n1) Ob es eine Übersetzung ist.",
	"\n2) Falls ja: den Originaltitel und das Jahr der Erstveröffentlichung.",
	"\n\nGib ausschließlich ein einzelnes JSON-Objekt zurück – ohne Begleittext – mit exakt diesen Schlüsseln:",
	"\n{\n \"is_translation\": <boolean>,\n \"original_title\": <string\|null>,\n \"original_publication_year\": <integer\|null>,\n \"confidence\": <number zwischen 0 und 1>\n}",
	"\nNutze null für Unbekanntes. Antworte nur mit JSON.",
	"\n\nMetadaten:\n",
	context
	)
	}

	# Resolve tidyllm provider from model prefix
	resolve_provider_for_model <- function(model) {
	if (grepl("^gpt-", model, ignore.case = TRUE)) {
	return(tidyllm::openai())
	} else if (grepl("^claude-", model, ignore.case = TRUE)) {
	return(tidyllm::claude())
	} else if (grepl("^gemini-", model, ignore.case = TRUE)) {
	return(tidyllm::gemini())
	} else if (grepl("^deepseek", model, ignore.case = TRUE)) {
	return(tidyllm::deepseek())
	}
	# Default fallback
	tidyllm::deepseek()
	}

	# Friendly API key checks per provider family
	ensure_api_key_for_model <- function(model) {
	if (grepl("^gpt-", model, ignore.case = TRUE)) {
	if (!nzchar(Sys.getenv("OPENAI_API_KEY"))) {
	stop("OPENAI_API_KEY is not set. Set it via Sys.setenv('OPENAI_API_KEY'='...') or source your shell env.")
	}
	} else if (grepl("^claude-", model, ignore.case = TRUE)) {
	if (!nzchar(Sys.getenv("ANTHROPIC_API_KEY"))) {
	stop("ANTHROPIC_API_KEY is not set. Set it via Sys.setenv('ANTHROPIC_API_KEY'='...') or source your shell env.")
	}
	} else if (grepl("^gemini-", model, ignore.case = TRUE)) {
	if (!nzchar(Sys.getenv("GOOGLE_API_KEY"))) {
	stop("GOOGLE_API_KEY is not set. Set it via Sys.setenv('GOOGLE_API_KEY'='...') or source your shell env.")
	}
	} else if (grepl("^deepseek", model, ignore.case = TRUE)) {
	if (!nzchar(Sys.getenv("DEEPSEEK_API_KEY"))) {
	stop("DEEPSEEK_API_KEY is not set. Set it via Sys.setenv('DEEPSEEK_API_KEY'='...') or source your shell env.")
	}
	}
	invisible(TRUE)
	}

	# Infer original title and year for a single record via LLM (DeepSeek by default)
	infer_original_single <- function(
	title,
	author = NA_character_,
	subTitle = NA_character_,
	translator = NA_character_,
	publisher = NA_character_,
	pubDate = NA_character_,
	ISBN = NA_character_,
	provider = NULL,
	model = "deepseek-chat",
	temperature = 0.1,
	max_tries = 3,
	timeout = 60) {
	prompt <- build_translation_prompt(
	title = title,
	author = author,
	subTitle = subTitle,
	translator = translator,
	publisher = publisher,
	pubDate = pubDate,
	ISBN = ISBN
	)

	# Resolve provider if not supplied
	if (is.null(provider)) {
	provider <- resolve_provider_for_model(model)
	}

	result <- try(
	{
	tidyllm::llm_message(prompt) \|>
	tidyllm::chat(
	.provider = provider,
	.model = model,
	.temperature = temperature,
	.timeout = timeout,
	.max_tries = max_tries
	)
	},
	silent = TRUE
	)

	if (inherits(result, "try-error")) {
	return(tibble::tibble(
	original_title = NA_character_,
	original_publication_year = as.integer(NA),
	orig_confidence = as.numeric(NA)
	))
	}

	# Prefer JSON parsing via tidyllm helper; falls back to NA on failure
	parsed <- NULL
	suppressWarnings({
	parsed <- tidyllm::get_reply_data(result)
	})

	if (is.null(parsed)) {
	# Try to extract JSON manually as a very last resort
	raw <- tidyllm::get_reply(result)
	json_txt <- stringr::str_extract(raw, "\\{[\\s\\S]*\\}")
	if (!is.na(json_txt)) {
	parsed <- try(jsonlite::fromJSON(json_txt), silent = TRUE)
	if (inherits(parsed, "try-error")) parsed <- NULL
	}
	}

	if (is.null(parsed)) {
	return(tibble::tibble(
	original_title = NA_character_,
	original_publication_year = as.integer(NA),
	orig_confidence = as.numeric(NA)
	))
	}

	ot <- parsed[["original_title"]]
	oy <- suppressWarnings(as.integer(parsed[["original_publication_year"]]))
	cf <- suppressWarnings(as.numeric(parsed[["confidence"]]))

	if (!is.na(cf)) cf <- max(0, min(1, cf))

	tibble::tibble(
	original_title = dplyr::if_else(is.na(ot) \| is.null(ot), NA_character_, as.character(ot)),
	original_publication_year = dplyr::if_else(is.na(oy), as.integer(NA), as.integer(oy)),
	orig_confidence = dplyr::if_else(is.na(cf), as.numeric(NA), as.numeric(cf))
	)
	}

	# Public API: augment an existing metadata df with original title/year (+ optional confidence)
	augment_metadata_with_original <- function(
	df,
	provider = NULL,
	model = "deepseek-chat",
	include_confidence = TRUE,
	temperature = 0.1,
	max_tries = 3,
	timeout = 60,
	pause_seconds = 0,
	check_api_key = TRUE,
	show_progress = TRUE,
	progress_style = 3) {
	stopifnot(is.data.frame(df))

	needed <- c("title", "author", "subTitle", "translator", "publisher", "pubDate", "ISBN")
	missing <- setdiff(needed, names(df))
	if (length(missing) > 0) {
	stop(sprintf("Missing required columns: %s", paste(missing, collapse = ", ")))
	}

	# Friendly pre-check for required API key depending on the chosen model/provider
	if (isTRUE(check_api_key)) {
	ensure_api_key_for_model(model)
	}

	# Resolve provider if not supplied
	if (is.null(provider)) {
	provider <- resolve_provider_for_model(model)
	}

	# Prepare inputs and iterate using purrr::pmap_dfr to avoid NSE warnings
	input_cols <- tibble::tibble(
	title = df[["title"]],
	author = df[["author"]],
	subTitle = df[["subTitle"]],
	translator = df[["translator"]],
	publisher = df[["publisher"]],
	pubDate = df[["pubDate"]],
	ISBN = df[["ISBN"]]
	)

	infer_wrapper <- function(title, author, subTitle, translator, publisher, pubDate, ISBN) {
	res <- infer_original_single(
	title = title,
	author = author,
	subTitle = subTitle,
	translator = translator,
	publisher = publisher,
	pubDate = pubDate,
	ISBN = ISBN,
	provider = provider,
	model = model,
	temperature = temperature,
	max_tries = max_tries,
	timeout = timeout
	)
	if (pause_seconds > 0) Sys.sleep(pause_seconds)
	res
	}

	n <- nrow(input_cols)
	results_list <- vector("list", n)
	empty_res <- tibble::tibble(
	original_title = NA_character_,
	original_publication_year = as.integer(NA),
	orig_confidence = as.numeric(NA)
	)
	pb <- NULL
	if (isTRUE(show_progress) && n > 0) {
	pb <- utils::txtProgressBar(min = 0, max = n, style = progress_style)
	}
	for (i in seq_len(n)) {
	res <- tryCatch(
	infer_wrapper(
	title = input_cols$title[i],
	author = input_cols$author[i],
	subTitle = input_cols$subTitle[i],
	translator = input_cols$translator[i],
	publisher = input_cols$publisher[i],
	pubDate = input_cols$pubDate[i],
	ISBN = input_cols$ISBN[i]
	),
	error = function(e) empty_res
	)
	if (is.null(res) \|\| !is.data.frame(res) \|\| nrow(res) < 1) {
	res <- empty_res
	} else if (nrow(res) > 1) {
	res <- res[1, , drop = FALSE]
	}
	results_list[[i]] <- res
	if (!is.null(pb)) utils::setTxtProgressBar(pb, i)
	}
	if (!is.null(pb)) close(pb)
	results_tbl <- dplyr::bind_rows(results_list)
	if (nrow(results_tbl) != n) {
	# Ensure row count aligns with df to avoid recycling errors
	if (nrow(results_tbl) < n) {
	pad <- dplyr::bind_rows(rep(list(empty_res), n - nrow(results_tbl)))
	results_tbl <- dplyr::bind_rows(results_tbl, pad)
	} else if (nrow(results_tbl) > n) {
	results_tbl <- dplyr::slice(results_tbl, seq_len(n))
	}
	}

	out <- dplyr::bind_cols(df, results_tbl)
	if (!include_confidence) out <- dplyr::select(out, -"orig_confidence")
	out
	}

	# df_aug <- augment_metadata_with_original(df, model = "gemini-2.5-pro")
	df_aug <- augment_metadata_with_original(df, model = "deepseek")

	# # OpenAI
	# df_aug <- augment_metadata_with_original(df, model = "gpt-4o-mini")

	# # Claude
	#df_aug <- augment_metadata_with_original(df, model = "claude-3-5-sonnet-latest")

	# # Gemini
	# df_aug <- augment_metadata_with_original(df, model = "gemini-2.5-pro")