library(RKorAPClient)
library(tidyverse)
library(urltools)
library(tidyllm)

VC <- "author=/Dickens.*/"
deliko <- KorAPConnection("https://korap.dnb.de", verbose = TRUE)
query <- corpusQuery(
  deliko,
  "<base/s=t>",
  # this finds each text once
  vc = VC,
  fields = c(
    "textSigle",
    "title",
    "subTitle",
    "author",
    "translator",
    "textType",
    "textTypeRef",
    "publisher",
    "pubDate",
    "pubPlace",
    "ISBN",
    "URN"
  )
) %>%
  fetchNext(maxFetch = 2, randomizePageOrder = TRUE)
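
# Optional sanity check (assumes the standard KorAPQuery slots): compare how many
# texts matched the virtual corpus with how many were actually fetched above.
# query@totalResults
# nrow(query@collectedMatches)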

df <- query@collectedMatches

df <- df %>%
  mutate(
    publisher = str_replace_all(publisher, "data:,", ""),
    URN = URLdecode(str_replace_all(URN, ".*,", ""))
  ) %>%
  select(-c("matchStart", "matchEnd"))

# write_tsv(df, "deliko_metadata.tsv")

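# Optional: quickly inspect the cleaned metadata before sending anything to an LLM.
# dplyr::glimpse(df)
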
# Build a compact prompt requesting strict JSON for translation/original info
build_translation_prompt <- function(
  title,
  author = NA_character_,
  subTitle = NA_character_,
  translator = NA_character_,
  publisher = NA_character_,
  pubDate = NA_character_,
  ISBN = NA_character_) {
  fields <- c(
    sprintf("Titel: %s", dplyr::coalesce(title, "")),
    sprintf("Untertitel: %s", dplyr::coalesce(subTitle, "")),
    sprintf("Autor/in: %s", dplyr::coalesce(author, "")),
    sprintf("Übersetzer/in: %s", dplyr::coalesce(translator, "")),
    sprintf("Verlag: %s", dplyr::coalesce(publisher, "")),
    sprintf("Erscheinungsjahr (de): %s", dplyr::coalesce(pubDate, "")),
    sprintf("ISBN: %s", dplyr::coalesce(ISBN, ""))
  )
  context <- paste(fields, collapse = "\n")
  paste0(
    "Du bist ein bibliographischer Assistent. Bestimme für das folgende deutschsprachige Buch:",
    "\n1) Ob es eine Übersetzung ist.",
    "\n2) Falls ja: den Originaltitel und das Jahr der Erstveröffentlichung.",
    "\n\nGib ausschließlich ein einzelnes JSON-Objekt zurück – ohne Begleittext – mit exakt diesen Schlüsseln:",
    "\n{\n \"is_translation\": <boolean>,\n \"original_title\": <string|null>,\n \"original_publication_year\": <integer|null>,\n \"confidence\": <number zwischen 0 und 1>\n}",
    "\nNutze null für Unbekanntes. Antworte nur mit JSON.",
    "\n\nMetadaten:\n",
    context
  )
}
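
# Optional preview of the generated prompt, using the first fetched record
# (purely illustrative; no API call is made):
# cat(build_translation_prompt(
#   title = df$title[1],
#   author = df$author[1],
#   subTitle = df$subTitle[1],
#   translator = df$translator[1],
#   publisher = df$publisher[1],
#   pubDate = df$pubDate[1],
#   ISBN = df$ISBN[1]
# ))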

# Resolve tidyllm provider from model prefix
resolve_provider_for_model <- function(model) {
  if (grepl("^gpt-", model, ignore.case = TRUE)) {
    return(tidyllm::openai())
  } else if (grepl("^claude-", model, ignore.case = TRUE)) {
    return(tidyllm::claude())
  } else if (grepl("^gemini-", model, ignore.case = TRUE)) {
    return(tidyllm::gemini())
  } else if (grepl("^deepseek", model, ignore.case = TRUE)) {
    return(tidyllm::deepseek())
  }
  # Default fallback
  tidyllm::deepseek()
}
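
# Dispatch is purely prefix-based and case-insensitive, e.g.:
# resolve_provider_for_model("gpt-4o-mini")   # -> tidyllm::openai()
# resolve_provider_for_model("deepseek-chat") # -> tidyllm::deepseek()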

# Friendly API key checks per provider family
ensure_api_key_for_model <- function(model) {
  if (grepl("^gpt-", model, ignore.case = TRUE)) {
    if (!nzchar(Sys.getenv("OPENAI_API_KEY"))) {
      stop("OPENAI_API_KEY is not set. Set it via Sys.setenv('OPENAI_API_KEY'='...') or source your shell env.")
    }
  } else if (grepl("^claude-", model, ignore.case = TRUE)) {
    if (!nzchar(Sys.getenv("ANTHROPIC_API_KEY"))) {
      stop("ANTHROPIC_API_KEY is not set. Set it via Sys.setenv('ANTHROPIC_API_KEY'='...') or source your shell env.")
    }
  } else if (grepl("^gemini-", model, ignore.case = TRUE)) {
    if (!nzchar(Sys.getenv("GOOGLE_API_KEY"))) {
      stop("GOOGLE_API_KEY is not set. Set it via Sys.setenv('GOOGLE_API_KEY'='...') or source your shell env.")
    }
  } else if (grepl("^deepseek", model, ignore.case = TRUE)) {
    if (!nzchar(Sys.getenv("DEEPSEEK_API_KEY"))) {
      stop("DEEPSEEK_API_KEY is not set. Set it via Sys.setenv('DEEPSEEK_API_KEY'='...') or source your shell env.")
    }
  }
  invisible(TRUE)
}
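
# Set the key for the provider you intend to use before running the augmentation,
# e.g. (placeholder value):
# Sys.setenv(DEEPSEEK_API_KEY = "...")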

# Infer original title and year for a single record via LLM (DeepSeek by default)
infer_original_single <- function(
  title,
  author = NA_character_,
  subTitle = NA_character_,
  translator = NA_character_,
  publisher = NA_character_,
  pubDate = NA_character_,
  ISBN = NA_character_,
  provider = NULL,
  model = "deepseek-chat",
  temperature = 0.1,
  max_tries = 3,
  timeout = 60) {
  prompt <- build_translation_prompt(
    title = title,
    author = author,
    subTitle = subTitle,
    translator = translator,
    publisher = publisher,
    pubDate = pubDate,
    ISBN = ISBN
  )

  # Resolve provider if not supplied
  if (is.null(provider)) {
    provider <- resolve_provider_for_model(model)
  }

  result <- try(
    {
      tidyllm::llm_message(prompt) |>
        tidyllm::chat(
          .provider = provider,
          .model = model,
          .temperature = temperature,
          .timeout = timeout,
          .max_tries = max_tries
        )
    },
    silent = TRUE
  )

  if (inherits(result, "try-error")) {
    return(tibble::tibble(
      original_title = NA_character_,
      original_publication_year = NA_integer_,
      orig_confidence = NA_real_
    ))
  }

  # Prefer JSON parsing via the tidyllm helper; fall back to NA on failure
  parsed <- NULL
  suppressWarnings({
    parsed <- tidyllm::get_reply_data(result)
  })

  if (is.null(parsed)) {
    # Try to extract JSON manually as a very last resort
    raw <- tidyllm::get_reply(result)
    json_txt <- stringr::str_extract(raw, "\\{[\\s\\S]*\\}")
    if (!is.na(json_txt)) {
      parsed <- try(jsonlite::fromJSON(json_txt), silent = TRUE)
      if (inherits(parsed, "try-error")) parsed <- NULL
    }
  }

  if (is.null(parsed)) {
    return(tibble::tibble(
      original_title = NA_character_,
      original_publication_year = NA_integer_,
      orig_confidence = NA_real_
    ))
  }

  ot <- parsed[["original_title"]]
  oy <- suppressWarnings(as.integer(parsed[["original_publication_year"]]))
  cf <- suppressWarnings(as.numeric(parsed[["confidence"]]))

  # Guard against missing keys: parsed[["..."]] can be NULL or length zero,
  # which would break the is.na() checks and yield zero-row tibbles
  ot <- if (is.null(ot) || length(ot) == 0) NA_character_ else as.character(ot[[1]])
  oy <- if (length(oy) == 0) NA_integer_ else oy[[1]]
  cf <- if (length(cf) == 0) NA_real_ else cf[[1]]

  # Clamp confidence to [0, 1]
  if (!is.na(cf)) cf <- max(0, min(1, cf))

  tibble::tibble(
    original_title = ot,
    original_publication_year = oy,
    orig_confidence = cf
  )
}
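
# Single-record smoke test (makes one LLM API call; illustrative only):
# infer_original_single(
#   title = df$title[1],
#   author = df$author[1],
#   translator = df$translator[1],
#   pubDate = df$pubDate[1]
# )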

# Public API: augment an existing metadata df with original title/year (+ optional confidence)
augment_metadata_with_original <- function(
  df,
  provider = NULL,
  model = "deepseek-chat",
  include_confidence = TRUE,
  temperature = 0.1,
  max_tries = 3,
  timeout = 60,
  pause_seconds = 0,
  check_api_key = TRUE,
  show_progress = TRUE,
  progress_style = 3) {
  stopifnot(is.data.frame(df))

  needed <- c("title", "author", "subTitle", "translator", "publisher", "pubDate", "ISBN")
  missing <- setdiff(needed, names(df))
  if (length(missing) > 0) {
    stop(sprintf("Missing required columns: %s", paste(missing, collapse = ", ")))
  }

  # Friendly pre-check for required API key depending on the chosen model/provider
  if (isTRUE(check_api_key)) {
    ensure_api_key_for_model(model)
  }

  # Resolve provider if not supplied
  if (is.null(provider)) {
    provider <- resolve_provider_for_model(model)
  }

  # Prepare the per-row inputs; rows are processed one at a time in the explicit
  # loop below (rather than purrr::pmap_dfr), which keeps the progress bar and
  # optional pauses between API calls simple and avoids NSE warnings
  input_cols <- tibble::tibble(
    title = df[["title"]],
    author = df[["author"]],
    subTitle = df[["subTitle"]],
    translator = df[["translator"]],
    publisher = df[["publisher"]],
    pubDate = df[["pubDate"]],
    ISBN = df[["ISBN"]]
  )

  infer_wrapper <- function(title, author, subTitle, translator, publisher, pubDate, ISBN) {
    res <- infer_original_single(
      title = title,
      author = author,
      subTitle = subTitle,
      translator = translator,
      publisher = publisher,
      pubDate = pubDate,
      ISBN = ISBN,
      provider = provider,
      model = model,
      temperature = temperature,
      max_tries = max_tries,
      timeout = timeout
    )
    if (pause_seconds > 0) Sys.sleep(pause_seconds)
    res
  }

  n <- nrow(input_cols)
  results_list <- vector("list", n)
  empty_res <- tibble::tibble(
    original_title = NA_character_,
    original_publication_year = NA_integer_,
    orig_confidence = NA_real_
  )
  pb <- NULL
  if (isTRUE(show_progress) && n > 0) {
    pb <- utils::txtProgressBar(min = 0, max = n, style = progress_style)
  }
  for (i in seq_len(n)) {
    res <- tryCatch(
      infer_wrapper(
        title = input_cols$title[i],
        author = input_cols$author[i],
        subTitle = input_cols$subTitle[i],
        translator = input_cols$translator[i],
        publisher = input_cols$publisher[i],
        pubDate = input_cols$pubDate[i],
        ISBN = input_cols$ISBN[i]
      ),
      error = function(e) empty_res
    )
    if (is.null(res) || !is.data.frame(res) || nrow(res) < 1) {
      res <- empty_res
    } else if (nrow(res) > 1) {
      res <- res[1, , drop = FALSE]
    }
    results_list[[i]] <- res
    if (!is.null(pb)) utils::setTxtProgressBar(pb, i)
  }
  if (!is.null(pb)) close(pb)
  results_tbl <- dplyr::bind_rows(results_list)
  if (nrow(results_tbl) != n) {
    # Ensure row count aligns with df to avoid recycling errors
    if (nrow(results_tbl) < n) {
      pad <- dplyr::bind_rows(rep(list(empty_res), n - nrow(results_tbl)))
      results_tbl <- dplyr::bind_rows(results_tbl, pad)
    } else if (nrow(results_tbl) > n) {
      results_tbl <- dplyr::slice(results_tbl, seq_len(n))
    }
  }

  out <- dplyr::bind_cols(df, results_tbl)
  if (!include_confidence) out <- dplyr::select(out, -"orig_confidence")
  out
}

# DeepSeek (default)
df_aug <- augment_metadata_with_original(df, model = "deepseek-chat")

# OpenAI
# df_aug <- augment_metadata_with_original(df, model = "gpt-4o-mini")

# Claude
# df_aug <- augment_metadata_with_original(df, model = "claude-3-5-sonnet-latest")

# Gemini
# df_aug <- augment_metadata_with_original(df, model = "gemini-2.5-pro")
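
# Optionally persist the augmented table next to the raw metadata export above
# (the file name is just a suggestion):
# write_tsv(df_aug, "deliko_metadata_with_originals.tsv")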