Blame - scripts/originTraceR.R - KorAP/deliko

blob: 06ca031bd0be21210a2560b277b74ded2c4db7a1 [file] [log] [blame]

Marc Kupietz	43bb93f	2025-09-25 20:59:14 +0200	[diff] [blame]	1	library(RKorAPClient)
				2	library(tidyverse)
				3	library(urltools)
				4	library(tidyllm)
				5
				6	VC <- "author=/Dickens.*/"
				7	deliko <- KorAPConnection("https://korap.dnb.de", verbose = TRUE)
				8	query <- corpusQuery(
				9	deliko,
				10	"<base/s=t>",
				11	# this finds each text once
				12	vc = VC,
				13	fields = c(
				14	"textSigle",
				15	"title",
				16	"subTitle",
				17	"author",
				18	"translator",
				19	"textType",
				20	"textTypeRef",
				21	"publisher",
				22	"pubDate",
				23	"pubPlace",
				24	"ISBN",
				25	"URN"
				26	)
				27	) %>%
				28	fetchNext(maxFetch = 2, randomizePageOrder = TRUE)
				29
				30	df <- query@collectedMatches
				31
				32	df <- df %>%
				33	mutate(
				34	publisher = str_replace_all(publisher, "data:,", ""),
				35	URN = URLdecode(str_replace_all(URN, ".*,", ""))
				36	) %>%
				37	select(-c("matchStart", "matchEnd"))
				38
				39	# write_tsv(df, "deliko_metadata.tsv")
				40
				41	# Build a compact prompt requesting strict JSON for translation/original info
				42	build_translation_prompt <- function(
				43	title,
				44	author = NA_character_,
				45	subTitle = NA_character_,
				46	translator = NA_character_,
				47	publisher = NA_character_,
				48	pubDate = NA_character_,
Marc Kupietz	1bfd830	2025-09-25 21:14:45 +0200	[diff] [blame]	49	ISBN = NA_character_,
				50	prompt_language = c("en", "de")) {
				51	prompt_language <- match.arg(prompt_language)
				52	if (prompt_language == "de") {
				53	fields <- c(
				54	sprintf("Titel: %s", dplyr::coalesce(title, "")),
				55	sprintf("Untertitel: %s", dplyr::coalesce(subTitle, "")),
				56	sprintf("Autor/in: %s", dplyr::coalesce(author, "")),
				57	sprintf("Übersetzer/in: %s", dplyr::coalesce(translator, "")),
				58	sprintf("Verlag: %s", dplyr::coalesce(publisher, "")),
				59	sprintf("Erscheinungsjahr (de): %s", dplyr::coalesce(pubDate, "")),
				60	sprintf("ISBN: %s", dplyr::coalesce(ISBN, ""))
				61	)
				62	context <- paste(fields, collapse = "\n")
				63	paste0(
				64	"Du bist ein bibliographischer Assistent. Bestimme für das folgende deutschsprachige Buch:",
				65	"\n1) Ob es eine Übersetzung ist.",
				66	"\n2) Falls ja: den Originaltitel und das Jahr der Erstveröffentlichung.",
				67	"\n\nGib ausschließlich ein einzelnes JSON-Objekt zurück – ohne Begleittext – mit exakt diesen Schlüsseln:",
				68	"\n{\n \"is_translation\": <boolean>,\n \"original_title\": <string\|null>,\n \"original_publication_year\": <integer\|null>,\n \"confidence\": <number zwischen 0 und 1>\n}",
				69	"\nNutze null für Unbekanntes. Antworte nur mit JSON.",
				70	"\n\nMetadaten:\n",
				71	context
				72	)
				73	} else {
				74	fields <- c(
				75	sprintf("Title: %s", dplyr::coalesce(title, "")),
				76	sprintf("Subtitle: %s", dplyr::coalesce(subTitle, "")),
				77	sprintf("Author: %s", dplyr::coalesce(author, "")),
				78	sprintf("Translator: %s", dplyr::coalesce(translator, "")),
				79	sprintf("Publisher: %s", dplyr::coalesce(publisher, "")),
				80	sprintf("Publication year (German edition): %s", dplyr::coalesce(pubDate, "")),
				81	sprintf("ISBN: %s", dplyr::coalesce(ISBN, ""))
				82	)
				83	context <- paste(fields, collapse = "\n")
				84	paste0(
				85	"You are a bibliographic assistant. For the following German-language book, determine:",
				86	"\n1) Whether it is a translation.",
				87	"\n2) If yes: the original title and the year of first publication.",
				88	"\n\nReturn a single JSON object only — no prose — with exactly these keys:",
				89	"\n{\n \"is_translation\": <boolean>,\n \"original_title\": <string\|null>,\n \"original_publication_year\": <integer\|null>,\n \"confidence\": <number between 0 and 1>\n}",
				90	"\nUse null for unknown. Output JSON only.",
				91	"\n\nMetadata:\n",
				92	context
				93	)
				94	}
Marc Kupietz	43bb93f	2025-09-25 20:59:14 +0200	[diff] [blame]	95	}
				96
				97	# Resolve tidyllm provider from model prefix
				98	resolve_provider_for_model <- function(model) {
				99	if (grepl("^gpt-", model, ignore.case = TRUE)) {
				100	return(tidyllm::openai())
				101	} else if (grepl("^claude-", model, ignore.case = TRUE)) {
				102	return(tidyllm::claude())
				103	} else if (grepl("^gemini-", model, ignore.case = TRUE)) {
				104	return(tidyllm::gemini())
				105	} else if (grepl("^deepseek", model, ignore.case = TRUE)) {
				106	return(tidyllm::deepseek())
				107	}
				108	# Default fallback
				109	tidyllm::deepseek()
				110	}
				111
				112	# Friendly API key checks per provider family
				113	ensure_api_key_for_model <- function(model) {
				114	if (grepl("^gpt-", model, ignore.case = TRUE)) {
				115	if (!nzchar(Sys.getenv("OPENAI_API_KEY"))) {
				116	stop("OPENAI_API_KEY is not set. Set it via Sys.setenv('OPENAI_API_KEY'='...') or source your shell env.")
				117	}
				118	} else if (grepl("^claude-", model, ignore.case = TRUE)) {
				119	if (!nzchar(Sys.getenv("ANTHROPIC_API_KEY"))) {
				120	stop("ANTHROPIC_API_KEY is not set. Set it via Sys.setenv('ANTHROPIC_API_KEY'='...') or source your shell env.")
				121	}
				122	} else if (grepl("^gemini-", model, ignore.case = TRUE)) {
				123	if (!nzchar(Sys.getenv("GOOGLE_API_KEY"))) {
				124	stop("GOOGLE_API_KEY is not set. Set it via Sys.setenv('GOOGLE_API_KEY'='...') or source your shell env.")
				125	}
				126	} else if (grepl("^deepseek", model, ignore.case = TRUE)) {
				127	if (!nzchar(Sys.getenv("DEEPSEEK_API_KEY"))) {
				128	stop("DEEPSEEK_API_KEY is not set. Set it via Sys.setenv('DEEPSEEK_API_KEY'='...') or source your shell env.")
				129	}
				130	}
				131	invisible(TRUE)
				132	}
				133
				134	# Infer original title and year for a single record via LLM (DeepSeek by default)
				135	infer_original_single <- function(
				136	title,
				137	author = NA_character_,
				138	subTitle = NA_character_,
				139	translator = NA_character_,
				140	publisher = NA_character_,
				141	pubDate = NA_character_,
				142	ISBN = NA_character_,
				143	provider = NULL,
				144	model = "deepseek-chat",
				145	temperature = 0.1,
				146	max_tries = 3,
				147	timeout = 60) {
				148	prompt <- build_translation_prompt(
				149	title = title,
				150	author = author,
				151	subTitle = subTitle,
				152	translator = translator,
				153	publisher = publisher,
				154	pubDate = pubDate,
Marc Kupietz	1bfd830	2025-09-25 21:14:45 +0200	[diff] [blame]	155	ISBN = ISBN,
				156	prompt_language = "en"
Marc Kupietz	43bb93f	2025-09-25 20:59:14 +0200	[diff] [blame]	157	)
				158
				159	# Resolve provider if not supplied
				160	if (is.null(provider)) {
				161	provider <- resolve_provider_for_model(model)
				162	}
				163
Marc Kupietz	1bfd830	2025-09-25 21:14:45 +0200	[diff] [blame]	164	result <- NULL
				165	chat_err <- NULL
				166	tryCatch(
Marc Kupietz	43bb93f	2025-09-25 20:59:14 +0200	[diff] [blame]	167	{
Marc Kupietz	1bfd830	2025-09-25 21:14:45 +0200	[diff] [blame]	168	result <- tidyllm::llm_message(prompt) \|>
Marc Kupietz	43bb93f	2025-09-25 20:59:14 +0200	[diff] [blame]	169	tidyllm::chat(
				170	.provider = provider,
				171	.model = model,
				172	.temperature = temperature,
				173	.timeout = timeout,
				174	.max_tries = max_tries
				175	)
				176	},
Marc Kupietz	1bfd830	2025-09-25 21:14:45 +0200	[diff] [blame]	177	error = function(e) {
				178	chat_err <<- conditionMessage(e)
				179	}
Marc Kupietz	43bb93f	2025-09-25 20:59:14 +0200	[diff] [blame]	180	)
				181
Marc Kupietz	1bfd830	2025-09-25 21:14:45 +0200	[diff] [blame]	182	if (!is.null(chat_err)) {
				183	message(sprintf(
				184	"LLM request failed (model=%s, title=\"%s\", author=\"%s\"): %s",
				185	model, as.character(title), as.character(author), chat_err
				186	))
Marc Kupietz	43bb93f	2025-09-25 20:59:14 +0200	[diff] [blame]	187	return(tibble::tibble(
				188	original_title = NA_character_,
				189	original_publication_year = as.integer(NA),
				190	orig_confidence = as.numeric(NA)
				191	))
				192	}
				193
				194	# Prefer JSON parsing via tidyllm helper; falls back to NA on failure
				195	parsed <- NULL
				196	suppressWarnings({
				197	parsed <- tidyllm::get_reply_data(result)
				198	})
				199
				200	if (is.null(parsed)) {
				201	# Try to extract JSON manually as a very last resort
				202	raw <- tidyllm::get_reply(result)
				203	json_txt <- stringr::str_extract(raw, "\\{[\\s\\S]*\\}")
				204	if (!is.na(json_txt)) {
				205	parsed <- try(jsonlite::fromJSON(json_txt), silent = TRUE)
				206	if (inherits(parsed, "try-error")) parsed <- NULL
				207	}
				208	}
				209
				210	if (is.null(parsed)) {
				211	return(tibble::tibble(
				212	original_title = NA_character_,
				213	original_publication_year = as.integer(NA),
				214	orig_confidence = as.numeric(NA)
				215	))
				216	}
				217
				218	ot <- parsed[["original_title"]]
				219	oy <- suppressWarnings(as.integer(parsed[["original_publication_year"]]))
				220	cf <- suppressWarnings(as.numeric(parsed[["confidence"]]))
				221
				222	if (!is.na(cf)) cf <- max(0, min(1, cf))
				223
				224	tibble::tibble(
				225	original_title = dplyr::if_else(is.na(ot) \| is.null(ot), NA_character_, as.character(ot)),
				226	original_publication_year = dplyr::if_else(is.na(oy), as.integer(NA), as.integer(oy)),
				227	orig_confidence = dplyr::if_else(is.na(cf), as.numeric(NA), as.numeric(cf))
				228	)
				229	}
				230
				231	# Public API: augment an existing metadata df with original title/year (+ optional confidence)
				232	augment_metadata_with_original <- function(
				233	df,
				234	provider = NULL,
Marc Kupietz	1bfd830	2025-09-25 21:14:45 +0200	[diff] [blame]	235	model = "gemini-2.5-pro",
Marc Kupietz	43bb93f	2025-09-25 20:59:14 +0200	[diff] [blame]	236	include_confidence = TRUE,
				237	temperature = 0.1,
				238	max_tries = 3,
				239	timeout = 60,
				240	pause_seconds = 0,
				241	check_api_key = TRUE,
				242	show_progress = TRUE,
				243	progress_style = 3) {
				244	stopifnot(is.data.frame(df))
				245
				246	needed <- c("title", "author", "subTitle", "translator", "publisher", "pubDate", "ISBN")
				247	missing <- setdiff(needed, names(df))
				248	if (length(missing) > 0) {
				249	stop(sprintf("Missing required columns: %s", paste(missing, collapse = ", ")))
				250	}
				251
				252	# Friendly pre-check for required API key depending on the chosen model/provider
				253	if (isTRUE(check_api_key)) {
				254	ensure_api_key_for_model(model)
				255	}
				256
				257	# Resolve provider if not supplied
				258	if (is.null(provider)) {
				259	provider <- resolve_provider_for_model(model)
				260	}
				261
				262	# Prepare inputs and iterate using purrr::pmap_dfr to avoid NSE warnings
				263	input_cols <- tibble::tibble(
				264	title = df[["title"]],
				265	author = df[["author"]],
				266	subTitle = df[["subTitle"]],
				267	translator = df[["translator"]],
				268	publisher = df[["publisher"]],
				269	pubDate = df[["pubDate"]],
				270	ISBN = df[["ISBN"]]
				271	)
				272
				273	infer_wrapper <- function(title, author, subTitle, translator, publisher, pubDate, ISBN) {
				274	res <- infer_original_single(
				275	title = title,
				276	author = author,
				277	subTitle = subTitle,
				278	translator = translator,
				279	publisher = publisher,
				280	pubDate = pubDate,
				281	ISBN = ISBN,
				282	provider = provider,
				283	model = model,
				284	temperature = temperature,
				285	max_tries = max_tries,
				286	timeout = timeout
				287	)
				288	if (pause_seconds > 0) Sys.sleep(pause_seconds)
				289	res
				290	}
				291
				292	n <- nrow(input_cols)
				293	results_list <- vector("list", n)
				294	empty_res <- tibble::tibble(
				295	original_title = NA_character_,
				296	original_publication_year = as.integer(NA),
				297	orig_confidence = as.numeric(NA)
				298	)
				299	pb <- NULL
				300	if (isTRUE(show_progress) && n > 0) {
				301	pb <- utils::txtProgressBar(min = 0, max = n, style = progress_style)
				302	}
				303	for (i in seq_len(n)) {
				304	res <- tryCatch(
				305	infer_wrapper(
				306	title = input_cols$title[i],
				307	author = input_cols$author[i],
				308	subTitle = input_cols$subTitle[i],
				309	translator = input_cols$translator[i],
				310	publisher = input_cols$publisher[i],
				311	pubDate = input_cols$pubDate[i],
				312	ISBN = input_cols$ISBN[i]
				313	),
Marc Kupietz	1bfd830	2025-09-25 21:14:45 +0200	[diff] [blame]	314	error = function(e) {
				315	message(sprintf(
				316	"Row %d failed (model=%s, title=\"%s\", author=\"%s\"): %s",
				317	i, model, as.character(input_cols$title[i]), as.character(input_cols$author[i]), conditionMessage(e)
				318	))
				319	empty_res
				320	}
Marc Kupietz	43bb93f	2025-09-25 20:59:14 +0200	[diff] [blame]	321	)
				322	if (is.null(res) \|\| !is.data.frame(res) \|\| nrow(res) < 1) {
				323	res <- empty_res
				324	} else if (nrow(res) > 1) {
				325	res <- res[1, , drop = FALSE]
				326	}
				327	results_list[[i]] <- res
				328	if (!is.null(pb)) utils::setTxtProgressBar(pb, i)
				329	}
				330	if (!is.null(pb)) close(pb)
				331	results_tbl <- dplyr::bind_rows(results_list)
				332	if (nrow(results_tbl) != n) {
				333	# Ensure row count aligns with df to avoid recycling errors
				334	if (nrow(results_tbl) < n) {
				335	pad <- dplyr::bind_rows(rep(list(empty_res), n - nrow(results_tbl)))
				336	results_tbl <- dplyr::bind_rows(results_tbl, pad)
				337	} else if (nrow(results_tbl) > n) {
				338	results_tbl <- dplyr::slice(results_tbl, seq_len(n))
				339	}
				340	}
				341
				342	out <- dplyr::bind_cols(df, results_tbl)
				343	if (!include_confidence) out <- dplyr::select(out, -"orig_confidence")
				344	out
				345	}
				346
				347	# df_aug <- augment_metadata_with_original(df, model = "gemini-2.5-pro")
Marc Kupietz	1bfd830	2025-09-25 21:14:45 +0200	[diff] [blame]	348	df_aug <- augment_metadata_with_original(df, model = "deepseek-chat")
Marc Kupietz	43bb93f	2025-09-25 20:59:14 +0200	[diff] [blame]	349
				350	# # OpenAI
				351	# df_aug <- augment_metadata_with_original(df, model = "gpt-4o-mini")
				352
				353	# # Claude
Marc Kupietz	1bfd830	2025-09-25 21:14:45 +0200	[diff] [blame]	354	# df_aug <- augment_metadata_with_original(df, model = "claude-3-5-sonnet-latest")
Marc Kupietz	43bb93f	2025-09-25 20:59:14 +0200	[diff] [blame]	355
				356	# # Gemini
				357	# df_aug <- augment_metadata_with_original(df, model = "gemini-2.5-pro")