Add show_prompts option; change default seed and prompt_source; keep punctuation in translator consensus
diff --git a/scripts/sample_origin_trace_duckdb.R b/scripts/sample_origin_trace_duckdb.R
index 29d588a..7223036 100755
--- a/scripts/sample_origin_trace_duckdb.R
+++ b/scripts/sample_origin_trace_duckdb.R
@@ -17,10 +17,11 @@
id_col = "ISBN", # stable identifier per record
dc_col = "dnb_dc", # column with DC XML
sample_size = "10", # total sample size (default)
- seed = "42", # RNG seed for reproducibility
+ seed = "41", # RNG seed for reproducibility
out = "sample_origin_trace.tsv", # output TSV path
include_consensus = "true", # whether to compute consensus columns
- prompt_source = "text" # text | xml
+ prompt_source = "xml", # text | xml
+ show_prompts = "true" # whether to print prompts to console
)
# Parse key=value CLI args
@@ -42,6 +43,7 @@
opts$out <- Sys.getenv("DELIKO_SAMPLE_OUT", opts$out)
opts$include_consensus <- Sys.getenv("DELIKO_INCLUDE_CONSENSUS", opts$include_consensus)
opts$prompt_source <- Sys.getenv("DELIKO_PROMPT_SOURCE", opts$prompt_source)
+opts$show_prompts <- Sys.getenv("DELIKO_SHOW_PROMPTS", opts$show_prompts)
# Coerce and validate
sample_size <- suppressWarnings(as.integer(opts$sample_size))
@@ -52,6 +54,7 @@
prompt_source <- tolower(opts$prompt_source)
if (!(prompt_source %in% c("text", "xml"))) prompt_source <- "text"
message("Prompt source: ", prompt_source)
+show_prompts <- tolower(opts$show_prompts) %in% c("true", "1", "yes", "y")
if (!file.exists(opts$db)) stop("DuckDB not found: ", opts$db)
@@ -454,6 +457,15 @@
}
sampled$prompt_text <- prompts
+if (isTRUE(show_prompts)) { # echo every prompt via message() when the option is enabled
+  message(sprintf("Printing %d prompts:", length(prompts)))
+  prompt_ids <- if ("id" %in% names(sampled)) as.character(sampled$id) else rep("<no id>", length(prompts)) # column lookup done once, outside the loop
+  strata <- if ("stratum" %in% names(sampled)) as.character(sampled$stratum) else rep("<no stratum>", length(prompts)) # avoids a local named `str`, which masks base::str
+  for (i in seq_along(prompts)) { # one message per prompt so each gets its own header line
+    message(sprintf("Prompt [%d/%d] id=%s stratum=%s:\n%s", i, length(prompts), prompt_ids[i], strata[i], prompts[i]))
+  }
+}
+
model_cols <- sampled %>%
mutate(.row = row_number()) %>%
split(.$.row) %>%
@@ -489,6 +501,11 @@
x <- gsub("[[:punct:]]+", "", x)
x[nzchar(x)]
}
+ # Translator-name normaliser: trims surrounding whitespace and lowercases, leaves punctuation intact, drops empty strings
+ norm_keep_punct <- function(x) {
+   cleaned <- trimws(x)
+   Filter(nzchar, tolower(cleaned))
+ }
get_mode <- function(vals) {
vals <- vals[!is.na(vals) & nzchar(vals)]
if (length(vals) == 0) {
@@ -517,7 +534,7 @@
mutate(
consensus_translator_name = {
vals <- as.character(unlist(c_across(ends_with("_translator_name"))))
- vals <- norm(vals)
+ vals <- norm_keep_punct(vals)
if (length(vals) == 0) NA_character_ else get_mode(vals)
},
consensus_original_title = {