blob: 490338c337d07d423af7a864cc5229876b51c803 [file] [log] [blame]
#!/usr/bin/env Rscript
suppressPackageStartupMessages({
library(DBI)
library(duckdb)
library(tibble)
})
#' Read a DuckDB table or query result into a tibble
#'
#' Opens the DuckDB database file `db`, runs either the supplied `sql` or a
#' `SELECT *` on table `table` in schema `main`, and disconnects and shuts the
#' driver instance down again on exit (also when an error occurs).
#' Defining this function has no side effects, so the file is safe to
#' `source()` in RStudio.
#'
#' @param db Path to an existing DuckDB database file (a single string).
#' @param table Name of the table in schema `main` to read. Only used when
#'   `sql` is empty.
#' @param sql Optional SQL query (a single string). When non-empty it is
#'   executed verbatim and `table` and `limit` are ignored. `NULL` is treated
#'   like the empty string.
#' @param limit Maximum number of rows to read from `table`. Anything that is
#'   not a single positive finite number (0, negative, `NA`, `NULL`,
#'   non-numeric) means "no limit". Fractional values are truncated.
#' @param read_only Open the database read-only? Only a literal `TRUE` does
#'   so; any other value opens it read-write.
#' @return A tibble with the query result.
duckdb2tibble <- function(
    db = "/home/kupietz/korap4dnb/epub2i5/data/deliko_metadata.duckdb",
    table = "deliko_metadata",
    sql = "",
    limit = 0,
    read_only = TRUE) {
  stopifnot(is.character(db), length(db) == 1)
  if (!file.exists(db)) stop("DuckDB not found: ", db)

  # NULL means "no query", exactly like the default "". Anything else must be
  # a single non-missing string so that nzchar() below yields one TRUE/FALSE.
  if (is.null(sql)) sql <- ""
  stopifnot(is.character(sql), length(sql) == 1L, !is.na(sql))

  drv <- duckdb::duckdb()
  # Register the driver shutdown before connecting so a failed dbConnect()
  # does not leave the driver instance behind.
  on.exit(try(duckdb::duckdb_shutdown(drv), silent = TRUE), add = TRUE)
  con <- DBI::dbConnect(drv, dbdir = db, read_only = isTRUE(read_only))
  # after = FALSE: disconnect must run before the driver shutdown above.
  on.exit(
    try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE),
    add = TRUE,
    after = FALSE
  )

  if (nzchar(sql)) {
    return(tibble::as_tibble(DBI::dbGetQuery(con, sql)))
  }

  # Quote schema and table separately so unusual table names stay valid SQL.
  q_schema <- DBI::dbQuoteIdentifier(con, "main")
  q_table <- DBI::dbQuoteIdentifier(con, table)
  query <- paste0(
    "SELECT * FROM ", as.character(q_schema), ".", as.character(q_table)
  )

  # is.finite() rejects NA/NaN/Inf, which would otherwise make the `if`
  # condition NA (an error) or produce "LIMIT NA".
  has_limit <- is.numeric(limit) && length(limit) == 1L &&
    is.finite(limit) && limit > 0
  if (has_limit) {
    # sprintf("%.0f") instead of as.integer(): no NA for values above
    # .Machine$integer.max and never scientific notation.
    query <- paste0(query, " LIMIT ", sprintf("%.0f", trunc(limit)))
  }

  tibble::as_tibble(DBI::dbGetQuery(con, query))
}
# CLI mode only: runs when the session is non-interactive (e.g. via Rscript).
# Options are passed as key=value arguments (db, table, sql, limit, output).
# The environment variables DELIKO_DUCKDB, DELIKO_TABLE, DELIKO_SQL, LIMIT and
# OUT_RDS, when set, take precedence over the command line.
if (!interactive()) {
  opts <- list(
    db = "/home/kupietz/korap4dnb/epub2i5/data/deliko_metadata.duckdb",
    table = "deliko_metadata",
    sql = "",
    limit = "0",
    output = ""
  )

  for (arg in commandArgs(trailingOnly = TRUE)) {
    # Split on the FIRST "=" only: values such as
    # sql=SELECT * FROM t WHERE id = 1 contain further "=" characters and
    # must not be dropped.
    eq_pos <- regexpr("=", arg, fixed = TRUE)[[1L]]
    if (eq_pos < 1L) next
    key <- substr(arg, 1L, eq_pos - 1L)
    value <- substr(arg, eq_pos + 1L, nchar(arg))
    # Unknown keys and empty values are ignored; the default stays in place.
    if (key %in% names(opts) && nzchar(value)) opts[[key]] <- value
  }

  # ENV overrides
  opts$db <- Sys.getenv("DELIKO_DUCKDB", opts$db)
  opts$table <- Sys.getenv("DELIKO_TABLE", opts$table)
  opts$sql <- Sys.getenv("DELIKO_SQL", opts$sql)
  opts$limit <- Sys.getenv("LIMIT", opts$limit)
  opts$output <- Sys.getenv("OUT_RDS", opts$output)

  # A limit that does not parse as an integer becomes NA; fall back to
  # "no limit" instead of handing NA to duckdb2tibble().
  row_limit <- suppressWarnings(as.integer(opts$limit))
  if (is.na(row_limit)) {
    message("Ignoring invalid limit: ", opts$limit)
    row_limit <- 0L
  }

  message("Connecting DuckDB: ", opts$db)
  df <- duckdb2tibble(
    db = opts$db,
    table = opts$table,
    sql = opts$sql,
    limit = row_limit,
    read_only = TRUE
  )
  message("Loaded tibble: ", nrow(df), " rows x ", ncol(df), " cols")
  message("Columns: ", paste(names(df), collapse = ", "))
  # head() already returns all rows when there are fewer than n.
  print(utils::head(df, n = 10L))

  if (nzchar(opts$output)) {
    saveRDS(df, opts$output)
    message("Saved tibble to ", opts$output)
  }
}