| Marc Kupietz | 9b52692 | 2025-09-28 20:42:34 +0200 | [diff] [blame] | 1 | #!/usr/bin/env Rscript |
| 2 | |
| 3 | suppressPackageStartupMessages({ |
| 4 | library(DBI) |
| 5 | library(duckdb) |
| 6 | library(tibble) |
| 7 | }) |
| 8 | |
# Public function: safe to source in RStudio.
# Reads a DuckDB table, or the result of an arbitrary SQL query, into a tibble.
#
# Arguments:
#   db         Path to an existing DuckDB database file (single string).
#   table      Table name, looked up in the "main" schema. Only used when
#              `sql` is empty.
#   sql        Optional SQL query (single string). When non-empty it is run
#              as given and `table` and `limit` are ignored. NULL and NA are
#              treated like "".
#   limit      Maximum number of rows to read from `table`. Values below 1,
#              NA, non-finite or non-numeric values mean "no limit";
#              fractional values are rounded down.
#   read_only  Open the database read-only; anything other than TRUE opens it
#              read-write.
#
# Returns a tibble with the query result.
# Stops if `db` is not a single string, if the file does not exist, or if
# `sql` / `table` are malformed; DBI/duckdb errors propagate unchanged.
duckdb2tibble <- function(
    db = "/home/kupietz/korap4dnb/epub2i5/data/deliko_metadata.duckdb",
    table = "deliko_metadata",
    sql = "",
    limit = 0,
    read_only = TRUE) {
  stopifnot(is.character(db), length(db) == 1)
  if (!file.exists(db)) stop("DuckDB not found: ", db)

  # Validate the remaining arguments before touching the database so that bad
  # input fails fast and with a readable message.
  if (is.null(sql) || (length(sql) == 1 && is.na(sql))) sql <- ""
  if (!is.character(sql) || length(sql) != 1) {
    stop("`sql` must be a single string")
  }
  use_sql <- nzchar(sql)
  if (!use_sql &&
    !(is.character(table) && length(table) == 1 &&
      !is.na(table) && nzchar(table))) {
    stop("`table` must be a single non-empty string")
  }
  # Anything that is not a finite scalar number means "no limit"; this also
  # covers the NA produced by as.integer() on unparsable input.
  row_cap <- 0
  if (is.numeric(limit) && length(limit) == 1 && is.finite(limit)) {
    row_cap <- floor(limit)
  }

  drv <- duckdb::duckdb()
  # Register driver shutdown before connecting so a failing dbConnect() does
  # not leave the driver instance behind.
  on.exit(try(duckdb::duckdb_shutdown(drv), silent = TRUE), add = TRUE)
  con <- DBI::dbConnect(drv, dbdir = db, read_only = isTRUE(read_only))
  # after = FALSE: disconnect first, then shut the driver down.
  on.exit(
    try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE),
    add = TRUE,
    after = FALSE
  )

  if (use_sql) {
    return(tibble::as_tibble(DBI::dbGetQuery(con, sql)))
  }

  # Quote schema and table separately so unusual table names stay valid SQL.
  q_schema <- DBI::dbQuoteIdentifier(con, "main")
  q_table <- DBI::dbQuoteIdentifier(con, table)
  fq_name <- paste0(as.character(q_schema), ".", as.character(q_table))
  q <- paste0("SELECT * FROM ", fq_name)
  if (row_cap >= 1) {
    # format() avoids scientific notation ("1e+06") in the generated SQL.
    q <- paste0(q, " LIMIT ", format(row_cap, scientific = FALSE, trim = TRUE))
  }

  tibble::as_tibble(DBI::dbGetQuery(con, q))
}
| 43 | |
# CLI mode only: run when invoked via Rscript
#
# Usage:
#   Rscript <script> [db=PATH] [table=NAME] [sql=QUERY] [limit=N] [output=FILE]
#
# Options are given as key=value pairs; unknown keys and arguments without
# "=" are ignored. The environment variables DELIKO_DUCKDB, DELIKO_TABLE,
# DELIKO_SQL, LIMIT and OUT_RDS, when set, take precedence over the
# command-line values. The result is summarised on the console and, if
# `output` is non-empty, saved there with saveRDS().
#
# NOTE(review): the only guard is !interactive(), so this block also runs when
# the file is source()d from another non-interactive script - confirm that is
# intended.
if (!interactive()) {
  defaults <- list(
    db = "/home/kupietz/korap4dnb/epub2i5/data/deliko_metadata.duckdb",
    table = "deliko_metadata",
    sql = "",
    limit = "0",
    output = ""
  )
  opts <- defaults
  args <- commandArgs(trailingOnly = TRUE)
  for (a in args) {
    # Split on the FIRST "=" only: values such as
    # "sql=SELECT * FROM t WHERE x=1" legitimately contain further "=".
    eq_pos <- regexpr("=", a, fixed = TRUE)
    if (eq_pos < 1L) next
    key <- substr(a, 1L, eq_pos - 1L)
    value <- substr(a, eq_pos + 1L, nchar(a))
    # An empty value ("db=") keeps the default.
    if (nzchar(value) && key %in% names(opts)) opts[[key]] <- value
  }

  # ENV overrides
  opts$db <- Sys.getenv("DELIKO_DUCKDB", opts$db)
  opts$table <- Sys.getenv("DELIKO_TABLE", opts$table)
  opts$sql <- Sys.getenv("DELIKO_SQL", opts$sql)
  opts$limit <- Sys.getenv("LIMIT", opts$limit)
  opts$output <- Sys.getenv("OUT_RDS", opts$output)

  # as.integer() yields NA (with a warning) for unparsable text; fall back to
  # "no limit" explicitly instead of handing NA to duckdb2tibble().
  row_limit <- suppressWarnings(as.integer(opts$limit))
  if (is.na(row_limit)) {
    message("Ignoring invalid limit '", opts$limit, "'; reading all rows")
    row_limit <- 0L
  }

  message("Connecting DuckDB: ", opts$db)
  df <- duckdb2tibble(
    db = opts$db,
    table = opts$table,
    sql = opts$sql,
    limit = row_limit,
    read_only = TRUE
  )

  message("Loaded tibble: ", nrow(df), " rows x ", ncol(df), " cols")
  message("Columns: ", paste(names(df), collapse = ", "))
  # head() already caps n at the number of available rows.
  print(utils::head(df, n = 10L))

  if (nzchar(opts$output)) {
    saveRDS(df, opts$output)
    message("Saved tibble to ", opts$output)
  }
}