diff --git a/NAMESPACE b/NAMESPACE index a6b5194..786ac51 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,8 @@ export(.extractAMRtable) export(.updateBVBRCdata) export(CDHIT2duckdb) +export(cleanData) +export(cleanMetaData) export(prepareGenomes) export(retrieveGenomes) export(retrieveMetadata) diff --git a/R/data_curation.R b/R/data_curation.R index 6e71824..b58cf3c 100644 --- a/R/data_curation.R +++ b/R/data_curation.R @@ -1,3 +1,177 @@ +#' Helps ensure trailing 0s are retained in genome IDs for proper downloading +#' @keywords internal +.id_checker <- function(x) { + # Taxon IDs are just numbers, genome IDs have decimals, this tells them apart + grepl("^[0-9]+$", x) +} + +#' Helps tag genomes with their AMR evidence for parsing +#' @keywords internal +.create_amr_tagged_view <- function(con) { + lab_methods <- c("Disk diffusion", "MIC", "Broth dilution", "Agar dilution") + lab_list_sql <- paste(DBI::dbQuoteString(con, lab_methods), collapse = ", ") + comp_str_sql <- DBI::dbQuoteString(con, "Computational Method") + + DBI::dbExecute( + con, + sprintf( + "CREATE OR REPLACE VIEW amr_phenotype_tagged AS + SELECT + *, + CASE WHEN \"genome_drug.laboratory_typing_method\" IN (%s) THEN 1 ELSE 0 END AS is_lab_row, + CASE WHEN + (\"genome_drug.evidence\" = %s) + OR (COALESCE(\"genome_drug.computational_method\", '') <> '') + OR (\"genome_drug.laboratory_typing_method\" = 'Computational Prediction') + THEN 1 ELSE 0 END AS is_comp_row + FROM amr_phenotype", + lab_list_sql, comp_str_sql + ) + ) +} + +#' Helps appropriately interface with BV-BRC FTPS server, and avoids getting stuck +#' when malformed files can hang an FTPS connection by introducing safeguards +#' @keywords internal +.ftpes_download_one <- function(genomeID, out_dir, + connect_timeout = 10L, + max_time = 30L, + speed_time = 30L, # end if speed_time + speed_limit = 2048L, # B/s + min_bytes = 100L) { + dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) + + exts <- c(".fna", ".PATRIC.faa", ".PATRIC.gff") + for (ext in exts) { + dest <- file.path(out_dir, paste0(genomeID, ext)) + dest_tmp <- paste0(dest, ".tmp") + + # Skip if we already completed these + if (file.exists(dest) && file.info(dest)$size > min_bytes) next + if (file.exists(dest) && file.info(dest)$size == 0) try(unlink(dest), silent = TRUE) + if (file.exists(dest_tmp)) try(unlink(dest_tmp), silent = TRUE) + + url <- sprintf("ftp://ftp.bv-brc.org/genomes/%s/%s%s", genomeID, genomeID, ext) + + args <- c( + "--fail", "--silent", "--show-error", "-L", + "--connect-timeout", as.character(connect_timeout), + "--max-time", as.character(max_time), + "--speed-time", as.character(speed_time), + "--speed-limit", as.character(speed_limit), + "--ftp-ssl", # AUTH TLS on port 21 works in testing + "--ftp-pasv", "--disable-epsv", "--ipv4", + "--user", "anonymous:", + "-o", shQuote(dest_tmp), shQuote(url) + ) + + res <- suppressWarnings(system2("curl", args = args, stdout = TRUE, stderr = TRUE)) + status <- attr(res, "status") + if (is.null(status)) status <- 0L + + # Avoiding 0B files cluttering up results -- atomic rename on successful tmp DL + ok <- (status == 0L && file.exists(dest_tmp) && file.info(dest_tmp)$size > min_bytes) + if (ok) { + if (!file.rename(dest_tmp, dest)) { + # If rename fails for some reason, copy + unlink + file.copy(dest_tmp, dest, overwrite = TRUE) + unlink(dest_tmp) + } + } else { + try(unlink(dest_tmp), silent = TRUE) + return(FALSE) # Abort early, don't accept partial set + } + } + + # Do we have all 3 present? + .is_complete_set(out_dir, genomeID, min_bytes = min_bytes) +} + +#' Helps manage FTPS downloading from BV-BRC, tryng a quick download first, and +#' if that fails, trying a longer timeout 2nd pass at the end in case it was a +#' hiccup. If 2nd pass fails, log and give up on that file. +#' @keywords internal +.ftpes_download_two_pass <- function(genome_ids, out_dir, + workers_first = 4L, + workers_second = 4L, + log_file = NULL) { + genome_ids <- unique(as.character(genome_ids)) + if (!length(genome_ids)) { + return(character(0)) + } + + if (!is.null(log_file)) { + dir.create(dirname(log_file), recursive = TRUE, showWarnings = FALSE) + cat(sprintf("[%s] FTPS run start: %d genomes\n", Sys.time(), length(genome_ids)), + file = log_file, append = TRUE + ) + } + + # Pass 1: 30s per-file cap + message("FTPS pass 1 (45s timeout)") + future::plan(future::multisession, workers = max(1, workers_first)) + res1 <- future.apply::future_lapply( + genome_ids, + function(gid) { + ok <- .ftpes_download_one(gid, out_dir, + connect_timeout = 10L, max_time = 45L, + speed_time = 30L, speed_limit = 2048L + ) + list(gid = gid, ok = ok) + }, + future.seed = TRUE + ) + ok1 <- vapply(res1, `[[`, logical(1), "ok") + ok_ids_1 <- genome_ids[ok1] + fail_ids <- genome_ids[!ok1] + message(sprintf("Pass 1: ok=%d, fail=%d", length(ok_ids_1), length(fail_ids))) + if (!is.null(log_file)) { + cat(sprintf("[%s] Pass1 ok=%d fail=%d\n", Sys.time(), length(ok_ids_1), length(fail_ids)), + file = log_file, append = TRUE + ) + } + + if (!length(fail_ids)) { + if (!is.null(log_file)) { + cat(sprintf("[%s] FTPS run end: all OK\n", Sys.time()), + file = log_file, append = TRUE + ) + } + return(ok_ids_1) + } + + # Pass 2: 60s per-file cap where we retry any failures + message("FTPS pass 2 (120s timeout) for failed genomes") + future::plan(future::multisession, workers = max(1, workers_second)) + res2 <- future.apply::future_lapply( + fail_ids, + function(gid) { + ok <- .ftpes_download_one(gid, out_dir, + connect_timeout = 10L, max_time = 120L, + speed_time = 30L, speed_limit = 2048L + ) + list(gid = gid, ok = ok) + }, + future.seed = TRUE + ) + ok2 <- vapply(res2, `[[`, logical(1), "ok") + ok_ids_2 <- fail_ids[ok2] + still_fail <- setdiff(fail_ids, ok_ids_2) + message(sprintf("Pass 2: ok=%d, still_fail=%d", length(ok_ids_2), length(still_fail))) + if (!is.null(log_file)) { + cat(sprintf("[%s] Pass2 ok=%d still_fail=%d\n", Sys.time(), length(ok_ids_2), length(still_fail)), + file = log_file, append = TRUE + ) + if (length(still_fail)) { + cat("Fail IDs (excluded): ", paste(head(still_fail, 50), collapse = ", "), "\n", + file = log_file, append = TRUE + ) + } + } + + unique(c(ok_ids_1, ok_ids_2)) +} + #' Fetch BV-BRC bacterial genome data #' #' Run the BV-BRC CLI (`p3-all-genomes`) inside a Docker image and return results as a tibble. @@ -26,7 +200,8 @@ "--in superkingdom,Bacteria ", "--eq genome_quality,Good ", "--in genome_status,WGS,Complete ", - "--attr genome_id,genome_name,taxon_id,species,strain,_version_,", + "--attr genome_id,genome_name,genome_quality,genome_status,", + "taxon_id,species,strain,_version_,", "collection_year,state_province,latitude,longitude,", "antimicrobial_resistance_evidence,assembly_accession,isolation_country,", "isolation_source,disease,host_common_name" @@ -44,10 +219,10 @@ stop("BV-BRC returned no data. The service may be unavailable.") } - # Parse + # Parse and coerce to character to avoid numeric interpretation losing trailing 0s df <- utils::read.table( text = raw_data, sep = "\t", header = TRUE, fill = TRUE, - quote = "", check.names = FALSE, comment.char = "" + quote = "", check.names = FALSE, comment.char = "", colClasses = "character" ) df <- tibble::as_tibble(df) |> dplyr::mutate(dplyr::across(dplyr::everything(), ~ iconv(.x, from = "", to = "UTF-8", sub = ""))) @@ -230,7 +405,7 @@ ) for (user_bac in user_bacs) { - if (suppressWarnings(!is.na(as.numeric(user_bac)))) { + if (.id_checker(user_bac)) { message("Numeric input detected: ", user_bac) if (user_bac %in% bvbrc_bacs$genome.taxon_id) { bac_name <- bvbrc_bacs$genome.species[bvbrc_bacs$genome.taxon_id == user_bac] @@ -294,10 +469,11 @@ return(cand) } if (query_type == "taxon_id") { - nums <- suppressWarnings(!is.na(as.numeric(user_bacs))) + nums <- .id_checker(user_bacs) if (!any(nums)) stop("Cannot infer taxon_id from user_bacs. Provide query_value.") return(as.character(user_bacs[which(nums)[1]])) } + stop("Unsupported query_type: ", query_type) } @@ -325,7 +501,7 @@ db_parts <- c() for (user_bac in user_bacs) { - if (suppressWarnings(!is.na(as.numeric(user_bac)))) { + if (.id_checker(user_bac)) { db_parts <- c(db_parts, paste0("tid_", user_bac)) } else { parts <- stringr::str_split(user_bac, " ")[[1]] @@ -440,7 +616,7 @@ data_result <- tibble::as_tibble( utils::read.table( text = data_raw, sep = "\t", header = TRUE, fill = TRUE, - quote = "", check.names = FALSE, comment.char = "" + quote = "", check.names = FALSE, comment.char = "", colClasses = "character" ) ) |> dplyr::mutate( @@ -465,17 +641,11 @@ #' Retrieve genome IDs for each taxon via BV-BRC and DuckDB #' -#' Resolves user-provided taxa to taxon IDs, queries BV-BRC per unique taxon ID, -#' and returns distinct genome IDs. Uses a per-selection DuckDB located under: -#' /data//.duckdb -#' BV-BRC column names are preserved. +#' Resolves user-provided taxa to taxon IDs using the local BV-BRC cache, then +#' returns distinct genome IDs (Good + WGS/Complete). Also writes a 'bac_data' +#' table to the per-selection DuckDB with a cache snapshot of core fields. #' -#' @param base_dir Character. Project root directory. Default ".". -#' @param user_bacs Character vector. Mixed inputs of taxon IDs and/or species names. -#' @param overwrite Logical. If FALSE and the DuckDB already exists for this selection, abort. Default: FALSE. -#' @param verbose Logical. -#' -#' @return A numeric vector of distinct `genome.genome_id`, or NULL if none found. +#' @return A character vector of distinct `genome.genome_id`, or NULL if none found. .retrieveQueryIDs <- function(base_dir = ".", user_bacs, overwrite = FALSE, @@ -490,54 +660,63 @@ return(NULL) } - # Resolve per-bug DB path + # Per-selection DB path paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) db_path <- paths$db_path - bac_data <- tibble::tibble() - taxon_ids <- unique(bac_input_data$genome.taxon_id) - - if (isTRUE(verbose)) message("Querying BV-BRC for ", length(taxon_ids), " taxon IDs.") - - for (i in seq_along(taxon_ids)) { - tax <- taxon_ids[i] - if (isTRUE(verbose)) message("Taxon ", i, "/", length(taxon_ids), ": ", tax) - - res <- .getGenomeIDs( - base_dir = base_dir, - query_type = "taxon_id", - query_value = as.character(tax), - user_bacs = user_bacs, - overwrite = TRUE, - verbose = verbose - ) - - con <- res$duckdbConnection - tbl <- res$table_name - each_bac_data <- tibble::as_tibble(DBI::dbReadTable(con, tbl)) - bac_data <- dplyr::bind_rows(bac_data, each_bac_data) - - # Close per-iteration connection to avoid open handles piling up - try(DBI::dbDisconnect(con, shutdown = TRUE), silent = TRUE) + # Got a cache? Use that, it's fast + cache_db <- file.path(base_dir, "data", "bvbrc", "bvbrcData.duckdb") + if (!file.exists(cache_db)) { + stop("BV-BRC cache not found at: ", cache_db, ". Run .updateBVBRCdata() first.") } + con_cache <- DBI::dbConnect(duckdb::duckdb(), dbdir = cache_db, read_only = TRUE) + on.exit(try(DBI::dbDisconnect(con_cache, shutdown = TRUE), silent = TRUE), add = TRUE) - if (nrow(bac_data) > 0) { - genome_ids <- bac_data |> - dplyr::distinct(`genome.genome_id`) |> - dplyr::pull(`genome.genome_id`) - genome_ids <- genome_ids[!is.na(genome_ids)] + taxon_ids <- unique(bac_input_data$genome.taxon_id) + if (isTRUE(verbose)) message("Querying cache for ", length(taxon_ids), " taxon IDs.") + + taxon_list_sql <- paste(DBI::dbQuoteString(con_cache, taxon_ids), collapse = ", ") + sql_ids <- sprintf( + "SELECT DISTINCT + \"genome.genome_id\" AS gid, + \"genome.genome_name\" AS genome_name, + \"genome.taxon_id\" AS taxon_id, + \"genome.species\" AS species, + \"genome.strain\" AS strain + FROM bvbrc_bac_data + WHERE \"genome.taxon_id\" IN (%s) + AND \"genome.genome_quality\" = 'Good' + AND \"genome.genome_status\" IN ('WGS','Complete')", + taxon_list_sql + ) + cache_rows <- DBI::dbGetQuery(con_cache, sql_ids) + valid_id_re <- "^[0-9]+\\.[0-9]+$" + cache_rows <- cache_rows[grepl(valid_id_re, cache_rows$gid), , drop = FALSE] - if (length(genome_ids) > 0) { - if (isTRUE(verbose)) message("Collected ", length(genome_ids), " distinct genome IDs.") - return(genome_ids) - } else { - message("No valid genome IDs found.") - return(NULL) - } - } else { - message("No valid genome data found for any input.") + if (nrow(cache_rows) == 0L) { + message("No valid genome IDs found.") return(NULL) } + + genome_ids <- unique(cache_rows$gid) + if (isTRUE(verbose)) message("Collected ", length(genome_ids), " distinct genome IDs (cache).") + + # Write 'bac_data' from cache + con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) + on.exit(try(DBI::dbDisconnect(con, shutdown = TRUE), silent = TRUE), add = TRUE) + + bac_data_tbl <- data.frame( + `genome.genome_id` = cache_rows$gid, + `genome.genome_name` = cache_rows$genome_name, + `genome.taxon_id` = cache_rows$taxon_id, + `genome.species` = cache_rows$species, + `genome.strain` = cache_rows$strain, + stringsAsFactors = FALSE + ) + DBI::dbWriteTable(con, "bac_data", bac_data_tbl, overwrite = TRUE) + if (isTRUE(verbose)) message("Wrote table 'bac_data' to: ", db_path) + + genome_ids } #' Extract AMR Data Table @@ -688,7 +867,7 @@ writeLines(genome_ids_output, con = tmp_in) tmp_in_mounted <- file.path("/data", "tmp", basename(tmp_in)) - # Choose attributes (AMR for this pipeline) + # Choose attributes (AMR for this workflow) chosen_fields <- if (identical(filter_type, "AMR")) amr_fields else microtrait_fields get_args <- c( @@ -710,7 +889,6 @@ return(genome_data) } - #' Retrieve AMR or Microtrait metadata from BV-BRC and store in DuckDB #' #' Queries BV-BRC for AMR or microtrait metadata for genomes corresponding to user inputs. @@ -719,7 +897,7 @@ #' Tables written: #' - amr_phenotype #' - genome_data -#' - metadata (inner join on genome ID field names as returned by BV-BRC) +#' - metadata (join on genome IDs returned by BV-BRC) #' #' @param user_bacs Character vector. Mixed taxon IDs and/or species strings (used for naming). #' @param filter_type Character. "AMR" or "microTraits". Default "AMR". @@ -752,7 +930,7 @@ retrieveMetadata <- function(user_bacs, return(NULL) } - # Fields (unchanged BV-BRC names) + # Desired fields from BV-BRC drug_fields <- paste0( "antibiotic,computational_method,", "evidence,genome_name,id,", @@ -802,7 +980,7 @@ retrieveMetadata <- function(user_bacs, "trna" ) - # Batching + # Batching downloads and parallel implementation batch_size <- 500L genome_ids <- as.character(genome_ids) genome_batches <- split(genome_ids, ceiling(seq_along(genome_ids) / batch_size)) @@ -810,6 +988,7 @@ retrieveMetadata <- function(user_bacs, n_cores <- max(1L, parallel::detectCores(logical = TRUE) - 1L) param <- BiocParallel::SnowParam(workers = n_cores) + # Pull AMR metadata if (isTRUE(verbose)) message("Retrieving AMR phenotype data in batches.") batch_drug_data <- BiocParallel::bplapply(genome_batches, function(batch) { .extractAMRtable( @@ -826,17 +1005,15 @@ retrieveMetadata <- function(user_bacs, message("No drug data returned.") return(NULL) } - - combined_drug_data_tbl <- tibble::as_tibble( - utils::read.table( - text = combined_drug_data, - sep = "\t", header = TRUE, fill = TRUE, - quote = "", check.names = FALSE, comment.char = "" - ) - ) |> + combined_drug_data_tbl <- tibble::as_tibble(utils::read.table( + text = combined_drug_data, + sep = "\t", header = TRUE, fill = TRUE, + quote = "", check.names = FALSE, comment.char = "", colClasses = "character" + )) |> dplyr::mutate(dplyr::across(dplyr::everything(), ~ iconv(.x, from = "", to = "UTF-8", sub = ""))) |> dplyr::mutate(`genome_drug.genome_id` = as.character(`genome_drug.genome_id`)) + # Pull genome metadata if (isTRUE(verbose)) message("Retrieving genome metadata in batches.") batch_genome_data <- BiocParallel::bplapply(genome_batches, function(batch) { .extractGenomeData( @@ -854,21 +1031,17 @@ retrieveMetadata <- function(user_bacs, message("No genome data returned.") return(NULL) } - - combined_genome_data_tbl <- tibble::as_tibble( - utils::read.table( - text = combined_genome_data, - sep = "\t", header = TRUE, fill = TRUE, - quote = "", check.names = FALSE, comment.char = "" - ) - ) |> + combined_genome_data_tbl <- tibble::as_tibble(utils::read.table( + text = combined_genome_data, + sep = "\t", header = TRUE, fill = TRUE, + quote = "", check.names = FALSE, comment.char = "", colClasses = "character" + )) |> dplyr::mutate(dplyr::across(dplyr::everything(), ~ iconv(.x, from = "", to = "UTF-8", sub = ""))) |> dplyr::mutate(`genome.genome_id` = as.character(`genome.genome_id`)) - # Per-bug DB path + # Write & join paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite) db_path <- paths$db_path - logs_dir <- file.path(base_dir, "data", "logs") dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE) cat(sprintf("[%s] Writing metadata DuckDB: %s\n", Sys.time(), db_path), @@ -876,27 +1049,69 @@ retrieveMetadata <- function(user_bacs, ) con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) + on.exit(try(DBI::dbDisconnect(con, shutdown = TRUE), silent = TRUE), add = TRUE) + DBI::dbWriteTable(con, "amr_phenotype", combined_drug_data_tbl, overwrite = TRUE) DBI::dbWriteTable(con, "genome_data", combined_genome_data_tbl, overwrite = TRUE) if (isTRUE(verbose)) message("Joining AMR phenotype and genome metadata.") - initial_metadata <- tibble::as_tibble(DBI::dbGetQuery( + DBI::dbExecute(con, ' + CREATE OR REPLACE TABLE metadata AS + SELECT * + FROM amr_phenotype + INNER JOIN genome_data + ON amr_phenotype."genome_drug.genome_id" = genome_data."genome.genome_id" + ') + + # Debug summary after writes + n_targets <- length(genome_ids) + n_amr_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome_drug.genome_id") AS n FROM amr_phenotype')$n + n_gmeta_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM genome_data')$n + if (isTRUE(verbose)) { + message( + "Initial summary: targets=", n_targets, + " | AMR genomes=", n_amr_ids, + " | genome_data genomes=", n_gmeta_ids + ) + } + + # Tagged view + .create_amr_tagged_view(con) + + # Final debug summary + n_bac <- if ("bac_data" %in% DBI::dbListTables(con)) { + DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM bac_data')$n + } else { + NA_integer_ + } + n_amr_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome_drug.genome_id") AS n FROM amr_phenotype')$n + n_gmeta_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM genome_data')$n + + ids_zero_amr <- DBI::dbGetQuery( con, - 'SELECT * - FROM amr_phenotype - INNER JOIN genome_data - ON amr_phenotype."genome_drug.genome_id" = genome_data."genome.genome_id"' - )) - DBI::dbWriteTable(con, "metadata", initial_metadata, overwrite = TRUE) + 'WITH sel AS (SELECT DISTINCT "genome.genome_id" AS gid FROM genome_data), + got AS (SELECT DISTINCT "genome_drug.genome_id" AS gid FROM amr_phenotype) + SELECT sel.gid + FROM sel LEFT JOIN got USING (gid) + WHERE got.gid IS NULL + ORDER BY sel.gid' + )$gid if (isTRUE(verbose)) { - message("Wrote tables 'amr_phenotype', 'genome_data', and 'metadata' to: ", db_path) + message("Final summary:") + message(" targets : ", length(genome_ids)) + message(" bac_data : ", n_bac) + message(" genome_data : ", n_gmeta_ids) + message(" amr_phenotype: ", n_amr_ids) + message(" genomes with 0 AMR rows: ", length(ids_zero_amr)) + if (length(ids_zero_amr)) { + message(" e.g.: ", paste(utils::head(ids_zero_amr, 10), collapse = ", ")) + } } list(duckdbConnection = con, table_name = "metadata") } - # FASTA sanitizer to ensure Panaroo compatibility with BV-BRC CLI downloads .strip_fasta_preamble <- function(fna_path) { if (!file.exists(fna_path)) { @@ -954,37 +1169,75 @@ retrieveMetadata <- function(user_bacs, #' derive genome IDs from user_bacs (taxon IDs or species substring), and #' write a minimal "filtered" table (without AMR evidence filtering). #' +#' @param evidence_mode Character. Either "lab_only" (default), "lab_or_comp" (all), +#' "comp_only" (BV-BRC-predicted without lab labels), or "any" (no AMR data required). +#' @return A list with a DuckDB connection and table_name = "filtered" +#' Filter genomes by AMR phenotype and metadata, and store results in DuckDB +#' +#' Preferred path: use per-selection DB "metadata" table (from retrieveMetadata()) and +#' apply evidence & genome_quality filters. +#' +#' Fallback path: if "metadata" is missing and fallback_to_bvbrc_cache = TRUE, +#' read BV-BRC cache at /data/bvbrc/bvbrcData.duckdb ("bvbrc_bac_data"), +#' derive genome IDs from user_bacs (taxon IDs or species substring), and +#' write a minimal "filtered" table (without AMR evidence filtering). +#' +#' @param evidence_mode Character. One of: +#' "lab_only" (default) -> only laboratory evidence +#' "lab_or_comp" -> laboratory OR computational evidence +#' "comp_only" -> only computational evidence +#' "any" -> no AMR required (from genome_data; Good only) #' @return A list with a DuckDB connection and table_name = "filtered" .filterGenomes <- function(user_bacs, base_dir = ".", + evidence_mode = c("lab_only", "lab_or_comp", "comp_only", "any"), verbose = TRUE, fallback_to_bvbrc_cache = TRUE) { + evidence_mode <- match.arg(evidence_mode) base_dir <- normalizePath(base_dir, mustWork = FALSE) paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) db_path <- paths$db_path con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) - on.exit( { - # Keep connection open for caller NULL }, add = TRUE - ) + ) # keep open for caller - # Good metadata present? Apply AMR filters + # The convenient "Metadata Exists" path if ("metadata" %in% DBI::dbListTables(con)) { if (isTRUE(verbose)) message("Loading metadata for filtering.") - initial_metadata <- DBI::dbReadTable(con, "metadata") - if (is.null(initial_metadata) || nrow(initial_metadata) == 0) { + + # If "any", AMR data not needed, just fetch qualifying genome_data + if (evidence_mode == "any") { + gd <- DBI::dbReadTable(con, "genome_data") + if (is.null(gd) || nrow(gd) == 0) { + DBI::dbDisconnect(con, shutdown = TRUE) + stop("No data available in 'genome_data'.") + } + gd <- tibble::as_tibble(gd) |> + dplyr::filter(`genome.genome_quality` == "Good") |> + dplyr::distinct(`genome.genome_id`) + # Keep BV-BRC column name + DBI::dbWriteTable(con, "filtered", gd, overwrite = TRUE) + if (isTRUE(verbose)) { + message("Post-filter distinct genomes (any): ", nrow(gd)) + message("Wrote table 'filtered' to: ", db_path) + } + return(list(duckdbConnection = con, table_name = "filtered")) + } + + # Otherwise, open up the metadata and start interrogating the AMR data + md <- DBI::dbReadTable(con, "metadata") + if (is.null(md) || nrow(md) == 0) { DBI::dbDisconnect(con, shutdown = TRUE) message("No data available in 'metadata'.") return(NULL) } - # Normalize evidence labels - initial_metadata <- tibble::as_tibble(initial_metadata) |> + md <- tibble::as_tibble(md) |> dplyr::mutate( `genome_drug.evidence` = dplyr::case_when( `genome_drug.laboratory_typing_method` %in% @@ -994,29 +1247,39 @@ retrieveMetadata <- function(user_bacs, ) ) - # AMR and quality filtering - filtered_metadata <- initial_metadata |> - dplyr::filter(`genome_drug.evidence` == "Laboratory Method") |> + if (evidence_mode == "lab_only") { + md <- dplyr::filter(md, `genome_drug.evidence` == "Laboratory Method") + } else if (evidence_mode == "comp_only") { + md <- dplyr::filter( + md, + `genome_drug.evidence` == "Computational Method" | + (!is.na(`genome_drug.computational_method`) & nzchar(`genome_drug.computational_method`)) | + (`genome_drug.laboratory_typing_method` == "Computational Prediction") + ) + } else { + # Lab_or_comp: Doesn't matter, keep any predictions + md <- md + } + + md <- md |> dplyr::filter(`genome.genome_quality` == "Good") |> - dplyr::filter(`genome_drug.resistant_phenotype` %in% c("Resistant", "Susceptible", "Intermediate")) + dplyr::filter(`genome_drug.resistant_phenotype` %in% c("Resistant", "Susceptible", "Intermediate")) |> + dplyr::distinct(`genome.genome_id`) # keep BV-BRC column name - DBI::dbWriteTable(con, "filtered", filtered_metadata, overwrite = TRUE) + DBI::dbWriteTable(con, "filtered", md, overwrite = TRUE) if (isTRUE(verbose)) { - message("Post-filter rows: ", nrow(filtered_metadata)) + message("Post-filter distinct genomes: ", nrow(md)) message("Wrote table 'filtered' to: ", db_path) } return(list(duckdbConnection = con, table_name = "filtered")) } - # Attempt BV-BRC cache location + # No metadata fallback if (!isTRUE(fallback_to_bvbrc_cache)) { DBI::dbDisconnect(con, shutdown = TRUE) stop("No 'metadata' table found in ", db_path, ". Run retrieveMetadata() first.") } - - if (isTRUE(verbose)) { - message("No 'metadata' in per-selection DB. Falling back to BV-BRC cache at data/bvbrc/.") - } + if (isTRUE(verbose)) message("No 'metadata' in per-selection DB. Falling back to BV-BRC cache at data/bvbrc/.") cache_db <- file.path(base_dir, "data", "bvbrc", "bvbrcData.duckdb") if (!file.exists(cache_db)) { @@ -1024,7 +1287,7 @@ retrieveMetadata <- function(user_bacs, stop("BV-BRC cache not found at: ", cache_db, ". Run .updateBVBRCdata() first.") } - con_cache <- DBI::dbConnect(duckdb::duckdb(), dbdir = cache_db) + con_cache <- DBI::dbConnect(duckdb::duckdb(), dbdir = cache_db, read_only = TRUE) on.exit(try(DBI::dbDisconnect(con_cache, shutdown = TRUE), silent = TRUE), add = TRUE) if (!"bvbrc_bac_data" %in% DBI::dbListTables(con_cache)) { @@ -1033,30 +1296,22 @@ retrieveMetadata <- function(user_bacs, } bv <- tibble::as_tibble(DBI::dbReadTable(con_cache, "bvbrc_bac_data")) - - - # Derive matches from user_bacs (taxon IDs or species) sel <- tibble::tibble(`genome.genome_id` = character()) for (v in user_bacs) { - if (suppressWarnings(!is.na(as.numeric(v)))) { - # numeric taxon_id match - matches <- bv[bv$`genome.taxon_id` == v | - bv$`genome.taxon_id` == as.numeric(v), , drop = FALSE] + if (.id_checker(v)) { + v_chr <- as.character(v) + matches <- bv[bv$`genome.taxon_id` == v_chr, , drop = FALSE] } else { - # species substring (case-insensitive) matches <- bv[stringr::str_detect( bv$`genome.species`, stringr::fixed(v, ignore_case = TRUE) ), , drop = FALSE] } - if (nrow(matches)) { sel <- dplyr::bind_rows( sel, - tibble::tibble( - `genome.genome_id` = as.character(matches$`genome.genome_id`) - ) + tibble::tibble(`genome.genome_id` = as.character(matches$`genome.genome_id`)) ) } } @@ -1067,37 +1322,14 @@ retrieveMetadata <- function(user_bacs, stop("No genomes matched user_bacs in BV-BRC cache.") } - # Minimal 'filtered' for downstream steps (downloaders & genomeList) - minimal_filtered <- sel - # Ensure expected column name used downstream: - # retrieveGenomes() reads "filtered" and expects `genome.genome_id` to exist. - DBI::dbWriteTable(con, "filtered", minimal_filtered, overwrite = TRUE) - - if (isTRUE(verbose)) { - message("Wrote table 'filtered' to: ", db_path) - } + DBI::dbWriteTable(con, "filtered", sel, overwrite = TRUE) + if (isTRUE(verbose)) message("Wrote table 'filtered' to: ", db_path) list(duckdbConnection = con, table_name = "filtered") } - -# BV-BRC downloader block -# Normalize Docker path -.docker_path <- function(p) gsub("\\\\", "/", normalizePath(p, mustWork = FALSE)) - -# Prefer bash -.pick_shell <- function(image) { - chk <- suppressWarnings(system2("docker", - c( - "run", "--rm", image, "sh", "-lc", - "command -v bash >/dev/null || echo NOBASH" - ), - stdout = TRUE, stderr = TRUE - )) - if (length(chk) && any(grepl("NOBASH", chk))) "sh" else "bash" -} - -# Check if a complete set (.fna + .PATRIC.faa + .PATRIC.gff) exists after DL +#' Helps check if a complete set exists after DL (.fna + .PATRIC.faa + .PATRIC.gff) +#' @keywords internal .is_complete_set <- function(dir, genomeID, min_bytes = 100) { fna <- file.path(dir, paste0(genomeID, ".fna")) faa <- file.path(dir, paste0(genomeID, ".PATRIC.faa")) @@ -1107,11 +1339,14 @@ retrieveMetadata <- function(user_bacs, all(purrr::map_dbl(paths, function(x) file.info(x)$size) > min_bytes) } -# List genomes already completed +#' Helps collate completed genomes into a set +#' @keywords internal .list_complete <- function(dir, genome_ids, min_bytes = 100) { genome_ids[purrr::map_lgl(genome_ids, .is_complete_set, dir = dir, min_bytes = min_bytes)] } +#' Helps in auditing downloaded files to ensure everything's complete per ID +#' @keywords internal # Any genomes missing bits or pieces? Find em .missing_any <- function(dir, genome_ids, min_bytes = 100) { genome_ids[!purrr::map_lgl(genome_ids, .is_complete_set, dir = dir, min_bytes = min_bytes)] @@ -1136,68 +1371,27 @@ retrieveMetadata <- function(user_bacs, df } -# FTPS preferred! HTTPS as a fallback to see if it's just an FTPS issue -.ftp_download_one <- function(genomeID, out_dir, tries = 3L, min_bytes = 100) { - files <- c(".fna", ".PATRIC.faa", ".PATRIC.gff") - dests <- file.path(out_dir, paste0(genomeID, files)) - if (.is_complete_set(out_dir, genomeID, min_bytes)) { - return(TRUE) - } - - get_one <- function(url, dest) { - if (nzchar(Sys.which("wget"))) { - res <- suppressWarnings(system2("wget", - c("-q", "-O", shQuote(dest), shQuote(url)), - stdout = TRUE, stderr = TRUE - )) - st <- attr(res, "status") - if (is.null(st)) st <- 0L - return(st == 0L && file.exists(dest) && file.info(dest)$size > min_bytes) - } else if (nzchar(Sys.which("curl"))) { - curl_args <- if (startsWith(url, "ftps://")) { - c("--ssl-reqd", "-L", "-o", shQuote(dest), shQuote(url)) - } else { - c("-L", "-o", shQuote(dest), shQuote(url)) - } - res <- suppressWarnings(system2("curl", curl_args, stdout = TRUE, stderr = TRUE)) - st <- attr(res, "status") - if (is.null(st)) st <- 0L - return(st == 0L && file.exists(dest) && file.info(dest)$size > min_bytes) - } - FALSE - } +### BV-BRC CLI downloader [slower by comparison, but does not need FTP server] - for (ext_i in seq_along(files)) { - dest <- dests[ext_i] - if (file.exists(dest) && file.info(dest)$size > min_bytes) next - ftps <- sprintf("ftps://ftp.bv-brc.org/genomes/%s/%s%s", genomeID, genomeID, files[ext_i]) - https <- sprintf("https://ftp.bv-brc.org/genomes/%s/%s%s", genomeID, genomeID, files[ext_i]) - - ok <- FALSE - for (k in 1:tries) { - if (get_one(ftps, dest)) { - ok <- TRUE - break - } - } - if (!ok) { - for (k in 1:2) { - if (get_one(https, dest)) { - ok <- TRUE - break - } - } - } - if (!ok) { - return(FALSE) - } - } - .is_complete_set(out_dir, genomeID, min_bytes) -} +#' Helps normalize Docker paths +#' @keywords internal +.docker_path <- function(p) gsub("\\\\", "/", normalizePath(p, mustWork = FALSE)) -# BV-BRC CLI downloader [very slow by comparison, but does not need FTP server] +#' Helps run a shell inside a container, and prefers bash (don't we all?) +#' @keywords internal +.pick_shell <- function(image) { + chk <- suppressWarnings(system2("docker", + c( + "run", "--rm", image, "sh", "-lc", + "command -v bash >/dev/null || echo NOBASH" + ), + stdout = TRUE, stderr = TRUE + )) + if (length(chk) && any(grepl("NOBASH", chk))) "sh" else "bash" +} -# p3-dump-genomes to fetch FASTA and .gto files +#' Using p3-dump-genomes in CLI to fetch FASTA and .gto files +#' @keywords internal .cli_dump_fastas_gto_chunk <- function(image, out_dir, genome_ids, tag, tries = 3L) { out_dir <- normalizePath(out_dir, mustWork = FALSE) dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) @@ -1242,7 +1436,8 @@ retrieveMetadata <- function(user_bacs, st == 0L } -# Export GFF from GTO per genomes in each chunk +#' Exports GFF files from the downloaded GTO per genome in each chunk +#' @keywords internal .cli_export_gff_chunk <- function(image, out_dir, genome_ids, tag, tries = 3L) { out_dir <- normalizePath(out_dir, mustWork = FALSE) ids_file <- file.path(out_dir, paste0("ids_", tag, ".txt")) @@ -1333,36 +1528,48 @@ retrieveGenomes <- function(base_dir = ".", method = c("ftp", "cli"), image = "danylmb/bvbrc:5.3", skip_existing = TRUE, - ftp_workers = 8L, + ftp_workers = 4L, cli_fasta_workers = 4L, cli_gff_workers = 4L, chunk_size = 50L, + evidence_mode = c("lab_only", "lab_or_comp", "comp_only", "any"), # NEW verbose = TRUE) { method <- match.arg(method) + evidence_mode <- match.arg(evidence_mode) base_dir <- normalizePath(base_dir, mustWork = FALSE) - # IDs from .filterGenomes() - if (isTRUE(verbose)) message("Filtering genomes before download.") - f_out <- .filterGenomes(base_dir = base_dir, user_bacs = user_bacs, verbose = verbose) - if (is.null(f_out)) { - return(character()) + # Use 'filtered' if already prepared, or start filtering + if (isTRUE(verbose)) message("Preparing download set (checking for existing 'filtered').") + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) + db_path <- paths$db_path + con0 <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path) + has_filtered <- "filtered" %in% DBI::dbListTables(con0) + if (has_filtered) { + if (isTRUE(verbose)) message("Using existing 'filtered' table (skipping re-filter).") + con <- con0 + tbl <- "filtered" + } else { + if (isTRUE(verbose)) message("No 'filtered' table found; filtering now.") + f_out <- .filterGenomes( + base_dir = base_dir, + user_bacs = user_bacs, + evidence_mode = evidence_mode, + verbose = verbose + ) + con <- f_out$duckdbConnection + tbl <- f_out$table_name } - con <- f_out$duckdbConnection - tbl <- f_out$table_name ids <- tibble::as_tibble(DBI::dbReadTable(con, tbl)) |> dplyr::distinct(`genome.genome_id`) |> dplyr::pull(`genome.genome_id`) - # Set up the paths and such - paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) - bug_dir <- dirname(paths$db_path) + bug_dir <- dirname(db_path) genome_path <- file.path(bug_dir, "genomes") logs_dir <- file.path(base_dir, "data", "logs") dir.create(genome_path, recursive = TRUE, showWarnings = FALSE) dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE) - # Adding support for resuming a download if (isTRUE(skip_existing)) { already <- .list_complete(genome_path, ids) if (isTRUE(verbose)) message(length(already), " genomes already completed; skipping.") @@ -1380,7 +1587,6 @@ retrieveGenomes <- function(base_dir = ".", return(all_complete) } - # FTP method -- preferred if server is available if (identical(method, "ftp")) { if (isTRUE(verbose)) message("Trying FTPS download. Workers=", ftp_workers) ftp_param <- BiocParallel::SnowParam(workers = max(1L, ftp_workers)) @@ -1534,6 +1740,9 @@ genomeList <- function(base_dir = ".", #' `"ftp"` (default) or `"cli"`. #' @param overwrite Logical. Passed to metadata filtering and DuckDB creation. #' Default FALSE. +#' @param evidence_mode Character. Sets what types of AMR evidence is acceptable. +#' Default `lab_only`. `any` will not require AMR data for downloads. This will +#' return very large download lists for many species! #' @param verbose Logical. Print progress messages. Default TRUE. #' #' @return A list (the output of `genomeList()`), containing: @@ -1545,31 +1754,66 @@ prepareGenomes <- function(user_bacs, base_dir = ".", method = c("ftp", "cli"), overwrite = FALSE, + evidence_mode = c("lab_only", "lab_or_comp", "comp_only", "any"), verbose = TRUE) { method <- match.arg(method) + evidence_mode <- match.arg(evidence_mode) base_dir <- normalizePath(base_dir, mustWork = FALSE) - # Ensure the BV-BRC metadata cache exists, fetch if missing .ensure_bvbrc_cache(base_dir = base_dir, verbose = verbose) - if (isTRUE(verbose)) message("Step 1: Downloading genomes from BV-BRC") + if (isTRUE(verbose)) message("Step 0: Building AMR metadata (retrieveMetadata)") + invisible(retrieveMetadata( + user_bacs = user_bacs, + filter_type = "AMR", + base_dir = base_dir, + abx = "All", + overwrite = overwrite, + verbose = verbose + )) + if (isTRUE(verbose)) message("Step 1: Filtering genomes for download by evidence: ", evidence_mode) + f_out <- .filterGenomes( + base_dir = base_dir, + user_bacs = user_bacs, + evidence_mode = evidence_mode, + verbose = verbose, + fallback_to_bvbrc_cache = FALSE + ) + if (is.null(f_out)) { + message("No genomes available after evidence filtering.") + return(NULL) + } + + # A little summary of what's left after filtering (or not) + paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs) + con <- DBI::dbConnect(duckdb::duckdb(), dbdir = paths$db_path, read_only = TRUE) + n_filtered <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM filtered')$n + n_meta <- if ("genome_data" %in% DBI::dbListTables(con)) DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM genome_data')$n else NA + n_amr <- if ("amr_phenotype" %in% DBI::dbListTables(con)) DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome_drug.genome_id") AS n FROM amr_phenotype')$n else NA + DBI::dbDisconnect(con, shutdown = TRUE) + if (isTRUE(verbose)) { + message(sprintf( + "Evidence filter summary: filtered=%d | genomes with AMR=%s | genomes with genome_data=%s", + n_filtered, ifelse(is.na(n_amr), "NA", n_amr), ifelse(is.na(n_meta), "NA", n_meta) + )) + } + + if (isTRUE(verbose)) message("Step 2: Downloading genomes from BV-BRC (", method, ")") ids <- retrieveGenomes( base_dir = base_dir, user_bacs = user_bacs, method = method, skip_existing = !overwrite, + evidence_mode = evidence_mode, verbose = verbose ) - if (length(ids) == 0L) { - message("No genomes available after filtering.") + message("No genomes downloaded.") return(NULL) } - if (isTRUE(verbose)) message("Step 2: Formatting data into a database for further processing") - - # List local files into per-selection DB + if (isTRUE(verbose)) message("Step 3: Formatting data into a database for further processing") out <- genomeList( base_dir = base_dir, user_bacs = user_bacs, @@ -1577,7 +1821,7 @@ prepareGenomes <- function(user_bacs, ) if (isTRUE(verbose)) { - message("Done. Files are ready! Please continue to downstream processing by `runDataProcessing()`") + message("Done. Files are ready! Continue with downstream processing with runDataProcessing().") } out } diff --git a/R/data_processing.R b/R/data_processing.R index 512fdd4..17b7682 100644 --- a/R/data_processing.R +++ b/R/data_processing.R @@ -1,3 +1,6 @@ +#' @importFrom data.table := +NULL + #' Normalize a host filesystem path for use in Docker #' #' Converts Windows and mixed-separator paths to forward slashes @@ -405,12 +408,13 @@ names_to = "genome_ids", values_to = "protein_ids" ) |> - dplyr::mutate(genome_ids = gsub(".PATRIC", "", genome_ids)) |> + dplyr::mutate(genome_ids = sub("\\.PATRIC\\.\\.\\..*$", "", genome_ids)) |> dplyr::select(genome_ids, Gene, protein_ids) |> dplyr::distinct() |> dplyr::filter(!is.na(protein_ids)) |> tidyr::separate_rows(protein_ids, sep = ";") |> - dplyr::filter(!stringr::str_detect(protein_ids, "_pseudo")) |> + # dplyr::filter(!stringr::str_detect(protein_ids, "_pseudo")) |> + dplyr::mutate(protein_ids = gsub("_pseudo", "", protein_ids)) |> DBI::dbWriteTable(conn = con, name = "genome_gene_protein", overwrite = TRUE) } @@ -730,7 +734,7 @@ runPanaroo2Duckdb <- function(duckdb_path, # This finds the reference cluster ID and names the cluster with it ref_line <- grep("\\*$", cluster_lines, value = TRUE) ref_id <- if (length(ref_line) > 0) { - stringr::str_extract(ref_line, "fig\\|[0-9]+\\.[0-9]+\\.peg\\.[0-9]+") + stringr::str_extract(ref_line, "fig\\|[0-9]+\\.[0-9]+\\.peg(?:sc)?\\.[0-9]+") } else { paste0("Cluster_", i - 1) } @@ -738,7 +742,7 @@ runPanaroo2Duckdb <- function(duckdb_path, # Pull genome IDs genome_matches <- stringr::str_match( cluster_lines, - "fig\\|([0-9]+\\.[0-9]+)\\.peg\\.[0-9]+" + "fig\\|([0-9]+\\.[0-9]+)\\.peg(?:sc)?\\.[0-9]+" )[, 2] genome_matches <- genome_matches[!is.na(genome_matches)] @@ -753,6 +757,58 @@ runPanaroo2Duckdb <- function(duckdb_path, cluster_map } +#' Parse CD-HIT `.clstr` output into a long-format mapping +#' +#' Reads a CD-HIT `.clstr` file and constructs a mapping of clusters to member feature ids. +#' +#' @param clustered_faa Base path to CD-HIT output (without `.clstr` extension). +#' +#' @return A tibble with columns `cluster` and `member`. +#' +#' @keywords internal +.extractMembersInClusters <- function(clustered_faa) { + clstr <- paste0(clustered_faa, ".clstr") + if (!file.exists(clstr)) { + stop( + "CD-HIT cluster file not found: ", clstr, + "\nEnsure .runCDHIT() completed successfully and produced the .clstr file." + ) + } + + lines <- data.table::fread(clstr, sep = "\n", header = FALSE)$V1 + cluster_ids <- grep("^>Cluster", lines) + cluster_member <- data.table::data.table() + + for (i in seq_along(cluster_ids)) { + start <- cluster_ids[i] + 1 + end <- if (i < length(cluster_ids)) cluster_ids[i + 1] - 1 else length(lines) + cluster_lines <- lines[start:end] + + # This finds the reference cluster ID and names the cluster with it + ref_line <- grep("\\*$", cluster_lines, value = TRUE) + ref_id <- if (length(ref_line) > 0) { + stringr::str_extract(ref_line, "fig\\|[0-9]+\\.[0-9]+\\.peg(?:sc)?\\.[0-9]+") + } else { + paste0("Cluster_", i - 1) + } + + # Pull genome IDs + members <- stringr::str_match( + cluster_lines, + "fig\\|([0-9]+\\.[0-9]+)\\.peg(?:sc)?\\.[0-9]+" + )[, 1] + members <- members[!is.na(members)] + + if (length(members) > 0) { + cluster_member <- data.table::rbindlist(list( + cluster_member, + data.table::data.table(cluster = ref_id, member = members) + ), use.names = TRUE) + } + } + + tibble::as_tibble(cluster_member) +} #' Build genome-by-protein-cluster count matrix #' @@ -796,8 +852,8 @@ buildMatrices <- function(cluster_map) .buildProtMatrices(cluster_map) names_faa <- names(cdhit_output_faa) |> tibble::as_tibble() |> dplyr::mutate( - proteinID = stringr::str_extract(value, "^fig\\|[0-9]+\\.[0-9]+\\.peg\\.[0-9]+"), - locus_tag = stringr::str_match(value, "peg\\.[0-9]+\\|([^\\s]+)")[, 2], + proteinID = stringr::str_extract(value, "^fig\\|[0-9]+\\.[0-9]+\\.peg(?:sc)?\\.[0-9]+"), + locus_tag = stringr::str_match(value, "peg(?:sc)?\\.[0-9]+\\|([^\\s]+)")[, 2], proteinName = stringr::str_trim(stringr::str_match(value, "\\|[^\\s]+\\s+(.*?)\\s+\\[")[, 2]) ) |> dplyr::select(-value) @@ -846,11 +902,15 @@ CDHIT2duckdb <- function(duckdb_path, clustered_faa <- Biostrings::readAAStringSet(cdhit_outputs$clustered_faa) DBI::dbWriteTable(con, "protein_cluster_seq", tibble::tibble( - name = names(clustered_faa) |> stringr::str_extract("fig\\|[0-9]+\\.[0-9]+\\.peg\\.[0-9]+"), + name = names(clustered_faa) |> stringr::str_extract("fig\\|[0-9]+\\.[0-9]+\\.peg(?:sc)?\\.[0-9]+"), sequence = as.character(clustered_faa) ), overwrite = TRUE ) + + cluster_member <- .extractMembersInClusters(cdhit_outputs$clustered_faa) + DBI::dbWriteTable(con, "protein_members", cluster_member, overwrite = TRUE) + invisible(TRUE) } @@ -1226,7 +1286,16 @@ domainFromIPR <- function(duckdb_path, } # Clean BV-BRC metadata, then save as Parquet files -cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/") { +#' +#' @param duckdb_path +#' @param path +#' @param ref_file_path +#' +#' @returns +#' @export +#' +#' @examples +cleanMetaData <- function(duckdb_path, path, ref_file_path = "data_raw/") { duckdb_path <- normalizePath(duckdb_path) # If no explicit path is provided (or a generic one), choose results// when # the DuckDB lives under data//, or else fall back to the DuckDB directory. @@ -1258,8 +1327,11 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/") { dplyr::tbl(con, "filtered") |> tibble::as_tibble() |> + dplyr::select("genome.genome_id") |> + dplyr::left_join(dplyr::tbl(con, "metadata") |> + tibble::as_tibble(), by = dplyr::join_by("genome.genome_id" == "genome_drug.genome_id")) |> dplyr::select( - "genome_drug.genome_id", "genome_drug.antibiotic", + "genome.genome_id", "genome_drug.antibiotic", "genome_drug.genome_name", "genome_drug.laboratory_typing_method", "genome_drug.resistant_phenotype", "genome_drug.taxon_id", "genome_drug.pmid", "genome.collection_year", @@ -1271,19 +1343,23 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/") { dplyr::left_join(drug_class, by = c("cleaned_drug" = "drug")) |> dplyr::left_join(drug_abbr, by = c("cleaned_drug" = "drug")) |> dplyr::left_join(class_abbr, by = "drug_class") |> - DBI::dbWriteTable(conn = con, name = "filtered", overwrite = TRUE) + dplyr::filter(genome_drug.resistant_phenotype %in% c("Resistant", "Susceptible")) |> + DBI::dbWriteTable(conn = con, name = "filtered_metadata", overwrite = TRUE) - resistance_summary <- dplyr::tbl(con, "filtered") |> - tibble::as_tibble() |> + resistance_summary <- DBI::dbReadTable(con, "filtered_metadata") |> dplyr::filter(genome_drug.resistant_phenotype == "Resistant") |> - dplyr::group_by(genome_drug.genome_id) |> + dplyr::group_by(genome.genome_id) |> dplyr::summarise( - num_resistant_classes = dplyr::n_distinct(drug_class), - resistant_classes = paste(unique(class_abbr), collapse = "_") + resistant_classes = paste(sort(unique(class_abbr)), collapse = "_"), + .groups = "drop" + ) |> + dplyr::collect() |> + dplyr::mutate( + num_resistant_classes = stringr::str_count(resistant_classes, "_") + 1 ) - year_breaks <- seq(1980, 2023, by = 5) - dplyr::tbl(con, "filtered") |> + year_breaks <- seq(1980, 2026, by = 5) + dplyr::tbl(con, "filtered_metadata") |> tibble::as_tibble() |> dplyr::mutate(genome_drug.antibiotic = cleaned_drug) |> dplyr::select(-cleaned_drug) |> @@ -1291,7 +1367,7 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/") { dplyr::rename("cleaned_country" = "clean_name", "country_abbr" = "short_name") |> dplyr::mutate(genome.isolation_country = cleaned_country) |> dplyr::select(-cleaned_country) |> - dplyr::left_join(resistance_summary, by = "genome_drug.genome_id") |> + dplyr::left_join(resistance_summary, by = "genome.genome_id") |> dplyr::mutate(resistant_classes = dplyr::case_when( is.na(resistant_classes) ~ genome_drug.resistant_phenotype, TRUE ~ resistant_classes @@ -1311,6 +1387,80 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/") { )) |> DBI::dbWriteTable(conn = con, name = "cleaned_metadata", overwrite = TRUE) + # Parquet output path + metadata_parquet <- file.path(path, "metadata.parquet") # cleaned_metadata exported as 'metadata' + + # Also export AMR/genome/original metadata + amr_phenotype_parquet <- file.path(path, "amr_phenotype.parquet") + genome_data_parquet <- file.path(path, "genome_data.parquet") + original_metadata_parquet <- file.path(path, "original_metadata.parquet") + + writeCompressedParquet <- function(df, path) { + arrow::write_parquet( + df, + path, + compression = "zstd", + compression_level = 9, + use_dictionary = TRUE + ) + } + + db_name <- duckdb_path |> + stringr::str_split_i(".duckdb", i = 1) |> + paste0("_parquet.duckdb") + con_new <- DBI::dbConnect(duckdb::duckdb(), db_name) + on.exit(try(DBI::dbDisconnect(con_new, shutdown = FALSE), silent = TRUE), add = TRUE) + + # Views below reference parquet files by bare filename. Point DuckDB at the + # parquet directory so schema inference at CREATE VIEW time can resolve them. + DBI::dbExecute(con_new, sprintf("SET file_search_path='%s'", path)) + + # cleaned_metadata -> parquet + view (as metadata) + DBI::dbReadTable(con, "cleaned_metadata") |> writeCompressedParquet(metadata_parquet) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW metadata AS SELECT * FROM read_parquet('%s')", basename(metadata_parquet))) + + # debug/complete views: amr_phenotype, genome_data, original_metadata + DBI::dbReadTable(con, "amr_phenotype") |> writeCompressedParquet(amr_phenotype_parquet) + DBI::dbReadTable(con, "genome_data") |> writeCompressedParquet(genome_data_parquet) + DBI::dbReadTable(con, "metadata") |> writeCompressedParquet(original_metadata_parquet) + + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW amr_phenotype AS SELECT * FROM read_parquet('%s')", basename(amr_phenotype_parquet))) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW genome_data AS SELECT * FROM read_parquet('%s')", basename(genome_data_parquet))) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW original_metadata AS SELECT * FROM read_parquet('%s')", basename(original_metadata_parquet))) + + invisible(TRUE) +} + +# Clean feature matrices, then save as Parquet files +#' +#' @param duckdb_path +#' @param path +#' +#' @returns +#' @export +#' +#' @examples +cleanData <- function(duckdb_path, path) { + duckdb_path <- normalizePath(duckdb_path) + # If no explicit path is provided (or a generic one), choose results// when + # the DuckDB lives under data//, or else fall back to the DuckDB directory. + if (missing(path) || path %in% c(".", "results", "results/")) { + bug_dir <- dirname(duckdb_path) + mapped_results <- sub( + paste0(.Platform$file.sep, "data", .Platform$file.sep), + paste0(.Platform$file.sep, "results", .Platform$file.sep), + bug_dir, + fixed = TRUE + ) + path <- if (!identical(mapped_results, bug_dir)) mapped_results else bug_dir + } + + path <- normalizePath(path, mustWork = FALSE) + if (!dir.exists(path)) dir.create(path, recursive = TRUE) + + con <- DBI::dbConnect(duckdb::duckdb(), duckdb_path) + on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE) + # Parquet output paths genes_parquet <- file.path(path, "gene_count.parquet") gene_names_parquet <- file.path(path, "gene_names.parquet") @@ -1321,17 +1471,11 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/") { proteins_parquet <- file.path(path, "protein_count.parquet") domains_parquet <- file.path(path, "domain_count.parquet") - metadata_parquet <- file.path(path, "metadata.parquet") # cleaned_metadata exported as 'metadata' - domain_names_parquet <- file.path(path, "domain_names.parquet") protein_names_parquet <- file.path(path, "protein_names.parquet") protein_cluster_seq_parquet <- file.path(path, "protein_seqs.parquet") - - # Also export AMR/genome/original metadata - amr_phenotype_parquet <- file.path(path, "amr_phenotype.parquet") - genome_data_parquet <- file.path(path, "genome_data.parquet") - original_metadata_parquet <- file.path(path, "original_metadata.parquet") + protein_cluster_member_parquet <- file.path(path, "protein_members.parquet") writeCompressedParquet <- function(df, path) { arrow::write_parquet( @@ -1385,10 +1529,6 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/") { writeCompressedParquet(struct_parquet) DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW struct AS SELECT * FROM read_parquet('%s')", basename(struct_parquet))) - # cleaned_metadata -> parquet + view (as metadata) - DBI::dbReadTable(con, "cleaned_metadata") |> writeCompressedParquet(metadata_parquet) - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW metadata AS SELECT * FROM read_parquet('%s')", basename(metadata_parquet))) - # names/seq tables -> parquet + views DBI::dbReadTable(con, "gene_names") |> writeCompressedParquet(gene_names_parquet) DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW gene_names AS SELECT * FROM read_parquet('%s')", basename(gene_names_parquet))) @@ -1409,18 +1549,12 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/") { DBI::dbReadTable(con, "protein_cluster_seq") |> writeCompressedParquet(protein_cluster_seq_parquet) DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW protein_seqs AS SELECT * FROM read_parquet('%s')", basename(protein_cluster_seq_parquet))) + DBI::dbReadTable(con, "protein_members") |> writeCompressedParquet(protein_cluster_member_parquet) + DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW protein_members AS SELECT * FROM read_parquet('%s')", protein_cluster_member_parquet)) + DBI::dbReadTable(con, "genome_gene_protein") |> writeCompressedParquet(genome_gene_protein_parquet) DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW genome_gene_protein AS SELECT * FROM read_parquet('%s')", basename(genome_gene_protein_parquet))) - # debug/complete views: amr_phenotype, genome_data, original_metadata - DBI::dbReadTable(con, "amr_phenotype") |> writeCompressedParquet(amr_phenotype_parquet) - DBI::dbReadTable(con, "genome_data") |> writeCompressedParquet(genome_data_parquet) - DBI::dbReadTable(con, "metadata") |> writeCompressedParquet(original_metadata_parquet) - - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW amr_phenotype AS SELECT * FROM read_parquet('%s')", basename(amr_phenotype_parquet))) - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW genome_data AS SELECT * FROM read_parquet('%s')", basename(genome_data_parquet))) - DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW original_metadata AS SELECT * FROM read_parquet('%s')", basename(original_metadata_parquet))) - invisible(TRUE) } @@ -1641,7 +1775,8 @@ runDataProcessing <- function(duckdb_path, stop("`ref_file_path` (directory with reference TSVs) must be provided to cleanData().") } if (isTRUE(verbose)) message("Cleaning metadata and exporting Parquet-backed views.") - cleanData(duckdb_path = duckdb_path, path = out_dir, ref_file_path = ref_file_path) + cleanMetaData(duckdb_path = duckdb_path, path = out_dir, ref_file_path = ref_file_path) + cleanData(duckdb_path = duckdb_path, path = out_dir) parquet_duckdb_path <- paste0( stringr::str_split_i(duckdb_path, ".duckdb", i = 1), diff --git a/man/dot-audit_gaps.Rd b/man/dot-audit_gaps.Rd new file mode 100644 index 0000000..86009c0 --- /dev/null +++ b/man/dot-audit_gaps.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_curation.R +\name{.audit_gaps} +\alias{.audit_gaps} +\title{Helps in auditing downloaded files to ensure everything's complete per ID} +\usage{ +.audit_gaps(out_dir, ids, min_bytes = 100) +} +\description{ +Helps in auditing downloaded files to ensure everything's complete per ID +} +\keyword{internal} diff --git a/man/dot-cli_dump_fastas_gto_chunk.Rd b/man/dot-cli_dump_fastas_gto_chunk.Rd new file mode 100644 index 0000000..5d9327c --- /dev/null +++ b/man/dot-cli_dump_fastas_gto_chunk.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_curation.R +\name{.cli_dump_fastas_gto_chunk} +\alias{.cli_dump_fastas_gto_chunk} +\title{Using p3-dump-genomes in CLI to fetch FASTA and .gto files} +\usage{ +.cli_dump_fastas_gto_chunk(image, out_dir, genome_ids, tag, tries = 3L) +} +\description{ +Using p3-dump-genomes in CLI to fetch FASTA and .gto files +} +\keyword{internal} diff --git a/man/dot-cli_export_gff_chunk.Rd b/man/dot-cli_export_gff_chunk.Rd new file mode 100644 index 0000000..061da41 --- /dev/null +++ b/man/dot-cli_export_gff_chunk.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_curation.R +\name{.cli_export_gff_chunk} +\alias{.cli_export_gff_chunk} +\title{Exports GFF files from the downloaded GTO per genome in each chunk} +\usage{ +.cli_export_gff_chunk(image, out_dir, genome_ids, tag, tries = 3L) +} +\description{ +Exports GFF files from the downloaded GTO per genome in each chunk +} +\keyword{internal} diff --git a/man/dot-create_amr_tagged_view.Rd b/man/dot-create_amr_tagged_view.Rd new file mode 100644 index 0000000..442c3bf --- /dev/null +++ b/man/dot-create_amr_tagged_view.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_curation.R +\name{.create_amr_tagged_view} +\alias{.create_amr_tagged_view} +\title{Helps tag genomes with their AMR evidence for parsing} +\usage{ +.create_amr_tagged_view(con) +} +\description{ +Helps tag genomes with their AMR evidence for parsing +} +\keyword{internal} diff --git a/man/dot-docker_path.Rd b/man/dot-docker_path.Rd index 8ea1e5a..d0b6f70 100644 --- a/man/dot-docker_path.Rd +++ b/man/dot-docker_path.Rd @@ -1,9 +1,11 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data_processing.R +% Please edit documentation in R/data_curation.R, R/data_processing.R \name{.docker_path} \alias{.docker_path} -\title{Normalize a host filesystem path for use in Docker} +\title{Helps normalize Docker paths} \usage{ +.docker_path(p) + .docker_path(p) } \arguments{ diff --git a/man/dot-extractMembersInClusters.Rd b/man/dot-extractMembersInClusters.Rd new file mode 100644 index 0000000..e45a8db --- /dev/null +++ b/man/dot-extractMembersInClusters.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_processing.R +\name{.extractMembersInClusters} +\alias{.extractMembersInClusters} +\title{Parse CD-HIT \code{.clstr} output into a long-format mapping} +\usage{ +.extractMembersInClusters(clustered_faa) +} +\arguments{ +\item{clustered_faa}{Base path to CD-HIT output (without \code{.clstr} extension).} +} +\value{ +A tibble with columns \code{cluster} and \code{member}. +} +\description{ +Reads a CD-HIT \code{.clstr} file and constructs a mapping of clusters to member feature ids. +} +\keyword{internal} diff --git a/man/dot-filterGenomes.Rd b/man/dot-filterGenomes.Rd index d6871df..eb31db4 100644 --- a/man/dot-filterGenomes.Rd +++ b/man/dot-filterGenomes.Rd @@ -7,11 +7,30 @@ .filterGenomes( user_bacs, base_dir = ".", + evidence_mode = c("lab_only", "lab_or_comp", "comp_only", "any"), verbose = TRUE, fallback_to_bvbrc_cache = TRUE ) } +\arguments{ +\item{evidence_mode}{Character. One of: +"lab_only" (default) -> only laboratory evidence +"lab_or_comp" -> laboratory OR computational evidence +"comp_only" -> only computational evidence +"any" -> no AMR required (from genome_data; Good only)} +} \value{ +A list with a DuckDB connection and table_name = "filtered" +Filter genomes by AMR phenotype and metadata, and store results in DuckDB + +Preferred path: use per-selection DB "metadata" table (from retrieveMetadata()) and +apply evidence & genome_quality filters. + +Fallback path: if "metadata" is missing and fallback_to_bvbrc_cache = TRUE, +read BV-BRC cache at /data/bvbrc/bvbrcData.duckdb ("bvbrc_bac_data"), +derive genome IDs from user_bacs (taxon IDs or species substring), and +write a minimal "filtered" table (without AMR evidence filtering). + A list with a DuckDB connection and table_name = "filtered" } \description{ diff --git a/man/dot-ftpes_download_one.Rd b/man/dot-ftpes_download_one.Rd new file mode 100644 index 0000000..b89f5d4 --- /dev/null +++ b/man/dot-ftpes_download_one.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_curation.R +\name{.ftpes_download_one} +\alias{.ftpes_download_one} +\title{Helps appropriately interface with BV-BRC FTPS server, and avoids getting stuck +when malformed files can hang an FTPS connection by introducing safeguards} +\usage{ +.ftpes_download_one( + genomeID, + out_dir, + connect_timeout = 10L, + max_time = 30L, + speed_time = 30L, + speed_limit = 2048L, + min_bytes = 100L +) +} +\description{ +Helps appropriately interface with BV-BRC FTPS server, and avoids getting stuck +when malformed files can hang an FTPS connection by introducing safeguards +} +\keyword{internal} diff --git a/man/dot-ftpes_download_two_pass.Rd b/man/dot-ftpes_download_two_pass.Rd new file mode 100644 index 0000000..e15d299 --- /dev/null +++ b/man/dot-ftpes_download_two_pass.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_curation.R +\name{.ftpes_download_two_pass} +\alias{.ftpes_download_two_pass} +\title{Helps manage FTPS downloading from BV-BRC, tryng a quick download first, and +if that fails, trying a longer timeout 2nd pass at the end in case it was a +hiccup. If 2nd pass fails, log and give up on that file.} +\usage{ +.ftpes_download_two_pass( + genome_ids, + out_dir, + workers_first = 4L, + workers_second = 4L, + log_file = NULL +) +} +\description{ +Helps manage FTPS downloading from BV-BRC, tryng a quick download first, and +if that fails, trying a longer timeout 2nd pass at the end in case it was a +hiccup. If 2nd pass fails, log and give up on that file. +} +\keyword{internal} diff --git a/man/dot-generateDBname.Rd b/man/dot-generateDBname.Rd index bebcf43..30ce51e 100644 --- a/man/dot-generateDBname.Rd +++ b/man/dot-generateDBname.Rd @@ -26,5 +26,4 @@ For numeric taxon IDs, it prefixes them with "tid_". .generateDBname(c("90371", "Bacillus subtilis")) .generateDBname(c("12345", "Escherichia coli", "Lactobacillus")) } - -} +\keyword{internal} diff --git a/man/dot-id_checker.Rd b/man/dot-id_checker.Rd new file mode 100644 index 0000000..00fd8eb --- /dev/null +++ b/man/dot-id_checker.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_curation.R +\name{.id_checker} +\alias{.id_checker} +\title{Helps ensure trailing 0s are retained in genome IDs for proper downloading} +\usage{ +.id_checker(x) +} +\description{ +Helps ensure trailing 0s are retained in genome IDs for proper downloading +} +\keyword{internal} diff --git a/man/dot-is_complete_set.Rd b/man/dot-is_complete_set.Rd new file mode 100644 index 0000000..9bd4758 --- /dev/null +++ b/man/dot-is_complete_set.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_curation.R +\name{.is_complete_set} +\alias{.is_complete_set} +\title{Helps check if a complete set exists after DL (.fna + .PATRIC.faa + .PATRIC.gff)} +\usage{ +.is_complete_set(dir, genomeID, min_bytes = 100) +} +\description{ +Helps check if a complete set exists after DL (.fna + .PATRIC.faa + .PATRIC.gff) +} +\keyword{internal} diff --git a/man/dot-list_complete.Rd b/man/dot-list_complete.Rd new file mode 100644 index 0000000..f473700 --- /dev/null +++ b/man/dot-list_complete.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_curation.R +\name{.list_complete} +\alias{.list_complete} +\title{Helps collate completed genomes into a set} +\usage{ +.list_complete(dir, genome_ids, min_bytes = 100) +} +\description{ +Helps collate completed genomes into a set +} +\keyword{internal} diff --git a/man/dot-pick_shell.Rd b/man/dot-pick_shell.Rd new file mode 100644 index 0000000..fb4e8bf --- /dev/null +++ b/man/dot-pick_shell.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_curation.R +\name{.pick_shell} +\alias{.pick_shell} +\title{Helps run a shell inside a container, and prefers bash (don't we all?)} +\usage{ +.pick_shell(image) +} +\description{ +Helps run a shell inside a container, and prefers bash (don't we all?) +} +\keyword{internal} diff --git a/man/dot-retrieveQueryIDs.Rd b/man/dot-retrieveQueryIDs.Rd index 094dde9..e8fbcce 100644 --- a/man/dot-retrieveQueryIDs.Rd +++ b/man/dot-retrieveQueryIDs.Rd @@ -6,21 +6,11 @@ \usage{ .retrieveQueryIDs(base_dir = ".", user_bacs, overwrite = FALSE, verbose = TRUE) } -\arguments{ -\item{base_dir}{Character. Project root directory. Default ".".} - -\item{user_bacs}{Character vector. Mixed inputs of taxon IDs and/or species names.} - -\item{overwrite}{Logical. If FALSE and the DuckDB already exists for this selection, abort. Default: FALSE.} - -\item{verbose}{Logical.} -} \value{ -A numeric vector of distinct \code{genome.genome_id}, or NULL if none found. +A character vector of distinct \code{genome.genome_id}, or NULL if none found. } \description{ -Resolves user-provided taxa to taxon IDs, queries BV-BRC per unique taxon ID, -and returns distinct genome IDs. Uses a per-selection DuckDB located under: -/data//\if{html}{\out{}}.duckdb -BV-BRC column names are preserved. +Resolves user-provided taxa to taxon IDs using the local BV-BRC cache, then +returns distinct genome IDs (Good + WGS/Complete). Also writes a 'bac_data' +table to the per-selection DuckDB with a cache snapshot of core fields. } diff --git a/man/prepareGenomes.Rd b/man/prepareGenomes.Rd index 204f3f6..a764058 100644 --- a/man/prepareGenomes.Rd +++ b/man/prepareGenomes.Rd @@ -9,6 +9,7 @@ prepareGenomes( base_dir = ".", method = c("ftp", "cli"), overwrite = FALSE, + evidence_mode = c("lab_only", "lab_or_comp", "comp_only", "any"), verbose = TRUE ) } @@ -24,6 +25,10 @@ prepareGenomes( \item{overwrite}{Logical. Passed to metadata filtering and DuckDB creation. Default FALSE.} +\item{evidence_mode}{Character. Sets what types of AMR evidence is acceptable. +Default \code{lab_only}. \code{any} will not require AMR data for downloads. This will +return very large download lists for many species!} + \item{verbose}{Logical. Print progress messages. Default TRUE.} } \value{ diff --git a/man/retrieveGenomes.Rd b/man/retrieveGenomes.Rd index 4006e94..1800b5d 100644 --- a/man/retrieveGenomes.Rd +++ b/man/retrieveGenomes.Rd @@ -10,10 +10,11 @@ retrieveGenomes( method = c("ftp", "cli"), image = "danylmb/bvbrc:5.3", skip_existing = TRUE, - ftp_workers = 8L, + ftp_workers = 4L, cli_fasta_workers = 4L, cli_gff_workers = 4L, chunk_size = 50L, + evidence_mode = c("lab_only", "lab_or_comp", "comp_only", "any"), verbose = TRUE ) } diff --git a/man/retrieveMetadata.Rd b/man/retrieveMetadata.Rd index e450626..6c92a36 100644 --- a/man/retrieveMetadata.Rd +++ b/man/retrieveMetadata.Rd @@ -44,6 +44,6 @@ Tables written: \itemize{ \item amr_phenotype \item genome_data -\item metadata (inner join on genome ID field names as returned by BV-BRC) +\item metadata (join on genome IDs returned by BV-BRC) } }