diff --git a/NAMESPACE b/NAMESPACE
index a6b5194..786ac51 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -3,6 +3,8 @@
 export(.extractAMRtable)
 export(.updateBVBRCdata)
 export(CDHIT2duckdb)
+export(cleanData)
+export(cleanMetaData)
 export(prepareGenomes)
 export(retrieveGenomes)
 export(retrieveMetadata)
diff --git a/R/data_curation.R b/R/data_curation.R
index 6e71824..b58cf3c 100644
--- a/R/data_curation.R
+++ b/R/data_curation.R
@@ -1,3 +1,177 @@
+#' Helps ensure trailing 0s are retained in genome IDs for proper downloading
+#' @keywords internal
+.id_checker <- function(x) {
+  # Taxon IDs are just numbers, genome IDs have decimals, this tells them apart
+  grepl("^[0-9]+$", x)
+}
+
+#' Helps tag genomes with their AMR evidence for parsing
+#' @keywords internal
+.create_amr_tagged_view <- function(con) {
+  lab_methods <- c("Disk diffusion", "MIC", "Broth dilution", "Agar dilution")
+  lab_list_sql <- paste(DBI::dbQuoteString(con, lab_methods), collapse = ", ")
+  comp_str_sql <- DBI::dbQuoteString(con, "Computational Method")
+
+  DBI::dbExecute(
+    con,
+    sprintf(
+      "CREATE OR REPLACE VIEW amr_phenotype_tagged AS
+       SELECT
+         *,
+         CASE WHEN \"genome_drug.laboratory_typing_method\" IN (%s) THEN 1 ELSE 0 END AS is_lab_row,
+         CASE WHEN
+           (\"genome_drug.evidence\" = %s)
+           OR (COALESCE(\"genome_drug.computational_method\", '') <> '')
+           OR (\"genome_drug.laboratory_typing_method\" = 'Computational Prediction')
+         THEN 1 ELSE 0 END AS is_comp_row
+       FROM amr_phenotype",
+      lab_list_sql, comp_str_sql
+    )
+  )
+}
+
+#' Helps appropriately interface with BV-BRC FTPS server, and avoids getting stuck
+#' when malformed files can hang an FTPS connection by introducing safeguards
+#' @keywords internal
+.ftpes_download_one <- function(genomeID, out_dir,
+                                connect_timeout = 10L,
+                                max_time = 30L,
+                                speed_time = 30L, # end if <speed_limit for >speed_time
+                                speed_limit = 2048L, # B/s
+                                min_bytes = 100L) {
+  dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
+
+  exts <- c(".fna", ".PATRIC.faa", ".PATRIC.gff")
+  for (ext in exts) {
+    dest <- file.path(out_dir, paste0(genomeID, ext))
+    dest_tmp <- paste0(dest, ".tmp")
+
+    # Skip if we already completed these
+    if (file.exists(dest) && file.info(dest)$size > min_bytes) next
+    if (file.exists(dest) && file.info(dest)$size == 0) try(unlink(dest), silent = TRUE)
+    if (file.exists(dest_tmp)) try(unlink(dest_tmp), silent = TRUE)
+
+    url <- sprintf("ftp://ftp.bv-brc.org/genomes/%s/%s%s", genomeID, genomeID, ext)
+
+    args <- c(
+      "--fail", "--silent", "--show-error", "-L",
+      "--connect-timeout", as.character(connect_timeout),
+      "--max-time", as.character(max_time),
+      "--speed-time", as.character(speed_time),
+      "--speed-limit", as.character(speed_limit),
+      "--ftp-ssl", # AUTH TLS on port 21 works in testing
+      "--ftp-pasv", "--disable-epsv", "--ipv4",
+      "--user", "anonymous:",
+      "-o", shQuote(dest_tmp), shQuote(url)
+    )
+
+    res <- suppressWarnings(system2("curl", args = args, stdout = TRUE, stderr = TRUE))
+    status <- attr(res, "status")
+    if (is.null(status)) status <- 0L
+
+    # Avoiding 0B files cluttering up results -- atomic rename on successful tmp DL
+    ok <- (status == 0L && file.exists(dest_tmp) && file.info(dest_tmp)$size > min_bytes)
+    if (ok) {
+      if (!file.rename(dest_tmp, dest)) {
+        # If rename fails for some reason, copy + unlink
+        file.copy(dest_tmp, dest, overwrite = TRUE)
+        unlink(dest_tmp)
+      }
+    } else {
+      try(unlink(dest_tmp), silent = TRUE)
+      return(FALSE) # Abort early, don't accept partial set
+    }
+  }
+
+  # Do we have all 3 present?
+  .is_complete_set(out_dir, genomeID, min_bytes = min_bytes)
+}
+
+#' Helps manage FTPS downloading from BV-BRC, tryng a quick download first, and
+#' if that fails, trying a longer timeout 2nd pass at the end in case it was a
+#' hiccup. If 2nd pass fails, log and give up on that file.
+#' @keywords internal
+.ftpes_download_two_pass <- function(genome_ids, out_dir,
+                                     workers_first = 4L,
+                                     workers_second = 4L,
+                                     log_file = NULL) {
+  genome_ids <- unique(as.character(genome_ids))
+  if (!length(genome_ids)) {
+    return(character(0))
+  }
+
+  if (!is.null(log_file)) {
+    dir.create(dirname(log_file), recursive = TRUE, showWarnings = FALSE)
+    cat(sprintf("[%s] FTPS run start: %d genomes\n", Sys.time(), length(genome_ids)),
+      file = log_file, append = TRUE
+    )
+  }
+
+  # Pass 1: 30s per-file cap
+  message("FTPS pass 1 (45s timeout)")
+  future::plan(future::multisession, workers = max(1, workers_first))
+  res1 <- future.apply::future_lapply(
+    genome_ids,
+    function(gid) {
+      ok <- .ftpes_download_one(gid, out_dir,
+        connect_timeout = 10L, max_time = 45L,
+        speed_time = 30L, speed_limit = 2048L
+      )
+      list(gid = gid, ok = ok)
+    },
+    future.seed = TRUE
+  )
+  ok1 <- vapply(res1, `[[`, logical(1), "ok")
+  ok_ids_1 <- genome_ids[ok1]
+  fail_ids <- genome_ids[!ok1]
+  message(sprintf("Pass 1: ok=%d, fail=%d", length(ok_ids_1), length(fail_ids)))
+  if (!is.null(log_file)) {
+    cat(sprintf("[%s] Pass1 ok=%d fail=%d\n", Sys.time(), length(ok_ids_1), length(fail_ids)),
+      file = log_file, append = TRUE
+    )
+  }
+
+  if (!length(fail_ids)) {
+    if (!is.null(log_file)) {
+      cat(sprintf("[%s] FTPS run end: all OK\n", Sys.time()),
+        file = log_file, append = TRUE
+      )
+    }
+    return(ok_ids_1)
+  }
+
+  # Pass 2: 60s per-file cap where we retry any failures
+  message("FTPS pass 2 (120s timeout) for failed genomes")
+  future::plan(future::multisession, workers = max(1, workers_second))
+  res2 <- future.apply::future_lapply(
+    fail_ids,
+    function(gid) {
+      ok <- .ftpes_download_one(gid, out_dir,
+        connect_timeout = 10L, max_time = 120L,
+        speed_time = 30L, speed_limit = 2048L
+      )
+      list(gid = gid, ok = ok)
+    },
+    future.seed = TRUE
+  )
+  ok2 <- vapply(res2, `[[`, logical(1), "ok")
+  ok_ids_2 <- fail_ids[ok2]
+  still_fail <- setdiff(fail_ids, ok_ids_2)
+  message(sprintf("Pass 2: ok=%d, still_fail=%d", length(ok_ids_2), length(still_fail)))
+  if (!is.null(log_file)) {
+    cat(sprintf("[%s] Pass2 ok=%d still_fail=%d\n", Sys.time(), length(ok_ids_2), length(still_fail)),
+      file = log_file, append = TRUE
+    )
+    if (length(still_fail)) {
+      cat("Fail IDs (excluded): ", paste(head(still_fail, 50), collapse = ", "), "\n",
+        file = log_file, append = TRUE
+      )
+    }
+  }
+
+  unique(c(ok_ids_1, ok_ids_2))
+}
+
 #' Fetch BV-BRC bacterial genome data
 #'
 #' Run the BV-BRC CLI (`p3-all-genomes`) inside a Docker image and return results as a tibble.
@@ -26,7 +200,8 @@
     "--in superkingdom,Bacteria ",
     "--eq genome_quality,Good ",
     "--in genome_status,WGS,Complete ",
-    "--attr genome_id,genome_name,taxon_id,species,strain,_version_,",
+    "--attr genome_id,genome_name,genome_quality,genome_status,",
+    "taxon_id,species,strain,_version_,",
     "collection_year,state_province,latitude,longitude,",
     "antimicrobial_resistance_evidence,assembly_accession,isolation_country,",
     "isolation_source,disease,host_common_name"
@@ -44,10 +219,10 @@
     stop("BV-BRC returned no data. The service may be unavailable.")
   }
 
-  # Parse
+  # Parse and coerce to character to avoid numeric interpretation losing trailing 0s
   df <- utils::read.table(
     text = raw_data, sep = "\t", header = TRUE, fill = TRUE,
-    quote = "", check.names = FALSE, comment.char = ""
+    quote = "", check.names = FALSE, comment.char = "", colClasses = "character"
   )
   df <- tibble::as_tibble(df) |>
     dplyr::mutate(dplyr::across(dplyr::everything(), ~ iconv(.x, from = "", to = "UTF-8", sub = "")))
@@ -230,7 +405,7 @@
   )
 
   for (user_bac in user_bacs) {
-    if (suppressWarnings(!is.na(as.numeric(user_bac)))) {
+    if (.id_checker(user_bac)) {
       message("Numeric input detected: ", user_bac)
       if (user_bac %in% bvbrc_bacs$genome.taxon_id) {
         bac_name <- bvbrc_bacs$genome.species[bvbrc_bacs$genome.taxon_id == user_bac]
@@ -294,10 +469,11 @@
     return(cand)
   }
   if (query_type == "taxon_id") {
-    nums <- suppressWarnings(!is.na(as.numeric(user_bacs)))
+    nums <- .id_checker(user_bacs)
     if (!any(nums)) stop("Cannot infer taxon_id from user_bacs. Provide query_value.")
     return(as.character(user_bacs[which(nums)[1]]))
   }
+
   stop("Unsupported query_type: ", query_type)
 }
 
@@ -325,7 +501,7 @@
   db_parts <- c()
 
   for (user_bac in user_bacs) {
-    if (suppressWarnings(!is.na(as.numeric(user_bac)))) {
+    if (.id_checker(user_bac)) {
       db_parts <- c(db_parts, paste0("tid_", user_bac))
     } else {
       parts <- stringr::str_split(user_bac, " ")[[1]]
@@ -440,7 +616,7 @@
   data_result <- tibble::as_tibble(
     utils::read.table(
       text = data_raw, sep = "\t", header = TRUE, fill = TRUE,
-      quote = "", check.names = FALSE, comment.char = ""
+      quote = "", check.names = FALSE, comment.char = "", colClasses = "character"
     )
   ) |>
     dplyr::mutate(
@@ -465,17 +641,11 @@
 
 #' Retrieve genome IDs for each taxon via BV-BRC and DuckDB
 #'
-#' Resolves user-provided taxa to taxon IDs, queries BV-BRC per unique taxon ID,
-#' and returns distinct genome IDs. Uses a per-selection DuckDB located under:
-#'   <base_dir>/data/<bug_dir>/<abbrev>.duckdb
-#' BV-BRC column names are preserved.
+#' Resolves user-provided taxa to taxon IDs using the local BV-BRC cache, then
+#' returns distinct genome IDs (Good + WGS/Complete). Also writes a 'bac_data'
+#' table to the per-selection DuckDB with a cache snapshot of core fields.
 #'
-#' @param base_dir Character. Project root directory. Default ".".
-#' @param user_bacs Character vector. Mixed inputs of taxon IDs and/or species names.
-#' @param overwrite Logical. If FALSE and the DuckDB already exists for this selection, abort. Default: FALSE.
-#' @param verbose Logical.
-#'
-#' @return A numeric vector of distinct `genome.genome_id`, or NULL if none found.
+#' @return A character vector of distinct `genome.genome_id`, or NULL if none found.
 .retrieveQueryIDs <- function(base_dir = ".",
                               user_bacs,
                               overwrite = FALSE,
@@ -490,54 +660,63 @@
     return(NULL)
   }
 
-  # Resolve per-bug DB path
+  # Per-selection DB path
   paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite)
   db_path <- paths$db_path
 
-  bac_data <- tibble::tibble()
-  taxon_ids <- unique(bac_input_data$genome.taxon_id)
-
-  if (isTRUE(verbose)) message("Querying BV-BRC for ", length(taxon_ids), " taxon IDs.")
-
-  for (i in seq_along(taxon_ids)) {
-    tax <- taxon_ids[i]
-    if (isTRUE(verbose)) message("Taxon ", i, "/", length(taxon_ids), ": ", tax)
-
-    res <- .getGenomeIDs(
-      base_dir    = base_dir,
-      query_type  = "taxon_id",
-      query_value = as.character(tax),
-      user_bacs   = user_bacs,
-      overwrite   = TRUE,
-      verbose     = verbose
-    )
-
-    con <- res$duckdbConnection
-    tbl <- res$table_name
-    each_bac_data <- tibble::as_tibble(DBI::dbReadTable(con, tbl))
-    bac_data <- dplyr::bind_rows(bac_data, each_bac_data)
-
-    # Close per-iteration connection to avoid open handles piling up
-    try(DBI::dbDisconnect(con, shutdown = TRUE), silent = TRUE)
+  # Got a cache? Use that, it's fast
+  cache_db <- file.path(base_dir, "data", "bvbrc", "bvbrcData.duckdb")
+  if (!file.exists(cache_db)) {
+    stop("BV-BRC cache not found at: ", cache_db, ". Run .updateBVBRCdata() first.")
   }
+  con_cache <- DBI::dbConnect(duckdb::duckdb(), dbdir = cache_db, read_only = TRUE)
+  on.exit(try(DBI::dbDisconnect(con_cache, shutdown = TRUE), silent = TRUE), add = TRUE)
 
-  if (nrow(bac_data) > 0) {
-    genome_ids <- bac_data |>
-      dplyr::distinct(`genome.genome_id`) |>
-      dplyr::pull(`genome.genome_id`)
-    genome_ids <- genome_ids[!is.na(genome_ids)]
+  taxon_ids <- unique(bac_input_data$genome.taxon_id)
+  if (isTRUE(verbose)) message("Querying cache for ", length(taxon_ids), " taxon IDs.")
+
+  taxon_list_sql <- paste(DBI::dbQuoteString(con_cache, taxon_ids), collapse = ", ")
+  sql_ids <- sprintf(
+    "SELECT DISTINCT
+       \"genome.genome_id\"   AS gid,
+       \"genome.genome_name\" AS genome_name,
+       \"genome.taxon_id\"    AS taxon_id,
+       \"genome.species\"     AS species,
+       \"genome.strain\"      AS strain
+     FROM bvbrc_bac_data
+     WHERE \"genome.taxon_id\" IN (%s)
+       AND \"genome.genome_quality\" = 'Good'
+       AND \"genome.genome_status\"  IN ('WGS','Complete')",
+    taxon_list_sql
+  )
+  cache_rows <- DBI::dbGetQuery(con_cache, sql_ids)
+  valid_id_re <- "^[0-9]+\\.[0-9]+$"
+  cache_rows <- cache_rows[grepl(valid_id_re, cache_rows$gid), , drop = FALSE]
 
-    if (length(genome_ids) > 0) {
-      if (isTRUE(verbose)) message("Collected ", length(genome_ids), " distinct genome IDs.")
-      return(genome_ids)
-    } else {
-      message("No valid genome IDs found.")
-      return(NULL)
-    }
-  } else {
-    message("No valid genome data found for any input.")
+  if (nrow(cache_rows) == 0L) {
+    message("No valid genome IDs found.")
     return(NULL)
   }
+
+  genome_ids <- unique(cache_rows$gid)
+  if (isTRUE(verbose)) message("Collected ", length(genome_ids), " distinct genome IDs (cache).")
+
+  # Write 'bac_data' from cache
+  con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path)
+  on.exit(try(DBI::dbDisconnect(con, shutdown = TRUE), silent = TRUE), add = TRUE)
+
+  bac_data_tbl <- data.frame(
+    `genome.genome_id` = cache_rows$gid,
+    `genome.genome_name` = cache_rows$genome_name,
+    `genome.taxon_id` = cache_rows$taxon_id,
+    `genome.species` = cache_rows$species,
+    `genome.strain` = cache_rows$strain,
+    stringsAsFactors = FALSE
+  )
+  DBI::dbWriteTable(con, "bac_data", bac_data_tbl, overwrite = TRUE)
+  if (isTRUE(verbose)) message("Wrote table 'bac_data' to: ", db_path)
+
+  genome_ids
 }
 
 #' Extract AMR Data Table
@@ -688,7 +867,7 @@
   writeLines(genome_ids_output, con = tmp_in)
   tmp_in_mounted <- file.path("/data", "tmp", basename(tmp_in))
 
-  # Choose attributes (AMR for this pipeline)
+  # Choose attributes (AMR for this workflow)
   chosen_fields <- if (identical(filter_type, "AMR")) amr_fields else microtrait_fields
 
   get_args <- c(
@@ -710,7 +889,6 @@
   return(genome_data)
 }
 
-
 #' Retrieve AMR or Microtrait metadata from BV-BRC and store in DuckDB
 #'
 #' Queries BV-BRC for AMR or microtrait metadata for genomes corresponding to user inputs.
@@ -719,7 +897,7 @@
 #' Tables written:
 #'   - amr_phenotype
 #'   - genome_data
-#'   - metadata (inner join on genome ID field names as returned by BV-BRC)
+#'   - metadata (join on genome IDs returned by BV-BRC)
 #'
 #' @param user_bacs Character vector. Mixed taxon IDs and/or species strings (used for naming).
 #' @param filter_type Character. "AMR" or "microTraits". Default "AMR".
@@ -752,7 +930,7 @@ retrieveMetadata <- function(user_bacs,
     return(NULL)
   }
 
-  # Fields (unchanged BV-BRC names)
+  # Desired fields from BV-BRC
   drug_fields <- paste0(
     "antibiotic,computational_method,",
     "evidence,genome_name,id,",
@@ -802,7 +980,7 @@ retrieveMetadata <- function(user_bacs,
     "trna"
   )
 
-  # Batching
+  # Batching downloads and parallel implementation
   batch_size <- 500L
   genome_ids <- as.character(genome_ids)
   genome_batches <- split(genome_ids, ceiling(seq_along(genome_ids) / batch_size))
@@ -810,6 +988,7 @@ retrieveMetadata <- function(user_bacs,
   n_cores <- max(1L, parallel::detectCores(logical = TRUE) - 1L)
   param <- BiocParallel::SnowParam(workers = n_cores)
 
+  # Pull AMR metadata
   if (isTRUE(verbose)) message("Retrieving AMR phenotype data in batches.")
   batch_drug_data <- BiocParallel::bplapply(genome_batches, function(batch) {
     .extractAMRtable(
@@ -826,17 +1005,15 @@ retrieveMetadata <- function(user_bacs,
     message("No drug data returned.")
     return(NULL)
   }
-
-  combined_drug_data_tbl <- tibble::as_tibble(
-    utils::read.table(
-      text = combined_drug_data,
-      sep = "\t", header = TRUE, fill = TRUE,
-      quote = "", check.names = FALSE, comment.char = ""
-    )
-  ) |>
+  combined_drug_data_tbl <- tibble::as_tibble(utils::read.table(
+    text = combined_drug_data,
+    sep = "\t", header = TRUE, fill = TRUE,
+    quote = "", check.names = FALSE, comment.char = "", colClasses = "character"
+  )) |>
     dplyr::mutate(dplyr::across(dplyr::everything(), ~ iconv(.x, from = "", to = "UTF-8", sub = ""))) |>
     dplyr::mutate(`genome_drug.genome_id` = as.character(`genome_drug.genome_id`))
 
+  # Pull genome metadata
   if (isTRUE(verbose)) message("Retrieving genome metadata in batches.")
   batch_genome_data <- BiocParallel::bplapply(genome_batches, function(batch) {
     .extractGenomeData(
@@ -854,21 +1031,17 @@ retrieveMetadata <- function(user_bacs,
     message("No genome data returned.")
     return(NULL)
   }
-
-  combined_genome_data_tbl <- tibble::as_tibble(
-    utils::read.table(
-      text = combined_genome_data,
-      sep = "\t", header = TRUE, fill = TRUE,
-      quote = "", check.names = FALSE, comment.char = ""
-    )
-  ) |>
+  combined_genome_data_tbl <- tibble::as_tibble(utils::read.table(
+    text = combined_genome_data,
+    sep = "\t", header = TRUE, fill = TRUE,
+    quote = "", check.names = FALSE, comment.char = "", colClasses = "character"
+  )) |>
     dplyr::mutate(dplyr::across(dplyr::everything(), ~ iconv(.x, from = "", to = "UTF-8", sub = ""))) |>
     dplyr::mutate(`genome.genome_id` = as.character(`genome.genome_id`))
 
-  # Per-bug DB path
+  # Write & join
   paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs, overwrite = overwrite)
   db_path <- paths$db_path
-
   logs_dir <- file.path(base_dir, "data", "logs")
   dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE)
   cat(sprintf("[%s] Writing metadata DuckDB: %s\n", Sys.time(), db_path),
@@ -876,27 +1049,69 @@ retrieveMetadata <- function(user_bacs,
   )
 
   con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path)
+  on.exit(try(DBI::dbDisconnect(con, shutdown = TRUE), silent = TRUE), add = TRUE)
+
   DBI::dbWriteTable(con, "amr_phenotype", combined_drug_data_tbl, overwrite = TRUE)
   DBI::dbWriteTable(con, "genome_data", combined_genome_data_tbl, overwrite = TRUE)
 
   if (isTRUE(verbose)) message("Joining AMR phenotype and genome metadata.")
-  initial_metadata <- tibble::as_tibble(DBI::dbGetQuery(
+  DBI::dbExecute(con, '
+    CREATE OR REPLACE TABLE metadata AS
+    SELECT *
+    FROM amr_phenotype
+    INNER JOIN genome_data
+      ON amr_phenotype."genome_drug.genome_id" = genome_data."genome.genome_id"
+  ')
+
+  # Debug summary after writes
+  n_targets <- length(genome_ids)
+  n_amr_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome_drug.genome_id") AS n FROM amr_phenotype')$n
+  n_gmeta_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id")      AS n FROM genome_data')$n
+  if (isTRUE(verbose)) {
+    message(
+      "Initial summary: targets=", n_targets,
+      " | AMR genomes=", n_amr_ids,
+      " | genome_data genomes=", n_gmeta_ids
+    )
+  }
+
+  # Tagged view
+  .create_amr_tagged_view(con)
+
+  # Final debug summary
+  n_bac <- if ("bac_data" %in% DBI::dbListTables(con)) {
+    DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM bac_data')$n
+  } else {
+    NA_integer_
+  }
+  n_amr_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome_drug.genome_id") AS n FROM amr_phenotype')$n
+  n_gmeta_ids <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id")      AS n FROM genome_data')$n
+
+  ids_zero_amr <- DBI::dbGetQuery(
     con,
-    'SELECT *
-     FROM amr_phenotype
-     INNER JOIN genome_data
-     ON amr_phenotype."genome_drug.genome_id" = genome_data."genome.genome_id"'
-  ))
-  DBI::dbWriteTable(con, "metadata", initial_metadata, overwrite = TRUE)
+    'WITH sel AS (SELECT DISTINCT "genome.genome_id" AS gid FROM genome_data),
+          got AS (SELECT DISTINCT "genome_drug.genome_id" AS gid FROM amr_phenotype)
+     SELECT sel.gid
+     FROM sel LEFT JOIN got USING (gid)
+     WHERE got.gid IS NULL
+     ORDER BY sel.gid'
+  )$gid
 
   if (isTRUE(verbose)) {
-    message("Wrote tables 'amr_phenotype', 'genome_data', and 'metadata' to: ", db_path)
+    message("Final summary:")
+    message("  targets      : ", length(genome_ids))
+    message("  bac_data     : ", n_bac)
+    message("  genome_data  : ", n_gmeta_ids)
+    message("  amr_phenotype: ", n_amr_ids)
+    message("  genomes with 0 AMR rows: ", length(ids_zero_amr))
+    if (length(ids_zero_amr)) {
+      message("  e.g.: ", paste(utils::head(ids_zero_amr, 10), collapse = ", "))
+    }
   }
 
   list(duckdbConnection = con, table_name = "metadata")
 }
 
-
 # FASTA sanitizer to ensure Panaroo compatibility with BV-BRC CLI downloads
 .strip_fasta_preamble <- function(fna_path) {
   if (!file.exists(fna_path)) {
@@ -954,37 +1169,75 @@ retrieveMetadata <- function(user_bacs,
 #' derive genome IDs from user_bacs (taxon IDs or species substring), and
 #' write a minimal "filtered" table (without AMR evidence filtering).
 #'
+#' @param evidence_mode Character. Either "lab_only" (default), "lab_or_comp" (all),
+#' "comp_only" (BV-BRC-predicted without lab labels), or "any" (no AMR data required).
+#' @return A list with a DuckDB connection and table_name = "filtered"
+#' Filter genomes by AMR phenotype and metadata, and store results in DuckDB
+#'
+#' Preferred path: use per-selection DB "metadata" table (from retrieveMetadata()) and
+#' apply evidence & genome_quality filters.
+#'
+#' Fallback path: if "metadata" is missing and fallback_to_bvbrc_cache = TRUE,
+#' read BV-BRC cache at <base_dir>/data/bvbrc/bvbrcData.duckdb ("bvbrc_bac_data"),
+#' derive genome IDs from user_bacs (taxon IDs or species substring), and
+#' write a minimal "filtered" table (without AMR evidence filtering).
+#'
+#' @param evidence_mode Character. One of:
+#'   "lab_only"   (default) -> only laboratory evidence
+#'   "lab_or_comp"          -> laboratory OR computational evidence
+#'   "comp_only"            -> only computational evidence
+#'   "any"                  -> no AMR required (from genome_data; Good only)
 #' @return A list with a DuckDB connection and table_name = "filtered"
 .filterGenomes <- function(user_bacs,
                            base_dir = ".",
+                           evidence_mode = c("lab_only", "lab_or_comp", "comp_only", "any"),
                            verbose = TRUE,
                            fallback_to_bvbrc_cache = TRUE) {
+  evidence_mode <- match.arg(evidence_mode)
   base_dir <- normalizePath(base_dir, mustWork = FALSE)
   paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs)
   db_path <- paths$db_path
 
   con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path)
-
   on.exit(
     {
-      # Keep connection open for caller
       NULL
     },
     add = TRUE
-  )
+  ) # keep open for caller
 
-  # Good metadata present? Apply AMR filters
+  # The convenient "Metadata Exists" path
   if ("metadata" %in% DBI::dbListTables(con)) {
     if (isTRUE(verbose)) message("Loading metadata for filtering.")
-    initial_metadata <- DBI::dbReadTable(con, "metadata")
-    if (is.null(initial_metadata) || nrow(initial_metadata) == 0) {
+
+    # If "any", AMR data not needed, just fetch qualifying genome_data
+    if (evidence_mode == "any") {
+      gd <- DBI::dbReadTable(con, "genome_data")
+      if (is.null(gd) || nrow(gd) == 0) {
+        DBI::dbDisconnect(con, shutdown = TRUE)
+        stop("No data available in 'genome_data'.")
+      }
+      gd <- tibble::as_tibble(gd) |>
+        dplyr::filter(`genome.genome_quality` == "Good") |>
+        dplyr::distinct(`genome.genome_id`)
+      # Keep BV-BRC column name
+      DBI::dbWriteTable(con, "filtered", gd, overwrite = TRUE)
+      if (isTRUE(verbose)) {
+        message("Post-filter distinct genomes (any): ", nrow(gd))
+        message("Wrote table 'filtered' to: ", db_path)
+      }
+      return(list(duckdbConnection = con, table_name = "filtered"))
+    }
+
+    # Otherwise, open up the metadata and start interrogating the AMR data
+    md <- DBI::dbReadTable(con, "metadata")
+    if (is.null(md) || nrow(md) == 0) {
       DBI::dbDisconnect(con, shutdown = TRUE)
       message("No data available in 'metadata'.")
       return(NULL)
     }
 
-    # Normalize evidence labels
-    initial_metadata <- tibble::as_tibble(initial_metadata) |>
+    md <- tibble::as_tibble(md) |>
       dplyr::mutate(
         `genome_drug.evidence` = dplyr::case_when(
           `genome_drug.laboratory_typing_method` %in%
@@ -994,29 +1247,39 @@ retrieveMetadata <- function(user_bacs,
         )
       )
 
-    # AMR and quality filtering
-    filtered_metadata <- initial_metadata |>
-      dplyr::filter(`genome_drug.evidence` == "Laboratory Method") |>
+    if (evidence_mode == "lab_only") {
+      md <- dplyr::filter(md, `genome_drug.evidence` == "Laboratory Method")
+    } else if (evidence_mode == "comp_only") {
+      md <- dplyr::filter(
+        md,
+        `genome_drug.evidence` == "Computational Method" |
+          (!is.na(`genome_drug.computational_method`) & nzchar(`genome_drug.computational_method`)) |
+          (`genome_drug.laboratory_typing_method` == "Computational Prediction")
+      )
+    } else {
+      # Lab_or_comp: Doesn't matter, keep any predictions
+      md <- md
+    }
+
+    md <- md |>
       dplyr::filter(`genome.genome_quality` == "Good") |>
-      dplyr::filter(`genome_drug.resistant_phenotype` %in% c("Resistant", "Susceptible", "Intermediate"))
+      dplyr::filter(`genome_drug.resistant_phenotype` %in% c("Resistant", "Susceptible", "Intermediate")) |>
+      dplyr::distinct(`genome.genome_id`) # keep BV-BRC column name
 
-    DBI::dbWriteTable(con, "filtered", filtered_metadata, overwrite = TRUE)
+    DBI::dbWriteTable(con, "filtered", md, overwrite = TRUE)
     if (isTRUE(verbose)) {
-      message("Post-filter rows: ", nrow(filtered_metadata))
+      message("Post-filter distinct genomes: ", nrow(md))
       message("Wrote table 'filtered' to: ", db_path)
     }
     return(list(duckdbConnection = con, table_name = "filtered"))
   }
 
-  # Attempt BV-BRC cache location
+  # No metadata fallback
   if (!isTRUE(fallback_to_bvbrc_cache)) {
     DBI::dbDisconnect(con, shutdown = TRUE)
     stop("No 'metadata' table found in ", db_path, ". Run retrieveMetadata() first.")
   }
-
-  if (isTRUE(verbose)) {
-    message("No 'metadata' in per-selection DB. Falling back to BV-BRC cache at data/bvbrc/.")
-  }
+  if (isTRUE(verbose)) message("No 'metadata' in per-selection DB. Falling back to BV-BRC cache at data/bvbrc/.")
 
   cache_db <- file.path(base_dir, "data", "bvbrc", "bvbrcData.duckdb")
   if (!file.exists(cache_db)) {
@@ -1024,7 +1287,7 @@ retrieveMetadata <- function(user_bacs,
     stop("BV-BRC cache not found at: ", cache_db, ". Run .updateBVBRCdata() first.")
   }
 
-  con_cache <- DBI::dbConnect(duckdb::duckdb(), dbdir = cache_db)
+  con_cache <- DBI::dbConnect(duckdb::duckdb(), dbdir = cache_db, read_only = TRUE)
   on.exit(try(DBI::dbDisconnect(con_cache, shutdown = TRUE), silent = TRUE), add = TRUE)
 
   if (!"bvbrc_bac_data" %in% DBI::dbListTables(con_cache)) {
@@ -1033,30 +1296,22 @@ retrieveMetadata <- function(user_bacs,
   }
 
   bv <- tibble::as_tibble(DBI::dbReadTable(con_cache, "bvbrc_bac_data"))
-
-
-  # Derive matches from user_bacs (taxon IDs or species)
   sel <- tibble::tibble(`genome.genome_id` = character())
 
   for (v in user_bacs) {
-    if (suppressWarnings(!is.na(as.numeric(v)))) {
-      # numeric taxon_id match
-      matches <- bv[bv$`genome.taxon_id` == v |
-        bv$`genome.taxon_id` == as.numeric(v), , drop = FALSE]
+    if (.id_checker(v)) {
+      v_chr <- as.character(v)
+      matches <- bv[bv$`genome.taxon_id` == v_chr, , drop = FALSE]
     } else {
-      # species substring (case-insensitive)
       matches <- bv[stringr::str_detect(
         bv$`genome.species`,
         stringr::fixed(v, ignore_case = TRUE)
       ), , drop = FALSE]
     }
-
     if (nrow(matches)) {
       sel <- dplyr::bind_rows(
         sel,
-        tibble::tibble(
-          `genome.genome_id` = as.character(matches$`genome.genome_id`)
-        )
+        tibble::tibble(`genome.genome_id` = as.character(matches$`genome.genome_id`))
       )
     }
   }
@@ -1067,37 +1322,14 @@ retrieveMetadata <- function(user_bacs,
     stop("No genomes matched user_bacs in BV-BRC cache.")
   }
 
-  # Minimal 'filtered' for downstream steps (downloaders & genomeList)
-  minimal_filtered <- sel
-  # Ensure expected column name used downstream:
-  # retrieveGenomes() reads "filtered" and expects `genome.genome_id` to exist.
-  DBI::dbWriteTable(con, "filtered", minimal_filtered, overwrite = TRUE)
-
-  if (isTRUE(verbose)) {
-    message("Wrote table 'filtered' to: ", db_path)
-  }
+  DBI::dbWriteTable(con, "filtered", sel, overwrite = TRUE)
+  if (isTRUE(verbose)) message("Wrote table 'filtered' to: ", db_path)
 
   list(duckdbConnection = con, table_name = "filtered")
 }
 
-
-# BV-BRC downloader block
-# Normalize Docker path
-.docker_path <- function(p) gsub("\\\\", "/", normalizePath(p, mustWork = FALSE))
-
-# Prefer bash
-.pick_shell <- function(image) {
-  chk <- suppressWarnings(system2("docker",
-    c(
-      "run", "--rm", image, "sh", "-lc",
-      "command -v bash >/dev/null || echo NOBASH"
-    ),
-    stdout = TRUE, stderr = TRUE
-  ))
-  if (length(chk) && any(grepl("NOBASH", chk))) "sh" else "bash"
-}
-
-# Check if a complete set (.fna + .PATRIC.faa + .PATRIC.gff) exists after DL
+#' Helps check if a complete set exists after DL (.fna + .PATRIC.faa + .PATRIC.gff)
+#' @keywords internal
 .is_complete_set <- function(dir, genomeID, min_bytes = 100) {
   fna <- file.path(dir, paste0(genomeID, ".fna"))
   faa <- file.path(dir, paste0(genomeID, ".PATRIC.faa"))
@@ -1107,11 +1339,14 @@ retrieveMetadata <- function(user_bacs,
     all(purrr::map_dbl(paths, function(x) file.info(x)$size) > min_bytes)
 }
 
-# List genomes already completed
+#' Helps collate completed genomes into a set
+#' @keywords internal
 .list_complete <- function(dir, genome_ids, min_bytes = 100) {
   genome_ids[purrr::map_lgl(genome_ids, .is_complete_set, dir = dir, min_bytes = min_bytes)]
 }
 
+#' Helps in auditing downloaded files to ensure everything's complete per ID
+#' @keywords internal
 # Any genomes missing bits or pieces? Find em
 .missing_any <- function(dir, genome_ids, min_bytes = 100) {
   genome_ids[!purrr::map_lgl(genome_ids, .is_complete_set, dir = dir, min_bytes = min_bytes)]
@@ -1136,68 +1371,27 @@ retrieveMetadata <- function(user_bacs,
   df
 }
 
-# FTPS preferred! HTTPS as a fallback to see if it's just an FTPS issue
-.ftp_download_one <- function(genomeID, out_dir, tries = 3L, min_bytes = 100) {
-  files <- c(".fna", ".PATRIC.faa", ".PATRIC.gff")
-  dests <- file.path(out_dir, paste0(genomeID, files))
-  if (.is_complete_set(out_dir, genomeID, min_bytes)) {
-    return(TRUE)
-  }
-
-  get_one <- function(url, dest) {
-    if (nzchar(Sys.which("wget"))) {
-      res <- suppressWarnings(system2("wget",
-        c("-q", "-O", shQuote(dest), shQuote(url)),
-        stdout = TRUE, stderr = TRUE
-      ))
-      st <- attr(res, "status")
-      if (is.null(st)) st <- 0L
-      return(st == 0L && file.exists(dest) && file.info(dest)$size > min_bytes)
-    } else if (nzchar(Sys.which("curl"))) {
-      curl_args <- if (startsWith(url, "ftps://")) {
-        c("--ssl-reqd", "-L", "-o", shQuote(dest), shQuote(url))
-      } else {
-        c("-L", "-o", shQuote(dest), shQuote(url))
-      }
-      res <- suppressWarnings(system2("curl", curl_args, stdout = TRUE, stderr = TRUE))
-      st <- attr(res, "status")
-      if (is.null(st)) st <- 0L
-      return(st == 0L && file.exists(dest) && file.info(dest)$size > min_bytes)
-    }
-    FALSE
-  }
+### BV-BRC CLI downloader [slower by comparison, but does not need FTP server]
 
-  for (ext_i in seq_along(files)) {
-    dest <- dests[ext_i]
-    if (file.exists(dest) && file.info(dest)$size > min_bytes) next
-    ftps <- sprintf("ftps://ftp.bv-brc.org/genomes/%s/%s%s", genomeID, genomeID, files[ext_i])
-    https <- sprintf("https://ftp.bv-brc.org/genomes/%s/%s%s", genomeID, genomeID, files[ext_i])
-
-    ok <- FALSE
-    for (k in 1:tries) {
-      if (get_one(ftps, dest)) {
-        ok <- TRUE
-        break
-      }
-    }
-    if (!ok) {
-      for (k in 1:2) {
-        if (get_one(https, dest)) {
-          ok <- TRUE
-          break
-        }
-      }
-    }
-    if (!ok) {
-      return(FALSE)
-    }
-  }
-  .is_complete_set(out_dir, genomeID, min_bytes)
-}
+#' Helps normalize Docker paths
+#' @keywords internal
+.docker_path <- function(p) gsub("\\\\", "/", normalizePath(p, mustWork = FALSE))
 
-# BV-BRC CLI downloader [very slow by comparison, but does not need FTP server]
+#' Helps run a shell inside a container, and prefers bash (don't we all?)
+#' @keywords internal
+.pick_shell <- function(image) {
+  chk <- suppressWarnings(system2("docker",
+    c(
+      "run", "--rm", image, "sh", "-lc",
+      "command -v bash >/dev/null || echo NOBASH"
+    ),
+    stdout = TRUE, stderr = TRUE
+  ))
+  if (length(chk) && any(grepl("NOBASH", chk))) "sh" else "bash"
+}
 
-# p3-dump-genomes to fetch FASTA and .gto files
+#' Using p3-dump-genomes in CLI to fetch FASTA and .gto files
+#' @keywords internal
 .cli_dump_fastas_gto_chunk <- function(image, out_dir, genome_ids, tag, tries = 3L) {
   out_dir <- normalizePath(out_dir, mustWork = FALSE)
   dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
@@ -1242,7 +1436,8 @@ retrieveMetadata <- function(user_bacs,
   st == 0L
 }
 
-# Export GFF from GTO per genomes in each chunk
+#' Exports GFF files from the downloaded GTO per genome in each chunk
+#' @keywords internal
 .cli_export_gff_chunk <- function(image, out_dir, genome_ids, tag, tries = 3L) {
   out_dir <- normalizePath(out_dir, mustWork = FALSE)
   ids_file <- file.path(out_dir, paste0("ids_", tag, ".txt"))
@@ -1333,36 +1528,48 @@ retrieveGenomes <- function(base_dir = ".",
                             method = c("ftp", "cli"),
                             image = "danylmb/bvbrc:5.3",
                             skip_existing = TRUE,
-                            ftp_workers = 8L,
+                            ftp_workers = 4L,
                             cli_fasta_workers = 4L,
                             cli_gff_workers = 4L,
                             chunk_size = 50L,
+                            evidence_mode = c("lab_only", "lab_or_comp", "comp_only", "any"), # NEW
                             verbose = TRUE) {
   method <- match.arg(method)
+  evidence_mode <- match.arg(evidence_mode)
   base_dir <- normalizePath(base_dir, mustWork = FALSE)
 
-  # IDs from .filterGenomes()
-  if (isTRUE(verbose)) message("Filtering genomes before download.")
-  f_out <- .filterGenomes(base_dir = base_dir, user_bacs = user_bacs, verbose = verbose)
-  if (is.null(f_out)) {
-    return(character())
+  # Use 'filtered' if already prepared, or start filtering
+  if (isTRUE(verbose)) message("Preparing download set (checking for existing 'filtered').")
+  paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs)
+  db_path <- paths$db_path
+  con0 <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path)
+  has_filtered <- "filtered" %in% DBI::dbListTables(con0)
+  if (has_filtered) {
+    if (isTRUE(verbose)) message("Using existing 'filtered' table (skipping re-filter).")
+    con <- con0
+    tbl <- "filtered"
+  } else {
+    if (isTRUE(verbose)) message("No 'filtered' table found; filtering now.")
+    f_out <- .filterGenomes(
+      base_dir = base_dir,
+      user_bacs = user_bacs,
+      evidence_mode = evidence_mode,
+      verbose = verbose
+    )
+    con <- f_out$duckdbConnection
+    tbl <- f_out$table_name
   }
 
-  con <- f_out$duckdbConnection
-  tbl <- f_out$table_name
   ids <- tibble::as_tibble(DBI::dbReadTable(con, tbl)) |>
     dplyr::distinct(`genome.genome_id`) |>
     dplyr::pull(`genome.genome_id`)
 
-  # Set up the paths and such
-  paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs)
-  bug_dir <- dirname(paths$db_path)
+  bug_dir <- dirname(db_path)
   genome_path <- file.path(bug_dir, "genomes")
   logs_dir <- file.path(base_dir, "data", "logs")
   dir.create(genome_path, recursive = TRUE, showWarnings = FALSE)
   dir.create(logs_dir, recursive = TRUE, showWarnings = FALSE)
 
-  # Adding support for resuming a download
   if (isTRUE(skip_existing)) {
     already <- .list_complete(genome_path, ids)
     if (isTRUE(verbose)) message(length(already), " genomes already completed; skipping.")
@@ -1380,7 +1587,6 @@ retrieveGenomes <- function(base_dir = ".",
     return(all_complete)
   }
 
-  # FTP method -- preferred if server is available
   if (identical(method, "ftp")) {
     if (isTRUE(verbose)) message("Trying FTPS download. Workers=", ftp_workers)
     ftp_param <- BiocParallel::SnowParam(workers = max(1L, ftp_workers))
@@ -1534,6 +1740,9 @@ genomeList <- function(base_dir = ".",
 #'   `"ftp"` (default) or `"cli"`.
 #' @param overwrite Logical. Passed to metadata filtering and DuckDB creation.
 #'   Default FALSE.
+#' @param evidence_mode Character. Sets what types of AMR evidence is acceptable.
+#'    Default `lab_only`. `any` will not require AMR data for downloads. This will
+#'    return very large download lists for many species!
 #' @param verbose Logical. Print progress messages. Default TRUE.
 #'
 #' @return A list (the output of `genomeList()`), containing:
@@ -1545,31 +1754,66 @@ prepareGenomes <- function(user_bacs,
                            base_dir = ".",
                            method = c("ftp", "cli"),
                            overwrite = FALSE,
+                           evidence_mode = c("lab_only", "lab_or_comp", "comp_only", "any"),
                            verbose = TRUE) {
   method <- match.arg(method)
+  evidence_mode <- match.arg(evidence_mode)
   base_dir <- normalizePath(base_dir, mustWork = FALSE)
 
-  # Ensure the BV-BRC metadata cache exists, fetch if missing
   .ensure_bvbrc_cache(base_dir = base_dir, verbose = verbose)
 
-  if (isTRUE(verbose)) message("Step 1: Downloading genomes from BV-BRC")
+  if (isTRUE(verbose)) message("Step 0: Building AMR metadata (retrieveMetadata)")
+  invisible(retrieveMetadata(
+    user_bacs = user_bacs,
+    filter_type = "AMR",
+    base_dir = base_dir,
+    abx = "All",
+    overwrite = overwrite,
+    verbose = verbose
+  ))
 
+  if (isTRUE(verbose)) message("Step 1: Filtering genomes for download by evidence: ", evidence_mode)
+  f_out <- .filterGenomes(
+    base_dir = base_dir,
+    user_bacs = user_bacs,
+    evidence_mode = evidence_mode,
+    verbose = verbose,
+    fallback_to_bvbrc_cache = FALSE
+  )
+  if (is.null(f_out)) {
+    message("No genomes available after evidence filtering.")
+    return(NULL)
+  }
+
+  # A little summary of what's left after filtering (or not)
+  paths <- .buildDBpath(base_dir = base_dir, user_bacs = user_bacs)
+  con <- DBI::dbConnect(duckdb::duckdb(), dbdir = paths$db_path, read_only = TRUE)
+  n_filtered <- DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM filtered')$n
+  n_meta <- if ("genome_data" %in% DBI::dbListTables(con)) DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome.genome_id") AS n FROM genome_data')$n else NA
+  n_amr <- if ("amr_phenotype" %in% DBI::dbListTables(con)) DBI::dbGetQuery(con, 'SELECT COUNT(DISTINCT "genome_drug.genome_id") AS n FROM amr_phenotype')$n else NA
+  DBI::dbDisconnect(con, shutdown = TRUE)
+  if (isTRUE(verbose)) {
+    message(sprintf(
+      "Evidence filter summary: filtered=%d | genomes with AMR=%s | genomes with genome_data=%s",
+      n_filtered, ifelse(is.na(n_amr), "NA", n_amr), ifelse(is.na(n_meta), "NA", n_meta)
+    ))
+  }
+
+  if (isTRUE(verbose)) message("Step 2: Downloading genomes from BV-BRC (", method, ")")
   ids <- retrieveGenomes(
     base_dir      = base_dir,
     user_bacs     = user_bacs,
     method        = method,
     skip_existing = !overwrite,
+    evidence_mode = evidence_mode,
     verbose       = verbose
   )
-
   if (length(ids) == 0L) {
-    message("No genomes available after filtering.")
+    message("No genomes downloaded.")
     return(NULL)
   }
 
-  if (isTRUE(verbose)) message("Step 2: Formatting data into a database for further processing")
-
-  # List local files into per-selection DB
+  if (isTRUE(verbose)) message("Step 3: Formatting data into a database for further processing")
   out <- genomeList(
     base_dir  = base_dir,
     user_bacs = user_bacs,
@@ -1577,7 +1821,7 @@ prepareGenomes <- function(user_bacs,
   )
 
   if (isTRUE(verbose)) {
-    message("Done. Files are ready! Please continue to downstream processing by `runDataProcessing()`")
+    message("Done. Files are ready! Continue with downstream processing with runDataProcessing().")
   }
   out
 }
diff --git a/R/data_processing.R b/R/data_processing.R
index 512fdd4..17b7682 100644
--- a/R/data_processing.R
+++ b/R/data_processing.R
@@ -1,3 +1,6 @@
+#' @importFrom data.table :=
+NULL
+
 #' Normalize a host filesystem path for use in Docker
 #'
 #' Converts Windows and mixed-separator paths to forward slashes
@@ -405,12 +408,13 @@
       names_to = "genome_ids",
       values_to = "protein_ids"
     ) |>
-    dplyr::mutate(genome_ids = gsub(".PATRIC", "", genome_ids)) |>
+    dplyr::mutate(genome_ids = sub("\\.PATRIC\\.\\.\\..*$", "", genome_ids)) |>
     dplyr::select(genome_ids, Gene, protein_ids) |>
     dplyr::distinct() |>
     dplyr::filter(!is.na(protein_ids)) |>
     tidyr::separate_rows(protein_ids, sep = ";") |>
-    dplyr::filter(!stringr::str_detect(protein_ids, "_pseudo")) |>
+    # dplyr::filter(!stringr::str_detect(protein_ids, "_pseudo")) |>
+    dplyr::mutate(protein_ids = gsub("_pseudo", "", protein_ids)) |>
     DBI::dbWriteTable(conn = con, name = "genome_gene_protein", overwrite = TRUE)
 }
 
@@ -730,7 +734,7 @@ runPanaroo2Duckdb <- function(duckdb_path,
     # This finds the reference cluster ID and names the cluster with it
     ref_line <- grep("\\*$", cluster_lines, value = TRUE)
     ref_id <- if (length(ref_line) > 0) {
-      stringr::str_extract(ref_line, "fig\\|[0-9]+\\.[0-9]+\\.peg\\.[0-9]+")
+      stringr::str_extract(ref_line, "fig\\|[0-9]+\\.[0-9]+\\.peg(?:sc)?\\.[0-9]+")
     } else {
       paste0("Cluster_", i - 1)
     }
@@ -738,7 +742,7 @@ runPanaroo2Duckdb <- function(duckdb_path,
     # Pull genome IDs
     genome_matches <- stringr::str_match(
       cluster_lines,
-      "fig\\|([0-9]+\\.[0-9]+)\\.peg\\.[0-9]+"
+      "fig\\|([0-9]+\\.[0-9]+)\\.peg(?:sc)?\\.[0-9]+"
     )[, 2]
     genome_matches <- genome_matches[!is.na(genome_matches)]
 
@@ -753,6 +757,58 @@ runPanaroo2Duckdb <- function(duckdb_path,
   cluster_map
 }
 
+#' Parse CD-HIT `.clstr` output into a long-format mapping
+#'
+#' Reads a CD-HIT `.clstr` file and constructs a mapping of clusters to member feature ids.
+#'
+#' @param clustered_faa Base path to CD-HIT output (without `.clstr` extension).
+#'
+#' @return A tibble with columns `cluster` and `member`.
+#'
+#' @keywords internal
+.extractMembersInClusters <- function(clustered_faa) {
+  clstr <- paste0(clustered_faa, ".clstr")
+  if (!file.exists(clstr)) {
+    stop(
+      "CD-HIT cluster file not found: ", clstr,
+      "\nEnsure .runCDHIT() completed successfully and produced the .clstr file."
+    )
+  }
+
+  lines <- data.table::fread(clstr, sep = "\n", header = FALSE)$V1
+  cluster_ids <- grep("^>Cluster", lines)
+  cluster_member <- data.table::data.table()
+
+  for (i in seq_along(cluster_ids)) {
+    start <- cluster_ids[i] + 1
+    end <- if (i < length(cluster_ids)) cluster_ids[i + 1] - 1 else length(lines)
+    cluster_lines <- lines[start:end]
+
+    # This finds the reference cluster ID and names the cluster with it
+    ref_line <- grep("\\*$", cluster_lines, value = TRUE)
+    ref_id <- if (length(ref_line) > 0) {
+      stringr::str_extract(ref_line, "fig\\|[0-9]+\\.[0-9]+\\.peg(?:sc)?\\.[0-9]+")
+    } else {
+      paste0("Cluster_", i - 1)
+    }
+
+    # Pull genome IDs
+    members <- stringr::str_match(
+      cluster_lines,
+      "fig\\|([0-9]+\\.[0-9]+)\\.peg(?:sc)?\\.[0-9]+"
+    )[, 1]
+    members <- members[!is.na(members)]
+
+    if (length(members) > 0) {
+      cluster_member <- data.table::rbindlist(list(
+        cluster_member,
+        data.table::data.table(cluster = ref_id, member = members)
+      ), use.names = TRUE)
+    }
+  }
+
+  tibble::as_tibble(cluster_member)
+}
 
 #' Build genome-by-protein-cluster count matrix
 #'
@@ -796,8 +852,8 @@ buildMatrices <- function(cluster_map) .buildProtMatrices(cluster_map)
   names_faa <- names(cdhit_output_faa) |>
     tibble::as_tibble() |>
     dplyr::mutate(
-      proteinID = stringr::str_extract(value, "^fig\\|[0-9]+\\.[0-9]+\\.peg\\.[0-9]+"),
-      locus_tag = stringr::str_match(value, "peg\\.[0-9]+\\|([^\\s]+)")[, 2],
+      proteinID = stringr::str_extract(value, "^fig\\|[0-9]+\\.[0-9]+\\.peg(?:sc)?\\.[0-9]+"),
+      locus_tag = stringr::str_match(value, "peg(?:sc)?\\.[0-9]+\\|([^\\s]+)")[, 2],
       proteinName = stringr::str_trim(stringr::str_match(value, "\\|[^\\s]+\\s+(.*?)\\s+\\[")[, 2])
     ) |>
     dplyr::select(-value)
@@ -846,11 +902,15 @@ CDHIT2duckdb <- function(duckdb_path,
   clustered_faa <- Biostrings::readAAStringSet(cdhit_outputs$clustered_faa)
   DBI::dbWriteTable(con, "protein_cluster_seq",
     tibble::tibble(
-      name     = names(clustered_faa) |> stringr::str_extract("fig\\|[0-9]+\\.[0-9]+\\.peg\\.[0-9]+"),
+      name     = names(clustered_faa) |> stringr::str_extract("fig\\|[0-9]+\\.[0-9]+\\.peg(?:sc)?\\.[0-9]+"),
       sequence = as.character(clustered_faa)
     ),
     overwrite = TRUE
   )
+
+  cluster_member <- .extractMembersInClusters(cdhit_outputs$clustered_faa)
+  DBI::dbWriteTable(con, "protein_members", cluster_member, overwrite = TRUE)
+
   invisible(TRUE)
 }
 
@@ -1226,7 +1286,16 @@ domainFromIPR <- function(duckdb_path,
 }
 
 # Clean BV-BRC metadata, then save as Parquet files
-cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/") {
+#'
+#' @param duckdb_path
+#' @param path
+#' @param ref_file_path
+#'
+#' @returns
+#' @export
+#'
+#' @examples
+cleanMetaData <- function(duckdb_path, path, ref_file_path = "data_raw/") {
   duckdb_path <- normalizePath(duckdb_path)
   # If no explicit path is provided (or a generic one), choose results/<bug>/ when
   # the DuckDB lives under data/<bug>/, or else fall back to the DuckDB directory.
@@ -1258,8 +1327,11 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/") {
 
   dplyr::tbl(con, "filtered") |>
     tibble::as_tibble() |>
+    dplyr::select("genome.genome_id") |>
+    dplyr::left_join(dplyr::tbl(con, "metadata") |>
+      tibble::as_tibble(), by = dplyr::join_by("genome.genome_id" == "genome_drug.genome_id")) |>
     dplyr::select(
-      "genome_drug.genome_id", "genome_drug.antibiotic",
+      "genome.genome_id", "genome_drug.antibiotic",
       "genome_drug.genome_name", "genome_drug.laboratory_typing_method",
       "genome_drug.resistant_phenotype", "genome_drug.taxon_id",
       "genome_drug.pmid", "genome.collection_year",
@@ -1271,19 +1343,23 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/") {
     dplyr::left_join(drug_class, by = c("cleaned_drug" = "drug")) |>
     dplyr::left_join(drug_abbr, by = c("cleaned_drug" = "drug")) |>
     dplyr::left_join(class_abbr, by = "drug_class") |>
-    DBI::dbWriteTable(conn = con, name = "filtered", overwrite = TRUE)
+    dplyr::filter(genome_drug.resistant_phenotype %in% c("Resistant", "Susceptible")) |>
+    DBI::dbWriteTable(conn = con, name = "filtered_metadata", overwrite = TRUE)
 
-  resistance_summary <- dplyr::tbl(con, "filtered") |>
-    tibble::as_tibble() |>
+  resistance_summary <- DBI::dbReadTable(con, "filtered_metadata") |>
     dplyr::filter(genome_drug.resistant_phenotype == "Resistant") |>
-    dplyr::group_by(genome_drug.genome_id) |>
+    dplyr::group_by(genome.genome_id) |>
     dplyr::summarise(
-      num_resistant_classes = dplyr::n_distinct(drug_class),
-      resistant_classes = paste(unique(class_abbr), collapse = "_")
+      resistant_classes = paste(sort(unique(class_abbr)), collapse = "_"),
+      .groups = "drop"
+    ) |>
+    dplyr::collect() |>
+    dplyr::mutate(
+      num_resistant_classes = stringr::str_count(resistant_classes, "_") + 1
     )
 
-  year_breaks <- seq(1980, 2023, by = 5)
-  dplyr::tbl(con, "filtered") |>
+  year_breaks <- seq(1980, 2026, by = 5)
+  dplyr::tbl(con, "filtered_metadata") |>
     tibble::as_tibble() |>
     dplyr::mutate(genome_drug.antibiotic = cleaned_drug) |>
     dplyr::select(-cleaned_drug) |>
@@ -1291,7 +1367,7 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/") {
     dplyr::rename("cleaned_country" = "clean_name", "country_abbr" = "short_name") |>
     dplyr::mutate(genome.isolation_country = cleaned_country) |>
     dplyr::select(-cleaned_country) |>
-    dplyr::left_join(resistance_summary, by = "genome_drug.genome_id") |>
+    dplyr::left_join(resistance_summary, by = "genome.genome_id") |>
     dplyr::mutate(resistant_classes = dplyr::case_when(
       is.na(resistant_classes) ~ genome_drug.resistant_phenotype,
       TRUE ~ resistant_classes
@@ -1311,6 +1387,80 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/") {
     )) |>
     DBI::dbWriteTable(conn = con, name = "cleaned_metadata", overwrite = TRUE)
 
+  # Parquet output path
+  metadata_parquet <- file.path(path, "metadata.parquet") # cleaned_metadata exported as 'metadata'
+
+  # Also export AMR/genome/original metadata
+  amr_phenotype_parquet <- file.path(path, "amr_phenotype.parquet")
+  genome_data_parquet <- file.path(path, "genome_data.parquet")
+  original_metadata_parquet <- file.path(path, "original_metadata.parquet")
+
+  writeCompressedParquet <- function(df, path) {
+    arrow::write_parquet(
+      df,
+      path,
+      compression = "zstd",
+      compression_level = 9,
+      use_dictionary = TRUE
+    )
+  }
+
+  db_name <- duckdb_path |>
+    stringr::str_split_i(".duckdb", i = 1) |>
+    paste0("_parquet.duckdb")
+  con_new <- DBI::dbConnect(duckdb::duckdb(), db_name)
+  on.exit(try(DBI::dbDisconnect(con_new, shutdown = FALSE), silent = TRUE), add = TRUE)
+
+  # Views below reference parquet files by bare filename. Point DuckDB at the
+  # parquet directory so schema inference at CREATE VIEW time can resolve them.
+  DBI::dbExecute(con_new, sprintf("SET file_search_path='%s'", path))
+
+  # cleaned_metadata -> parquet + view (as metadata)
+  DBI::dbReadTable(con, "cleaned_metadata") |> writeCompressedParquet(metadata_parquet)
+  DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW metadata AS SELECT * FROM read_parquet('%s')", basename(metadata_parquet)))
+
+  # debug/complete views: amr_phenotype, genome_data, original_metadata
+  DBI::dbReadTable(con, "amr_phenotype") |> writeCompressedParquet(amr_phenotype_parquet)
+  DBI::dbReadTable(con, "genome_data") |> writeCompressedParquet(genome_data_parquet)
+  DBI::dbReadTable(con, "metadata") |> writeCompressedParquet(original_metadata_parquet)
+
+  DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW amr_phenotype AS SELECT * FROM read_parquet('%s')", basename(amr_phenotype_parquet)))
+  DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW genome_data AS SELECT * FROM read_parquet('%s')", basename(genome_data_parquet)))
+  DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW original_metadata AS SELECT * FROM read_parquet('%s')", basename(original_metadata_parquet)))
+
+  invisible(TRUE)
+}
+
+# Clean feature matrices, then save as Parquet files
+#'
+#' @param duckdb_path
+#' @param path
+#'
+#' @returns
+#' @export
+#'
+#' @examples
+cleanData <- function(duckdb_path, path) {
+  duckdb_path <- normalizePath(duckdb_path)
+  # If no explicit path is provided (or a generic one), choose results/<bug>/ when
+  # the DuckDB lives under data/<bug>/, or else fall back to the DuckDB directory.
+  if (missing(path) || path %in% c(".", "results", "results/")) {
+    bug_dir <- dirname(duckdb_path)
+    mapped_results <- sub(
+      paste0(.Platform$file.sep, "data", .Platform$file.sep),
+      paste0(.Platform$file.sep, "results", .Platform$file.sep),
+      bug_dir,
+      fixed = TRUE
+    )
+    path <- if (!identical(mapped_results, bug_dir)) mapped_results else bug_dir
+  }
+
+  path <- normalizePath(path, mustWork = FALSE)
+  if (!dir.exists(path)) dir.create(path, recursive = TRUE)
+
+  con <- DBI::dbConnect(duckdb::duckdb(), duckdb_path)
+  on.exit(try(DBI::dbDisconnect(con, shutdown = FALSE), silent = TRUE), add = TRUE)
+
   # Parquet output paths
   genes_parquet <- file.path(path, "gene_count.parquet")
   gene_names_parquet <- file.path(path, "gene_names.parquet")
@@ -1321,17 +1471,11 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/") {
   proteins_parquet <- file.path(path, "protein_count.parquet")
   domains_parquet <- file.path(path, "domain_count.parquet")
 
-  metadata_parquet <- file.path(path, "metadata.parquet") # cleaned_metadata exported as 'metadata'
-
   domain_names_parquet <- file.path(path, "domain_names.parquet")
   protein_names_parquet <- file.path(path, "protein_names.parquet")
 
   protein_cluster_seq_parquet <- file.path(path, "protein_seqs.parquet")
-
-  # Also export AMR/genome/original metadata
-  amr_phenotype_parquet <- file.path(path, "amr_phenotype.parquet")
-  genome_data_parquet <- file.path(path, "genome_data.parquet")
-  original_metadata_parquet <- file.path(path, "original_metadata.parquet")
+  protein_cluster_member_parquet <- file.path(path, "protein_members.parquet")
 
   writeCompressedParquet <- function(df, path) {
     arrow::write_parquet(
@@ -1385,10 +1529,6 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/") {
     writeCompressedParquet(struct_parquet)
   DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW struct AS SELECT * FROM read_parquet('%s')", basename(struct_parquet)))
 
-  # cleaned_metadata -> parquet + view (as metadata)
-  DBI::dbReadTable(con, "cleaned_metadata") |> writeCompressedParquet(metadata_parquet)
-  DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW metadata AS SELECT * FROM read_parquet('%s')", basename(metadata_parquet)))
-
   # names/seq tables -> parquet + views
   DBI::dbReadTable(con, "gene_names") |> writeCompressedParquet(gene_names_parquet)
   DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW gene_names AS SELECT * FROM read_parquet('%s')", basename(gene_names_parquet)))
@@ -1409,18 +1549,12 @@ cleanData <- function(duckdb_path, path, ref_file_path = "data_raw/") {
   DBI::dbReadTable(con, "protein_cluster_seq") |> writeCompressedParquet(protein_cluster_seq_parquet)
   DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW protein_seqs AS SELECT * FROM read_parquet('%s')", basename(protein_cluster_seq_parquet)))
 
+  DBI::dbReadTable(con, "protein_members") |> writeCompressedParquet(protein_cluster_member_parquet)
+  DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW protein_members AS SELECT * FROM read_parquet('%s')", protein_cluster_member_parquet))
+
   DBI::dbReadTable(con, "genome_gene_protein") |> writeCompressedParquet(genome_gene_protein_parquet)
   DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW genome_gene_protein AS SELECT * FROM read_parquet('%s')", basename(genome_gene_protein_parquet)))
 
-  # debug/complete views: amr_phenotype, genome_data, original_metadata
-  DBI::dbReadTable(con, "amr_phenotype") |> writeCompressedParquet(amr_phenotype_parquet)
-  DBI::dbReadTable(con, "genome_data") |> writeCompressedParquet(genome_data_parquet)
-  DBI::dbReadTable(con, "metadata") |> writeCompressedParquet(original_metadata_parquet)
-
-  DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW amr_phenotype AS SELECT * FROM read_parquet('%s')", basename(amr_phenotype_parquet)))
-  DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW genome_data AS SELECT * FROM read_parquet('%s')", basename(genome_data_parquet)))
-  DBI::dbExecute(con_new, sprintf("CREATE OR REPLACE VIEW original_metadata AS SELECT * FROM read_parquet('%s')", basename(original_metadata_parquet)))
-
   invisible(TRUE)
 }
 
@@ -1641,7 +1775,8 @@ runDataProcessing <- function(duckdb_path,
     stop("`ref_file_path` (directory with reference TSVs) must be provided to cleanData().")
   }
   if (isTRUE(verbose)) message("Cleaning metadata and exporting Parquet-backed views.")
-  cleanData(duckdb_path = duckdb_path, path = out_dir, ref_file_path = ref_file_path)
+  cleanMetaData(duckdb_path = duckdb_path, path = out_dir, ref_file_path = ref_file_path)
+  cleanData(duckdb_path = duckdb_path, path = out_dir)
 
   parquet_duckdb_path <- paste0(
     stringr::str_split_i(duckdb_path, ".duckdb", i = 1),
diff --git a/man/dot-audit_gaps.Rd b/man/dot-audit_gaps.Rd
new file mode 100644
index 0000000..86009c0
--- /dev/null
+++ b/man/dot-audit_gaps.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_curation.R
+\name{.audit_gaps}
+\alias{.audit_gaps}
+\title{Helps in auditing downloaded files to ensure everything's complete per ID}
+\usage{
+.audit_gaps(out_dir, ids, min_bytes = 100)
+}
+\description{
+Helps in auditing downloaded files to ensure everything's complete per ID
+}
+\keyword{internal}
diff --git a/man/dot-cli_dump_fastas_gto_chunk.Rd b/man/dot-cli_dump_fastas_gto_chunk.Rd
new file mode 100644
index 0000000..5d9327c
--- /dev/null
+++ b/man/dot-cli_dump_fastas_gto_chunk.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_curation.R
+\name{.cli_dump_fastas_gto_chunk}
+\alias{.cli_dump_fastas_gto_chunk}
+\title{Using p3-dump-genomes in CLI to fetch FASTA and .gto files}
+\usage{
+.cli_dump_fastas_gto_chunk(image, out_dir, genome_ids, tag, tries = 3L)
+}
+\description{
+Using p3-dump-genomes in CLI to fetch FASTA and .gto files
+}
+\keyword{internal}
diff --git a/man/dot-cli_export_gff_chunk.Rd b/man/dot-cli_export_gff_chunk.Rd
new file mode 100644
index 0000000..061da41
--- /dev/null
+++ b/man/dot-cli_export_gff_chunk.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_curation.R
+\name{.cli_export_gff_chunk}
+\alias{.cli_export_gff_chunk}
+\title{Exports GFF files from the downloaded GTO per genome in each chunk}
+\usage{
+.cli_export_gff_chunk(image, out_dir, genome_ids, tag, tries = 3L)
+}
+\description{
+Exports GFF files from the downloaded GTO per genome in each chunk
+}
+\keyword{internal}
diff --git a/man/dot-create_amr_tagged_view.Rd b/man/dot-create_amr_tagged_view.Rd
new file mode 100644
index 0000000..442c3bf
--- /dev/null
+++ b/man/dot-create_amr_tagged_view.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_curation.R
+\name{.create_amr_tagged_view}
+\alias{.create_amr_tagged_view}
+\title{Helps tag genomes with their AMR evidence for parsing}
+\usage{
+.create_amr_tagged_view(con)
+}
+\description{
+Helps tag genomes with their AMR evidence for parsing
+}
+\keyword{internal}
diff --git a/man/dot-docker_path.Rd b/man/dot-docker_path.Rd
index 8ea1e5a..d0b6f70 100644
--- a/man/dot-docker_path.Rd
+++ b/man/dot-docker_path.Rd
@@ -1,9 +1,11 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/data_processing.R
+% Please edit documentation in R/data_curation.R, R/data_processing.R
 \name{.docker_path}
 \alias{.docker_path}
-\title{Normalize a host filesystem path for use in Docker}
+\title{Helps normalize Docker paths}
 \usage{
+.docker_path(p)
+
 .docker_path(p)
 }
 \arguments{
diff --git a/man/dot-extractMembersInClusters.Rd b/man/dot-extractMembersInClusters.Rd
new file mode 100644
index 0000000..e45a8db
--- /dev/null
+++ b/man/dot-extractMembersInClusters.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_processing.R
+\name{.extractMembersInClusters}
+\alias{.extractMembersInClusters}
+\title{Parse CD-HIT \code{.clstr} output into a long-format mapping}
+\usage{
+.extractMembersInClusters(clustered_faa)
+}
+\arguments{
+\item{clustered_faa}{Base path to CD-HIT output (without \code{.clstr} extension).}
+}
+\value{
+A tibble with columns \code{cluster} and \code{member}.
+}
+\description{
+Reads a CD-HIT \code{.clstr} file and constructs a mapping of clusters to member feature ids.
+}
+\keyword{internal}
diff --git a/man/dot-filterGenomes.Rd b/man/dot-filterGenomes.Rd
index d6871df..eb31db4 100644
--- a/man/dot-filterGenomes.Rd
+++ b/man/dot-filterGenomes.Rd
@@ -7,11 +7,30 @@
 .filterGenomes(
   user_bacs,
   base_dir = ".",
+  evidence_mode = c("lab_only", "lab_or_comp", "comp_only", "any"),
   verbose = TRUE,
   fallback_to_bvbrc_cache = TRUE
 )
 }
+\arguments{
+\item{evidence_mode}{Character. One of:
+"lab_only"   (default) -> only laboratory evidence
+"lab_or_comp"          -> laboratory OR computational evidence
+"comp_only"            -> only computational evidence
+"any"                  -> no AMR required (from genome_data; Good only)}
+}
 \value{
+A list with a DuckDB connection and table_name = "filtered"
+Filter genomes by AMR phenotype and metadata, and store results in DuckDB
+
+Preferred path: use per-selection DB "metadata" table (from retrieveMetadata()) and
+apply evidence & genome_quality filters.
+
+Fallback path: if "metadata" is missing and fallback_to_bvbrc_cache = TRUE,
+read BV-BRC cache at <base_dir>/data/bvbrc/bvbrcData.duckdb ("bvbrc_bac_data"),
+derive genome IDs from user_bacs (taxon IDs or species substring), and
+write a minimal "filtered" table (without AMR evidence filtering).
+
 A list with a DuckDB connection and table_name = "filtered"
 }
 \description{
diff --git a/man/dot-ftpes_download_one.Rd b/man/dot-ftpes_download_one.Rd
new file mode 100644
index 0000000..b89f5d4
--- /dev/null
+++ b/man/dot-ftpes_download_one.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_curation.R
+\name{.ftpes_download_one}
+\alias{.ftpes_download_one}
+\title{Helps appropriately interface with BV-BRC FTPS server, and avoids getting stuck
+when malformed files can hang an FTPS connection by introducing safeguards}
+\usage{
+.ftpes_download_one(
+  genomeID,
+  out_dir,
+  connect_timeout = 10L,
+  max_time = 30L,
+  speed_time = 30L,
+  speed_limit = 2048L,
+  min_bytes = 100L
+)
+}
+\description{
+Helps appropriately interface with BV-BRC FTPS server, and avoids getting stuck
+when malformed files can hang an FTPS connection by introducing safeguards
+}
+\keyword{internal}
diff --git a/man/dot-ftpes_download_two_pass.Rd b/man/dot-ftpes_download_two_pass.Rd
new file mode 100644
index 0000000..e15d299
--- /dev/null
+++ b/man/dot-ftpes_download_two_pass.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_curation.R
+\name{.ftpes_download_two_pass}
+\alias{.ftpes_download_two_pass}
+\title{Helps manage FTPS downloading from BV-BRC, tryng a quick download first, and
+if that fails, trying a longer timeout 2nd pass at the end in case it was a
+hiccup. If 2nd pass fails, log and give up on that file.}
+\usage{
+.ftpes_download_two_pass(
+  genome_ids,
+  out_dir,
+  workers_first = 4L,
+  workers_second = 4L,
+  log_file = NULL
+)
+}
+\description{
+Helps manage FTPS downloading from BV-BRC, tryng a quick download first, and
+if that fails, trying a longer timeout 2nd pass at the end in case it was a
+hiccup. If 2nd pass fails, log and give up on that file.
+}
+\keyword{internal}
diff --git a/man/dot-generateDBname.Rd b/man/dot-generateDBname.Rd
index bebcf43..30ce51e 100644
--- a/man/dot-generateDBname.Rd
+++ b/man/dot-generateDBname.Rd
@@ -26,5 +26,4 @@ For numeric taxon IDs, it prefixes them with "tid_".
 .generateDBname(c("90371", "Bacillus subtilis"))
 .generateDBname(c("12345", "Escherichia coli", "Lactobacillus"))
 }
-
-}
+\keyword{internal}
diff --git a/man/dot-id_checker.Rd b/man/dot-id_checker.Rd
new file mode 100644
index 0000000..00fd8eb
--- /dev/null
+++ b/man/dot-id_checker.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_curation.R
+\name{.id_checker}
+\alias{.id_checker}
+\title{Helps ensure trailing 0s are retained in genome IDs for proper downloading}
+\usage{
+.id_checker(x)
+}
+\description{
+Helps ensure trailing 0s are retained in genome IDs for proper downloading
+}
+\keyword{internal}
diff --git a/man/dot-is_complete_set.Rd b/man/dot-is_complete_set.Rd
new file mode 100644
index 0000000..9bd4758
--- /dev/null
+++ b/man/dot-is_complete_set.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_curation.R
+\name{.is_complete_set}
+\alias{.is_complete_set}
+\title{Helps check if a complete set exists after DL (.fna + .PATRIC.faa + .PATRIC.gff)}
+\usage{
+.is_complete_set(dir, genomeID, min_bytes = 100)
+}
+\description{
+Helps check if a complete set exists after DL (.fna + .PATRIC.faa + .PATRIC.gff)
+}
+\keyword{internal}
diff --git a/man/dot-list_complete.Rd b/man/dot-list_complete.Rd
new file mode 100644
index 0000000..f473700
--- /dev/null
+++ b/man/dot-list_complete.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_curation.R
+\name{.list_complete}
+\alias{.list_complete}
+\title{Helps collate completed genomes into a set}
+\usage{
+.list_complete(dir, genome_ids, min_bytes = 100)
+}
+\description{
+Helps collate completed genomes into a set
+}
+\keyword{internal}
diff --git a/man/dot-pick_shell.Rd b/man/dot-pick_shell.Rd
new file mode 100644
index 0000000..fb4e8bf
--- /dev/null
+++ b/man/dot-pick_shell.Rd
@@ -0,0 +1,12 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_curation.R
+\name{.pick_shell}
+\alias{.pick_shell}
+\title{Helps run a shell inside a container, and prefers bash (don't we all?)}
+\usage{
+.pick_shell(image)
+}
+\description{
+Helps run a shell inside a container, and prefers bash (don't we all?)
+}
+\keyword{internal}
diff --git a/man/dot-retrieveQueryIDs.Rd b/man/dot-retrieveQueryIDs.Rd
index 094dde9..e8fbcce 100644
--- a/man/dot-retrieveQueryIDs.Rd
+++ b/man/dot-retrieveQueryIDs.Rd
@@ -6,21 +6,11 @@
 \usage{
 .retrieveQueryIDs(base_dir = ".", user_bacs, overwrite = FALSE, verbose = TRUE)
 }
-\arguments{
-\item{base_dir}{Character. Project root directory. Default ".".}
-
-\item{user_bacs}{Character vector. Mixed inputs of taxon IDs and/or species names.}
-
-\item{overwrite}{Logical. If FALSE and the DuckDB already exists for this selection, abort. Default: FALSE.}
-
-\item{verbose}{Logical.}
-}
 \value{
-A numeric vector of distinct \code{genome.genome_id}, or NULL if none found.
+A character vector of distinct \code{genome.genome_id}, or NULL if none found.
 }
 \description{
-Resolves user-provided taxa to taxon IDs, queries BV-BRC per unique taxon ID,
-and returns distinct genome IDs. Uses a per-selection DuckDB located under:
-<base_dir>/data/<bug_dir>/\if{html}{\out{<abbrev>}}.duckdb
-BV-BRC column names are preserved.
+Resolves user-provided taxa to taxon IDs using the local BV-BRC cache, then
+returns distinct genome IDs (Good + WGS/Complete). Also writes a 'bac_data'
+table to the per-selection DuckDB with a cache snapshot of core fields.
 }
diff --git a/man/prepareGenomes.Rd b/man/prepareGenomes.Rd
index 204f3f6..a764058 100644
--- a/man/prepareGenomes.Rd
+++ b/man/prepareGenomes.Rd
@@ -9,6 +9,7 @@ prepareGenomes(
   base_dir = ".",
   method = c("ftp", "cli"),
   overwrite = FALSE,
+  evidence_mode = c("lab_only", "lab_or_comp", "comp_only", "any"),
   verbose = TRUE
 )
 }
@@ -24,6 +25,10 @@ prepareGenomes(
 \item{overwrite}{Logical. Passed to metadata filtering and DuckDB creation.
 Default FALSE.}
 
+\item{evidence_mode}{Character. Sets what types of AMR evidence is acceptable.
+Default \code{lab_only}. \code{any} will not require AMR data for downloads. This will
+return very large download lists for many species!}
+
 \item{verbose}{Logical. Print progress messages. Default TRUE.}
 }
 \value{
diff --git a/man/retrieveGenomes.Rd b/man/retrieveGenomes.Rd
index 4006e94..1800b5d 100644
--- a/man/retrieveGenomes.Rd
+++ b/man/retrieveGenomes.Rd
@@ -10,10 +10,11 @@ retrieveGenomes(
   method = c("ftp", "cli"),
   image = "danylmb/bvbrc:5.3",
   skip_existing = TRUE,
-  ftp_workers = 8L,
+  ftp_workers = 4L,
   cli_fasta_workers = 4L,
   cli_gff_workers = 4L,
   chunk_size = 50L,
+  evidence_mode = c("lab_only", "lab_or_comp", "comp_only", "any"),
   verbose = TRUE
 )
 }
diff --git a/man/retrieveMetadata.Rd b/man/retrieveMetadata.Rd
index e450626..6c92a36 100644
--- a/man/retrieveMetadata.Rd
+++ b/man/retrieveMetadata.Rd
@@ -44,6 +44,6 @@ Tables written:
 \itemize{
 \item amr_phenotype
 \item genome_data
-\item metadata (inner join on genome ID field names as returned by BV-BRC)
+\item metadata (join on genome IDs returned by BV-BRC)
 }
 }