diff --git a/src/base/labels_nebius.config b/src/base/labels_nebius.config index 409ba948..809ec963 100644 --- a/src/base/labels_nebius.config +++ b/src/base/labels_nebius.config @@ -33,6 +33,9 @@ process { // Default disk space disk = 50.GB + // Always pull the latest image digest so nodes never serve a stale cached image + pod = [[imagePullPolicy: 'Always']] + // Retry for exit codes that have something to do with memory issues // always retry once errorStrategy = { exitStrat(task) } @@ -124,8 +127,8 @@ withLabel: veryhightime { time = 24.h } // similarity metric does not need veryhighmem resources withName: '.*similarity_process' { - memory = '50.GB' - disk = '50.GB' + memory = '100.GB' + disk = '100.GB' } } diff --git a/src/datasets/loaders/allen_brain_cell_atlas_merfish/script.py b/src/datasets/loaders/allen_brain_cell_atlas_merfish/script.py index 01afc383..01477528 100644 --- a/src/datasets/loaders/allen_brain_cell_atlas_merfish/script.py +++ b/src/datasets/loaders/allen_brain_cell_atlas_merfish/script.py @@ -227,6 +227,7 @@ def delete_dax_files(): raw_adata = anndata.read_h5ad(matrix_path) sample_adata = raw_adata[raw_adata.obs["sample_id"] == sample_id].copy() +del raw_adata fov_df = sample_adata.obs[["fov", "fov_x", "fov_y"]].drop_duplicates().copy() n_fovs = int(sample_adata.obs["fov"].max()) @@ -274,7 +275,9 @@ def um_to_px_y(y): 0.00001, compute_weight([xi, yi], [FRAME_SIZE_PIXELS, FRAME_SIZE_PIXELS], PERCENT_SCALING) ) -complete_img = np.zeros((n_z_planes, img_w, img_h), dtype="uint16") +# Store only the max-projection (MIP) rather than all z-planes to avoid a +# (n_z × H × W) array that can exceed 30+ GB for whole-brain datasets. +dapi_mip = np.zeros((img_w, img_h), dtype="uint16") for fov in range(n_fovs): if fov in missing_fovs: @@ -337,16 +340,16 @@ def um_to_px_y(y): ) # Skip overlap region already filled by a previously placed FOV - if np.any(complete_img[:, xp:xp_max, yp:yp + OVERLAP_SIZE] != 0): + if np.any(dapi_mip[xp:xp_max, yp:yp + OVERLAP_SIZE] != 0): fov_img = fov_img[:, :, OVERLAP_SIZE:] yp += OVERLAP_SIZE - if np.any(complete_img[:, xp:xp + OVERLAP_SIZE, yp:yp_max - OVERLAP_SIZE] != 0): + if np.any(dapi_mip[xp:xp + OVERLAP_SIZE, yp:yp_max - OVERLAP_SIZE] != 0): fov_img = fov_img[:, OVERLAP_SIZE:, :] xp += OVERLAP_SIZE - h = min(fov_img.shape[1], complete_img.shape[1] - xp) - w = min(fov_img.shape[2], complete_img.shape[2] - yp) - complete_img[:, xp:xp + h, yp:yp + w] = fov_img[:, :h, :w] + h = min(fov_img.shape[1], dapi_mip.shape[0] - xp) + w = min(fov_img.shape[2], dapi_mip.shape[1] - yp) + np.maximum(dapi_mip[xp:xp + h, yp:yp + w], fov_img[:, :h, :w].max(axis=0), out=dapi_mip[xp:xp + h, yp:yp + w]) if fov % 10 == 0: delete_dax_files() @@ -354,9 +357,6 @@ def um_to_px_y(y): print(datetime.now() - t0, "Image stitching complete", flush=True) -dapi_mip = complete_img.max(axis=0) # (img_w, img_h) max-projection over z -del complete_img - # ─── Step 4: Load and pixel-register transcripts ────────────────────────────── print(datetime.now() - t0, "Loading transcripts...", flush=True) diff --git a/src/datasets/loaders/tenx_atera/script.py b/src/datasets/loaders/tenx_atera/script.py index f2be855b..23a4e891 100644 --- a/src/datasets/loaders/tenx_atera/script.py +++ b/src/datasets/loaders/tenx_atera/script.py @@ -4,6 +4,7 @@ import zipfile import tempfile from pathlib import Path +import zarr from spatialdata_io import xenium ## VIASH START @@ -97,6 +98,7 @@ def extract_zip(input_zip: Path, output_dir: Path, strip_root: bool = False): if os.path.exists(par["output"]): shutil.rmtree(par["output"]) +zarr.config.set({"array.rectilinear_chunks": True}) sdata.write(par["output"]) print("Done", flush=True) \ No newline at end of file diff --git a/src/methods_segmentation/custom_segmentation/script.py b/src/methods_segmentation/custom_segmentation/script.py index c108117f..d29cb9ab 100644 --- a/src/methods_segmentation/custom_segmentation/script.py +++ b/src/methods_segmentation/custom_segmentation/script.py @@ -2,6 +2,7 @@ import anndata as ad import os import shutil +import pandas as pd ## VIASH START par = { @@ -20,14 +21,18 @@ assert par["labels_key"] in sdata.labels, f"Key '{par['labels_key']}' not found in input data." print(f"Copy segmentation from '{par['labels_key']}'", flush=True) +metadata = sdata.tables["metadata"] +# Select only the columns that exist — Xenium provides cell_id and region, +# Vizgen uses different column names (or an empty obs) so we take what's available. +obs_cols = [c for c in ["cell_id", "region"] if c in metadata.obs.columns] sdata_segmentation_only = sd.SpatialData( labels={ "segmentation": sdata[par["labels_key"]] }, tables={ "table": ad.AnnData( - obs=sdata.tables["metadata"].obs[["cell_id", "region"]], - var=sdata.tables["metadata"].var[[]] + obs=metadata.obs[obs_cols], + var=metadata.var[[]] ) } ) diff --git a/src/methods_segmentation/stardist/config.vsh.yaml b/src/methods_segmentation/stardist/config.vsh.yaml index 5b5207c0..b8a6e7dc 100644 --- a/src/methods_segmentation/stardist/config.vsh.yaml +++ b/src/methods_segmentation/stardist/config.vsh.yaml @@ -34,6 +34,7 @@ engines: - stardist - tensorflow==2.17.0 - numpy<2.0.0 + - scipy<1.15.0 - type: native runners: diff --git a/src/methods_transcript_assignment/basic_transcript_assignment/script.py b/src/methods_transcript_assignment/basic_transcript_assignment/script.py index e28fed31..20377b8a 100644 --- a/src/methods_transcript_assignment/basic_transcript_assignment/script.py +++ b/src/methods_transcript_assignment/basic_transcript_assignment/script.py @@ -1,6 +1,7 @@ import numpy as np import xarray as xr import dask +import dask.dataframe as dd import spatialdata as sd import anndata as ad import pandas as pd @@ -33,14 +34,14 @@ assert par['coordinate_system'] in segmentation_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in input data." print('Transforming transcripts coordinates', flush=True) -# Parquet partitions each start from index 0, causing duplicate index values in the -# combined dask DataFrame. sd.transform() internally builds pd.Series(..., index=transformed.index) -# which fails with "cannot reindex on an axis with duplicate labels". -# Fix: reset to a global monotonic index before transforming; restore attrs explicitly -# because reset_index() drops them, which would break spatialdata's PointsModel check. +# Multi-partition parquet files each start with a 0-based index, producing duplicate index +# values in the combined dask DataFrame. sd.transform() internally creates a pd.Series with +# index=transformed.index; when that dask index is computed it triggers an assign expression +# that fails on duplicate/lazy indices. Fix: materialize to pandas and rebuild as a single +# dask partition with a clean RangeIndex before transforming. # The original sdata[transcripts_key] is left unchanged so lines below remain consistent. transcripts_input = sdata[par['transcripts_key']] -transcripts_reset = transcripts_input.reset_index(drop=True) +transcripts_reset = dd.from_pandas(transcripts_input.compute().reset_index(drop=True), npartitions=1) transcripts_reset.attrs.update(transcripts_input.attrs) transcripts = sd.transform(transcripts_reset, to_coordinate_system=par['coordinate_system']) @@ -55,16 +56,14 @@ label_image = sdata_segm["segmentation"]["scale0"].image.to_numpy() else: label_image = sdata_segm["segmentation"].to_numpy() -cell_id_dask_series = dask.dataframe.from_dask_array( - dask.array.from_array( - label_image[y_coords, x_coords], chunks=tuple(sdata[par['transcripts_key']].map_partitions(len).compute()) - ), - index=sdata[par['transcripts_key']].index -) -sdata[par['transcripts_key']]["cell_id"] = cell_id_dask_series +# Assign cell ids directly to transcripts_reset (clean-index single-partition dask DataFrame). +# Using sdata[par['transcripts_key']] here would reintroduce the duplicate parquet index, +# causing the same "cannot reindex on an axis with duplicate labels" error at write time. +cell_ids = label_image[y_coords, x_coords] +transcripts_reset["cell_id"] = pd.Series(cell_ids) #create new .obs for cells based on the segmentation output (corresponding with the transcripts 'cell_id') -unique_cells = np.unique(cell_id_dask_series) +unique_cells = np.unique(cell_ids) # check if a '0' (noise/background) cell is in cell_id and remove zero_idx = np.where(unique_cells == 0) @@ -82,7 +81,7 @@ print('Subsetting to transcripts cell id data', flush=True) sdata_transcripts_only = sd.SpatialData( points={ - "transcripts": sdata[par['transcripts_key']] + "transcripts": transcripts_reset }, tables={ "table": ad.AnnData(