diff --git a/scripts/create_resources/spatial/process_10x_atera_nebius.sh b/scripts/create_resources/spatial/process_10x_atera_nebius.sh index baf016a08..d3475e586 100644 --- a/scripts/create_resources/spatial/process_10x_atera_nebius.sh +++ b/scripts/create_resources/spatial/process_10x_atera_nebius.sh @@ -14,7 +14,7 @@ cat > /tmp/params_atera.yaml << HERE param_list: - id: "10x_atera/2026_10x_human_breast_cancer_atera" - input: https://s3-us-west-2.amazonaws.com/10x.files/samples/atera/dev/WTA_Preview_FFPE_Breast_Cancer/WTA_Preview_FFPE_Breast_Cancer_xe_outs.zip + input: https://s3-us-west-2.amazonaws.com/10x.files/samples/atera/dev/WTA_Preview_FFPE_Breast_Cancer/WTA_Preview_FFPE_Breast_Cancer_outs.zip dataset_name: "Atera WTA FFPE Human Breast Cancer" dataset_url: "https://www.10xgenomics.com/datasets/atera-wta-ffpe-human-breast-cancer" dataset_summary: "Preview dataset showcasing the pre-commercial Atera Whole Transcriptome Assay (WTA) applied to FFPE human breast cancer tissue, profiling 18,028 genes and detecting 170,057 cells." 
diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index 5976079de..533a0c7ff 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -251,6 +251,9 @@ def subsample_adata_group_balanced(adata, group_key, n_samples, seed=0): filter_table=True, ) rechunk_sdata(sdata_output) #NOTE: rechunking currently needed (https://github.com/scverse/spatialdata/issues/929) + # metadata is dataset-level, not spatial — re-add it if the bounding_box query dropped it + if "metadata" in sdata.tables and "metadata" not in sdata_output.tables: + sdata_output["metadata"] = sdata.tables["metadata"] else: sdata_output = sdata diff --git a/src/datasets/workflows/process_tenx_atera/config.vsh.yaml b/src/datasets/workflows/process_tenx_atera/config.vsh.yaml index 5b43ab444..58148a028 100644 --- a/src/datasets/workflows/process_tenx_atera/config.vsh.yaml +++ b/src/datasets/workflows/process_tenx_atera/config.vsh.yaml @@ -82,4 +82,4 @@ dependencies: runners: - type: nextflow directives: - label: [highcpu, midmem, hightime] \ No newline at end of file + label: [highcpu, highmem, hightime] \ No newline at end of file diff --git a/src/methods_transcript_assignment/basic_transcript_assignment/script.py b/src/methods_transcript_assignment/basic_transcript_assignment/script.py index 0850c0f9a..e28fed31c 100644 --- a/src/methods_transcript_assignment/basic_transcript_assignment/script.py +++ b/src/methods_transcript_assignment/basic_transcript_assignment/script.py @@ -33,7 +33,16 @@ assert par['coordinate_system'] in segmentation_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in input data." print('Transforming transcripts coordinates', flush=True) -transcripts = sd.transform(sdata[par['transcripts_key']], to_coordinate_system=par['coordinate_system']) +# Parquet partitions each start from index 0, causing duplicate index values in the +# combined dask DataFrame. 
sd.transform() internally builds pd.Series(..., index=transformed.index) +# which fails with "cannot reindex on an axis with duplicate labels". +# Fix: replace the index via reset_index(drop=True) before transforming (NOTE(review): dask restarts the new index at 0 in every partition, so it is not globally monotonic; confirm the duplicate labels are actually removed); restore attrs explicitly +# because reset_index() drops them, which would break spatialdata's PointsModel check. +# The original sdata[transcripts_key] is left unchanged so lines below remain consistent. +transcripts_input = sdata[par['transcripts_key']] +transcripts_reset = transcripts_input.reset_index(drop=True) +transcripts_reset.attrs.update(transcripts_input.attrs) +transcripts = sd.transform(transcripts_reset, to_coordinate_system=par['coordinate_system']) # In case of a translation transformation of the segmentation (e.g. crop of the data), we need to adjust the transcript coordinates trans = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True)[par['coordinate_system']].inverse() diff --git a/src/metrics/similarity/config.vsh.yaml b/src/metrics/similarity/config.vsh.yaml index 79faa93bb..7a8dde9bc 100644 --- a/src/metrics/similarity/config.vsh.yaml +++ b/src/metrics/similarity/config.vsh.yaml @@ -101,4 +101,4 @@ runners: # Allows turning the component into a Nextflow module / pipeline. - type: nextflow directives: - label: [midtime, veryhighmem, midcpu] + label: [midtime, highmem, midcpu]