From 4f6836adeec526de58e33d8bbcfb7ae5df41f9d8 Mon Sep 17 00:00:00 2001 From: dariarom94 Date: Tue, 23 Jun 2026 00:14:28 +0200 Subject: [PATCH 1/4] troubleshoot loaders --- .../spatial/process_bruker_cosmx.sh | 1 - .../spatial/process_bruker_cosmx_nebius.sh | 1 - .../loaders/bruker_cosmx/config.vsh.yaml | 3 +- src/datasets/loaders/bruker_cosmx/script.py | 27 ++-- src/datasets/loaders/tenx_atera/script.py | 133 ++++++++++-------- 5 files changed, 92 insertions(+), 73 deletions(-) diff --git a/scripts/create_resources/spatial/process_bruker_cosmx.sh b/scripts/create_resources/spatial/process_bruker_cosmx.sh index c94164829..7e4a7ea29 100644 --- a/scripts/create_resources/spatial/process_bruker_cosmx.sh +++ b/scripts/create_resources/spatial/process_bruker_cosmx.sh @@ -25,7 +25,6 @@ param_list: - id: "bruker_cosmx/bruker_human_liver_cosmx" input_raw: "https://smi-public.objects.liquidweb.services/NormalLiverFiles.zip" - input_flat_files: "https://syncandshare.desy.de/index.php/s/zYT4fM28y86cZeW/download/NormalLiver.zip" dataset_name: "Bruker CosMx Human Liver" dataset_url: "https://nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/human-liver-rna-ffpe-dataset/" dataset_summary: "Bruker CosMx Human Liver dataset on FFPE." diff --git a/scripts/create_resources/spatial/process_bruker_cosmx_nebius.sh b/scripts/create_resources/spatial/process_bruker_cosmx_nebius.sh index f826be96b..9c00dfcb7 100644 --- a/scripts/create_resources/spatial/process_bruker_cosmx_nebius.sh +++ b/scripts/create_resources/spatial/process_bruker_cosmx_nebius.sh @@ -25,7 +25,6 @@ param_list: - id: "bruker_cosmx/bruker_human_liver_cosmx" input_raw: "https://smi-public.objects.liquidweb.services/NormalLiverFiles.zip" - input_flat_files: "https://syncandshare.desy.de/index.php/s/zYT4fM28y86cZeW/download/NormalLiver.zip" dataset_name: "Bruker CosMx Human Liver" dataset_url: "https://nanostring.com/products/cosmx-spatial-molecular-imager/ffpe-dataset/human-liver-rna-ffpe-dataset/" dataset_summary: "Bruker CosMx Human Liver dataset on FFPE." diff --git a/src/datasets/loaders/bruker_cosmx/config.vsh.yaml b/src/datasets/loaders/bruker_cosmx/config.vsh.yaml index d3f88e84a..0ed1a6bc5 100644 --- a/src/datasets/loaders/bruker_cosmx/config.vsh.yaml +++ b/src/datasets/loaders/bruker_cosmx/config.vsh.yaml @@ -11,7 +11,8 @@ argument_groups: - type: file name: --input_flat_files example: "https://smi-public.objects.liquidweb.services/Half%20%20Brain%20simple%20%20files%20.zip" - description: "Download file url for the flat files" + description: "Download file url for the flat files. Optional: only needed when flat files are not already present in the raw zip (e.g. mouse brain dataset)." + required: false - type: string name: --segmentation_id default: ["cell"] diff --git a/src/datasets/loaders/bruker_cosmx/script.py b/src/datasets/loaders/bruker_cosmx/script.py index e959950a0..00b2e3f64 100644 --- a/src/datasets/loaders/bruker_cosmx/script.py +++ b/src/datasets/loaders/bruker_cosmx/script.py @@ -167,18 +167,21 @@ def extract_zip(input_zip: Path, output_dir: Path, strip_root: bool = False): f"Contents of {INPUT_RAW_EXTRACTED}: {os.listdir(INPUT_RAW_EXTRACTED)}" ) -log("Extract zip of flat files") -INPUT_FLAT_FILES_EXTRACTED = TMP_DIR / "input_flat_files" -extract_zip(par["input_flat_files"], INPUT_FLAT_FILES_EXTRACTED, strip_root=True) - -log("Symlink csvs from flat files to data dir") -for path in INPUT_FLAT_FILES_EXTRACTED.glob("*.csv"): - target = DATA_DIR / path.name - if not target.exists(): - log(f"Symlink file {path.name} to {DATA_DIR}") - os.symlink(path.resolve(), target) - else: - log(f"File {path.name} already present in {DATA_DIR}") +if par["input_flat_files"]: + log("Extract zip of flat files") + INPUT_FLAT_FILES_EXTRACTED = TMP_DIR / "input_flat_files" + extract_zip(par["input_flat_files"], INPUT_FLAT_FILES_EXTRACTED, strip_root=True) + + log("Symlink csvs from flat files to data dir") + for path in INPUT_FLAT_FILES_EXTRACTED.glob("*.csv"): + target = DATA_DIR / path.name + if not target.exists(): + log(f"Symlink file {path.name} to {DATA_DIR}") + os.symlink(path.resolve(), target) + else: + log(f"File {path.name} already present in {DATA_DIR}") +else: + log("No flat files zip provided; assuming flat files are already present in the raw zip") # sopa expects a CellLabels/ folder, but some CosMx exports only ship the per-FOV # label tifs (inside FOV* folders). When that's the case, gather them into a diff --git a/src/datasets/loaders/tenx_atera/script.py b/src/datasets/loaders/tenx_atera/script.py index 9aa4b4ed3..f2be855b7 100644 --- a/src/datasets/loaders/tenx_atera/script.py +++ b/src/datasets/loaders/tenx_atera/script.py @@ -1,15 +1,14 @@ ## code author: Florian Heyl -import spatialdata as sd -import anndata as ad -from spatialdata_io import xenium import shutil import os import zipfile import tempfile +from pathlib import Path +from spatialdata_io import xenium ## VIASH START par = { - "input": "https://s3-us-west-2.amazonaws.com/10x.files/samples/atera/dev/WTA_Preview_FFPE_Breast_Cancer/WTA_Preview_FFPE_Breast_Cancer_xe_outs.zip", + "input": "temp/datasets/10x_atera/WTA_Preview_FFPE_Breast_Cancer_xe_outs.zip", "segmentation_id": [ "cell", "nucleus", @@ -25,61 +24,79 @@ } meta = { "cpus": 1, + "temp_dir": None, } ## VIASH END -# Download the data if it's a download url, extract the data if it's a zip file -par_input = par["input"] -with tempfile.TemporaryDirectory() as tmpdirname: - if par_input.startswith("http"): - print(f"Downloading data to {tmpdirname}", flush=True) - file_name = par_input.split("/")[-1] - os.system(f"wget {par['input']} -O {tmpdirname}/{file_name}") - par_input = tmpdirname + "/" + file_name - - if zipfile.is_zipfile(par_input): - print(f"Extracting input zip to {tmpdirname}", flush=True) - with zipfile.ZipFile(par_input, "r") as zip_ref: - zip_ref.extractall(tmpdirname) - # find the directory containing the Xenium output files (may be nested) - par_input = tmpdirname - for root, dirs, files in os.walk(tmpdirname): - if "cell_feature_matrix.h5" in files: - par_input = root - break - - # read the data - sdata = xenium( - path=par_input, - n_jobs=meta["cpus"] or 1, - cells_boundaries=True, - nucleus_boundaries=True, - morphology_focus=True, - cells_as_circles=False, - ) - - # remove morphology_focus - _ = sdata.images.pop("morphology_focus") - - print("Add uns to table", flush=True) - new_uns = { - "dataset_id": par["dataset_id"], - "dataset_name": par["dataset_name"], - "dataset_url": par["dataset_url"], - "dataset_reference": par["dataset_reference"], - "dataset_summary": par["dataset_summary"], - "dataset_description": par["dataset_description"], - "dataset_organism": par["dataset_organism"], - "segmentation_id": par["segmentation_id"], - } - for key, value in new_uns.items(): - sdata.tables["table"].uns[key] = value - - print(f"Output: {sdata}", flush=True) - - print(f"Writing to '{par['output']}'", flush=True) - if os.path.exists(par["output"]): - shutil.rmtree(par["output"]) - - sdata.write(par["output"]) \ No newline at end of file + +def extract_zip(input_zip: Path, output_dir: Path, strip_root: bool = False): + output_dir = Path(output_dir) + with zipfile.ZipFile(input_zip, 'r') as zip_ref: + members = zip_ref.infolist() + + roots = {Path(m.filename).parts[0] for m in members if m.filename.strip("/") and not m.filename.startswith("__MACOSX/")} + if not (strip_root and len(roots) == 1): + zip_ref.extractall(output_dir) + return + + for member in members: + if member.filename.startswith("__MACOSX/"): + continue + parts = Path(member.filename).parts[1:] + if not parts: + continue + target = output_dir.joinpath(*parts) + if member.is_dir(): + target.mkdir(parents=True, exist_ok=True) + else: + target.parent.mkdir(parents=True, exist_ok=True) + with zip_ref.open(member) as src, open(target, "wb") as dst: + shutil.copyfileobj(src, dst) + + +TMP_DIR = Path(meta["temp_dir"] or tempfile.mkdtemp()) +TMP_DIR.mkdir(parents=True, exist_ok=True) + +print("Extract input zip", flush=True) +input_extracted = TMP_DIR / "input" +extract_zip(Path(par["input"]), input_extracted, strip_root=True) + +print(f"Files in extracted dir: {os.listdir(input_extracted)}", flush=True) + +# read the data +sdata = xenium( + path=input_extracted, + n_jobs=meta["cpus"] or 1, + cells_boundaries=True, + nucleus_boundaries=True, + morphology_focus=True, + cells_as_circles=False, +) + +# remove morphology_focus +_ = sdata.images.pop("morphology_focus") + +print("Add uns to table", flush=True) +new_uns = { + "dataset_id": par["dataset_id"], + "dataset_name": par["dataset_name"], + "dataset_url": par["dataset_url"], + "dataset_reference": par["dataset_reference"], + "dataset_summary": par["dataset_summary"], + "dataset_description": par["dataset_description"], + "dataset_organism": par["dataset_organism"], + "segmentation_id": par["segmentation_id"], +} +for key, value in new_uns.items(): + sdata.tables["table"].uns[key] = value + +print(f"Output: {sdata}", flush=True) + +print(f"Writing to '{par['output']}'", flush=True) +if os.path.exists(par["output"]): + shutil.rmtree(par["output"]) + +sdata.write(par["output"]) + +print("Done", flush=True) \ No newline at end of file From db9715f7d248a0d4583f2eed415f4123ea001a1b Mon Sep 17 00:00:00 2001 From: dariarom94 Date: Tue, 23 Jun 2026 00:14:56 +0200 Subject: [PATCH 2/4] adjust mem --- src/base/labels_nebius.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/base/labels_nebius.config b/src/base/labels_nebius.config index f670e1b61..0a6e38918 100644 --- a/src/base/labels_nebius.config +++ b/src/base/labels_nebius.config @@ -74,7 +74,7 @@ process { // Nebius gpu-node-group: NVIDIA L40S, 1 GPU, 8 CPUs, 32 GiB RAM cpus = 6 accelerator = 1 - memory = 100.GB + memory = 32.GB disk = 200.GB pod = [[nodeSelector: 'nebius.com/node-group-id=mk8snodegroup-e00t775jb99svb7k5r']] containerOptions = { workflow.containerEngine == "singularity" ? '--nv': From 80cd60097cbc22b4563560dd18421907302c5bbf Mon Sep 17 00:00:00 2001 From: dariarom94 Date: Tue, 23 Jun 2026 01:04:33 +0200 Subject: [PATCH 3/4] fix memory attempt 2 --- src/base/labels_nebius.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/base/labels_nebius.config b/src/base/labels_nebius.config index 0a6e38918..409ba948d 100644 --- a/src/base/labels_nebius.config +++ b/src/base/labels_nebius.config @@ -74,7 +74,7 @@ process { // Nebius gpu-node-group: NVIDIA L40S, 1 GPU, 8 CPUs, 32 GiB RAM cpus = 6 accelerator = 1 - memory = 32.GB + memory = 28.GB disk = 200.GB pod = [[nodeSelector: 'nebius.com/node-group-id=mk8snodegroup-e00t775jb99svb7k5r']] containerOptions = { workflow.containerEngine == "singularity" ? '--nv': From a3a78184bcf078287ed9ce5eb85423dd628f86d3 Mon Sep 17 00:00:00 2001 From: dariarom94 Date: Tue, 23 Jun 2026 13:09:05 +0200 Subject: [PATCH 4/4] adjust config to fit atera test requirements --- src/datasets/loaders/tenx_atera/config.vsh.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/datasets/loaders/tenx_atera/config.vsh.yaml b/src/datasets/loaders/tenx_atera/config.vsh.yaml index 25ed630fe..6da3a43bf 100644 --- a/src/datasets/loaders/tenx_atera/config.vsh.yaml +++ b/src/datasets/loaders/tenx_atera/config.vsh.yaml @@ -60,7 +60,8 @@ engines: setup: - type: python pypi: - - spatialdata-io + - spatialdata-io==0.6.0 + - spatialdata==0.7.2 - type: native runners: