diff --git a/app.py b/app.py index 194d857..e5e4d0e 100644 --- a/app.py +++ b/app.py @@ -23,12 +23,18 @@ st.Page(Path("content", "results_rescoring.py"), title="Rescoring", icon="πŸ“ˆ"), st.Page(Path("content", "results_filtered.py"), title="Filtered PSMs", icon="🎯"), st.Page(Path("content", "results_abundance.py"), title="Abundance", icon="πŸ“‹"), + st.Page(Path("content", "results_library.py"), title="Spectral Library", icon="πŸ“š"), + st.Page(Path("content", "enrichment.py"), title="Pathway Analysis", icon="πŸ“‰"), + ], + "Differential Protein Analysis": [ + st.Page(Path("content", "filtering.py"), title="Filtering", icon="🧹"), + st.Page(Path("content", "imputation.py"), title="Imputation", icon="🩹"), + st.Page(Path("content", "normalization.py"), title="Normalization", icon="βš–οΈ"), + st.Page(Path("content", "statistical.py"), title="Statistical", icon="πŸ”’"), st.Page(Path("content", "results_volcano.py"), title="Volcano", icon="πŸŒ‹"), st.Page(Path("content", "results_pca.py"), title="PCA", icon="πŸ“Š"), st.Page(Path("content", "results_heatmap.py"), title="Heatmap", icon="πŸ”₯"), - st.Page(Path("content", "results_library.py"), title="Spectral Library", icon="πŸ“š"), - st.Page(Path("content", "results_proteomicslfq.py"), title="Proteomics LFQ", icon="πŸ§ͺ"), - ], + ] } pg = st.navigation(pages) diff --git a/content/enrichment.py b/content/enrichment.py new file mode 100644 index 0000000..d8c5f51 --- /dev/null +++ b/content/enrichment.py @@ -0,0 +1,140 @@ +"""Pathway Analysis Page.""" + +from pathlib import Path +import pandas as pd +import polars as pl +import streamlit as st +from src.common.common import page_setup +from src.common.results_helpers import get_abundance_data +# Import GO Enrichment modules from openms_insight engine +from openms_insight.analysis.enrichment import calculate_go_enrichment + +params = page_setup() +st.title("GO Enrichment Analysis") + +st.markdown( + """ +Identify overrepresented biological themes (BP, CC, MF) within your differentially expressed protein features using MyGene.info and Fisher's Exact Test. +""" +) + +if "workspace" not in st.session_state: + st.warning("Please initialize your workspace first.") + st.stop() + +# --- STEP 1: Upstream Statistics Checkpoint --- +if ( + "statistics_df" in st.session_state + and st.session_state["statistics_df"] is not None +): + final_statistics_report = st.session_state["statistics_df"] + st.info( + "πŸ”„ **Upstream Pipeline Detected**: Using analyzed matrices from the **Statistical Inference** step." + ) +else: + st.warning( + "⚠️ **Missing Prerequisites**: Statistical inference data not detected. Please run hypothesis testing first." + ) + st.page_link( + "content/results_statistics.py", label="Go to Statistical Inference", icon="πŸ”¬" + ) + st.stop() + +# --- STEP 2: Preprocessing Mapping Key Configuration --- +# Identify target identifier columns dynamically +id_col = "ProteinName" +if id_col not in final_statistics_report.columns: + st.error(f"❌ Structural Error: Column '{id_col}' is missing from the active matrix context.") + st.stop() + +# --- SECTION 1: Parameter Setup & Dynamic Cutoff Labels --- +st.subheader("Configure Enrichment Thresholds") + +# Check if target p-value should be adjusted or raw based on previous selections (Fallback safely to 'p-adj') +target_p_col = "p-adj" if "p-adj" in final_statistics_report.columns else "p-value" +p_label = ( + "Adjusted P-value (p-adj) Cutoff" + if target_p_col == "p-adj" + else "Raw P-value (p-value) Cutoff" +) + +ui_go_col1, ui_go_col2 = st.columns(2) + +with ui_go_col1: + p_cutoff = st.number_input( + f"πŸ”¬ {p_label}", + min_value=0.0001, + max_value=1.0, + value=0.05, + step=0.01, + format="%.4f", + help="Proteins with significance metrics below this value are mapped to the foreground cohort.", + ) + +with ui_go_col2: + fc_cutoff = st.number_input( + "πŸ“ˆ Absolute Difference Cutoff (|log2FC|)", + min_value=0.0, + max_value=10.0, + value=1.0, + step=0.1, + format="%.2f", + help="Proteins with absolute log2 fold change greater than or equal to this threshold will be selected.", + ) + +# --- SECTION 2: Execution and Interactive View Charts --- +st.markdown("
", unsafe_allow_html=True) +if st.button("πŸš€ Run GO Enrichment Analysis", type="primary", key="run_go_analysis"): + + with st.spinner("Querying MyGene.info API & executing hyper-geometric calculation loops..."): + # Convert internal pandas DataFrame to openms_insight Polars DataFrame expectation + stats_pl = pl.from_pandas(final_statistics_report) + + status, output = calculate_go_enrichment( + final_report=stats_pl, + id_col=id_col, + target_p_col=target_p_col, + p_cutoff=p_cutoff, + fc_cutoff=fc_cutoff, + ) + + # Route response structures based on analysis output status code + if status == "empty_data": + st.error("❌ No valid statistical rows found containing standard columns to run GO alignment.") + + elif status == "insufficient_proteins": + st.warning( + f"⚠️ Not enough significant proteins found to construct target datasets. " + f"(Criteria: {target_p_col} < {p_cutoff:.4f}, |log2FC| β‰₯ {fc_cutoff:.2f})." + ) + st.info(f"πŸ’‘ Found significant proteins count: **{output}**. Try relaxing your p-value or log2FC filters.") + + elif status == "success": + st.success("β­• GO Enrichment Analysis completed successfully!") + + # Display operational matrix scale + st.markdown( + f"πŸ“Š **Analysis Profile Scope**: Mapped **{output['fg_count']}** significant foreground profiles out of **{output['bg_count']}** reference background items." + ) + + # Build multi-tab interface layer for ontology subcategories + tabs = st.tabs([ + "🧬 Biological Process (BP)", + "πŸ”¬ Cellular Component (CC)", + "πŸ§ͺ Molecular Function (MF)" + ]) + categories_data = output["categories"] + + for idx, go_type in enumerate(["BP", "CC", "MF"]): + with tabs[idx]: + fig = categories_data[go_type]["fig"] + df_go = categories_data[go_type]["df"] + + if fig is not None and df_go is not None: + # Render plotly bar figures generated straight from backend engine + st.plotly_chart(fig, use_container_width=True) + + st.subheader(f"πŸ“Š {go_type} Results Dataframe") + st.dataframe(df_go, use_container_width=True) + else: + st.info(f"No statistically overrepresented terms identified for Category: **{go_type}**") \ No newline at end of file diff --git a/content/filtering.py b/content/filtering.py new file mode 100644 index 0000000..402c29b --- /dev/null +++ b/content/filtering.py @@ -0,0 +1,163 @@ +"""Filtering Page.""" + +from pathlib import Path +import pandas as pd +import polars as pl +import streamlit as st +from src.common.common import page_setup +from src.common.results_helpers import get_abundance_data + +# Import filtering functions from openms_insight package +from openms_insight.analysis.filter import ( + filter_low_abundance, + filter_low_repeatability, + filter_low_variance, +) + +params = page_setup() +st.title("Data Filtering") + +st.markdown( + """ +Filter out low-quality proteins from your dataset based on abundance, repeatability, or variance thresholds. +""" +) + +if "workspace" not in st.session_state: + st.warning("Please initialize your workspace first.") + st.stop() + +result = get_abundance_data(st.session_state["workspace"]) +if result is None: + st.info( + "Abundance data not available. Please run the workflow and configure sample groups first." + ) + st.page_link( + "content/results_abundance.py", label="Go to Abundance", icon="πŸ“‹" + ) + st.stop() + +pivot_df, group_map = result + +# 1. Identify actual sample columns dynamically +sample_cols = [ + c + for c in pivot_df.columns + if c not in ["ProteinName", "PeptideSequence"] +] + +# --- SECTION 1: Original Data View --- +st.subheader("Original Abundance Table") +st.markdown( + f"Currently displaying **{pivot_df.shape[0]}** proteins and **{len(sample_cols)}** samples before filtering." +) +st.dataframe(pivot_df, use_container_width=True) + +st.markdown("---") + +# --- SECTION 2: Filter Configuration --- +st.subheader("Configure Filter Engine") + +# Prepare Polars Metadata DataFrame required by openms_insight functions +metadata_rows = [{"sample_id": s, "group": group_map[s]} for s in sample_cols] +metadata_pl = pl.DataFrame( + metadata_rows, schema={"sample_id": pl.String, "group": pl.String} +) + +# User selection for filtering strategy +filter_method = st.selectbox( + "Select Filtering Method", + options=["Low Abundance", "Low Repeatability", "Low Variance"], + index=0, + help="Choose the statistical criteria to prune unreliable protein entries.", +) + +# Render threshold sliders dynamically based on the selected filter method +if filter_method == "Low Abundance": + st.markdown( + "**Low Abundance Filter**: Keeps rows where at least one group's median is above the selected percentile threshold." + ) + threshold = st.slider( + "Threshold Percentile (%)", + min_value=0.0, + max_value=100.0, + value=10.0, + step=5.0, + ) + +elif filter_method == "Low Repeatability": + st.markdown( + "**Low Repeatability Filter**: Keeps rows where at least one group has a missing value ratio within the allowed maximum." + ) + threshold = st.slider( + "Max Missing Ratio", + min_value=0.0, + max_value=100.0, + value=50.0, + step=5.0, + help="Allowed missing value (zero or null) ratio per group.", + ) + +elif filter_method == "Low Variance": + st.markdown( + "**Low Variance Filter**: Keeps rows where at least one group's variance is above the selected percentile threshold." + ) + threshold = st.slider( + "Threshold Percentile (%)", + min_value=0.0, + max_value=100.0, + value=10.0, + step=5.0, + ) + +# --- SECTION 3: Filter Execution and Collected Results View --- +if st.button("Apply Filter", type="primary"): + # Convert the original Pandas DataFrame into a Polars LazyFrame graph + quant_lazy = pl.from_pandas(pivot_df).lazy() + + # Route execution to the chosen openms_insight engine function + if filter_method == "Low Abundance": + filtered_lazy = filter_low_abundance( + quantification_data=quant_lazy, + metadata=metadata_pl, + group_column="group", + threshold_percentile=threshold, + ) + elif filter_method == "Low Repeatability": + # Convert percent slider input to ratio expected by the function (e.g., 50.0% -> 0.5) + filtered_lazy = filter_low_repeatability( + quantification_data=quant_lazy, + metadata=metadata_pl, + group_column="group", + max_missing_ratio=threshold / 100.0, + ) + elif filter_method == "Low Variance": + filtered_lazy = filter_low_variance( + quantification_data=quant_lazy, + metadata=metadata_pl, + group_column="group", + threshold_percentile=threshold, + ) + + # Collect the evaluated lazy graph and convert back to Pandas for visualization + filtered_df = filtered_lazy.collect().to_pandas() + st.session_state["filtered_df"] = filtered_df + + # Layout response metrics and the filtered matrix + st.success(f"Successfully applied **{filter_method}** filter!") + + # Display dataset scale compression stats + col1, col2, col3 = st.columns(3) + col1.metric("Original Proteins", pivot_df.shape[0]) + col2.metric("Filtered Proteins", filtered_df.shape[0]) + col3.metric( + "Removed Proteins", pivot_df.shape[0] - filtered_df.shape[0], delta=None + ) + + st.subheader("Filtered Abundance Table") + if filtered_df.empty: + st.warning( + "The filtered table is empty. Try relaxing the threshold constraints." + ) + else: + st.dataframe(filtered_df, use_container_width=True) \ No newline at end of file diff --git a/content/imputation.py b/content/imputation.py new file mode 100644 index 0000000..1cbbb5e --- /dev/null +++ b/content/imputation.py @@ -0,0 +1,134 @@ +"""Imputation Page.""" + +from pathlib import Path +import pandas as pd +import polars as pl +import streamlit as st +from src.common.common import page_setup +from src.common.results_helpers import get_abundance_data + +# Import imputation algorithms from openms_insight engine +from openms_insight.analysis.imputation import impute_mar, impute_smallest_value + +params = page_setup() +st.title("Missing Value Imputation") + +st.markdown( + """ +Handle missing values (zeros or nulls) in your quantification matrix using biological group-aware (MAR) or absolute lowest limit (MNAR) techniques. +""" +) + +if "workspace" not in st.session_state: + st.warning("Please initialize your workspace first.") + st.stop() + +# Load base dataset and clean dictionary keys +result = get_abundance_data(st.session_state["workspace"]) +if result is None: + st.info( + "Abundance data not available. Please run the workflow and configure sample groups first." + ) + st.page_link( + "content/results_abundance.py", label="Go to Abundance", icon="πŸ“‹" + ) + st.stop() + +pivot_df, group_map = result + +# 1. Pipeline Checkpoint: Fetch upstream filtered data if available, fallback to raw pivot matrix +if "filtered_df" in st.session_state and st.session_state["filtered_df"] is not None: + base_df = st.session_state["filtered_df"] + st.info( + "πŸ”„ **Upstream Pipeline Detected**: Using data processed from the **Filtering** step." + ) +else: + base_df = pivot_df + st.warning( + "⚠️ **Raw Input Active**: No filtering history found. Operating on the original unfiltered table." + ) + +# 2. Identify actual sample columns dynamically based on the current active matrix +sample_cols = [ + c for c in base_df.columns if c not in ["ProteinName", "PeptideSequence"] +] + +# --- SECTION 1: Input Matrix Summary --- +st.subheader("Input Matrix Overview") +st.markdown( + f"Currently analyzing **{base_df.shape[0]}** rows across **{len(sample_cols)}** samples before imputation." +) +st.dataframe(base_df, use_container_width=True) + +st.markdown("---") + +# --- SECTION 2: Imputation Configuration --- +st.subheader("Configure Imputation Engine") + +# Build Polars structural metadata DataFrame +metadata_rows = [{"sample_id": s, "group": group_map[s]} for s in sample_cols] +metadata_pl = pl.DataFrame( + metadata_rows, schema={"sample_id": pl.String, "group": pl.String} +) + +# User selection for core missingness assumption strategy +impute_category = st.selectbox( + "Select Imputation Class", + options=["MAR (Missing At Random)", "MNAR (Missing Not At Random)"], + index=0, + help="MAR uses group metrics (Mean/Median). MNAR shifts values below the limit of detection.", +) + +# Render algorithmic options sub-menus based on the parent selection +if impute_category == "MAR (Missing At Random)": + st.markdown( + "**Group Character Imputation**: Fills missing metrics leveraging sample properties belonging to the same group." + ) + strategy_opt = st.radio( + "Mathematical Strategy", + options=["median", "mean"], + index=0, + horizontal=True, + ) + +elif impute_category == "MNAR (Missing Not At Random)": + st.markdown( + "**Smallest Value Imputation**: Replaces missing items with the minimum values detected to reflect technical dropout limits." + ) + scope_opt = st.radio( + "Detection Minimum Scope", + options=["row", "global"], + index=0, + horizontal=True, + help="'row' targets current protein minimum; 'global' searches the entire mass spectrometry matrix profile.", + ) + +# --- SECTION 3: Imputation Execution --- +if st.button("Apply Imputation", type="primary"): + # Initialize optimization pipeline graph via lazy loading conversion + quant_lazy = pl.from_pandas(base_df).lazy() + + # Route configuration matrix parameters to designated engine function channels + if impute_category == "MAR (Missing At Random)": + imputed_lazy = impute_mar( + quantification_data=quant_lazy, + metadata=metadata_pl, + group_column="group", + strategy=strategy_opt, + ) + elif impute_category == "MNAR (Missing Not At Random)": + imputed_lazy = impute_smallest_value( + quantification_data=quant_lazy, metadata=metadata_pl, scope=scope_opt + ) + + # Resolve lazy graph optimization tree and push to display data frame structure + imputed_df = imputed_lazy.collect().to_pandas() + + # πŸ’Ύ Save current output into Session State for down-stream processing (Normalization, Statistics) + st.session_state["imputed_df"] = imputed_df + + st.success(f"Successfully finalized **{impute_category}** imputation step!") + + # Calculate and display a quick performance matrix check + st.subheader("Imputed Result Table") + st.dataframe(imputed_df, use_container_width=True) \ No newline at end of file diff --git a/content/normalization.py b/content/normalization.py new file mode 100644 index 0000000..a064598 --- /dev/null +++ b/content/normalization.py @@ -0,0 +1,182 @@ +"""Normalization Page.""" + +from pathlib import Path +import pandas as pd +import polars as pl +import streamlit as st +from src.common.common import page_setup +from src.common.results_helpers import get_abundance_data +# Import normalization engine functions from openms_insight +from openms_insight.analysis.normalization import ( + normalize_samples, + scale_data, + transform_data, +) + +params = page_setup() +st.title("Data Normalization & Scaling") + +st.markdown( + """ +Standardize and transform your protein abundance profiles to correct for technical variations and optimize statistical distributions. +""" +) + +if "workspace" not in st.session_state: + st.warning("Please initialize your workspace first.") + st.stop() + +# Load primary database assets +result = get_abundance_data(st.session_state["workspace"]) +if result is None: + st.info( + "Abundance data not available. Please run the workflow and configure sample groups first." + ) + st.page_link( + "content/results_abundance.py", label="Go to Abundance", icon="πŸ“‹" + ) + st.stop() + +pivot_df, group_map = result + +# --- STEP 1: Upstream Pipeline Tracker (Fallback Architecture) --- +if ( + "imputed_df" in st.session_state + and st.session_state["imputed_df"] is not None +): + base_df = st.session_state["imputed_df"] + st.info( + "πŸ”„ **Upstream Pipeline Detected**: Using data processed from the **Imputation** step." + ) +elif ( + "filtered_df" in st.session_state + and st.session_state["filtered_df"] is not None +): + base_df = st.session_state["filtered_df"] + st.warning( + "⚠️ **Imputation Skipped**: Using data processed from the **Filtering** step." + ) +else: + base_df = pivot_df + st.warning( + "⚠️ **Raw Input Active**: No preprocessing history found. Operating on the original unfiltered table." + ) + +# 2. Extract actual active sample columns dynamically +sample_cols = [ + c for c in base_df.columns if c not in ["ProteinName", "PeptideSequence"] +] + +# --- SECTION 1: Active Input Table Preview --- +st.subheader("Input Table Overview") +st.markdown( + f"Currently displaying **{base_df.shape[0]}** rows and **{len(sample_cols)}** samples entering the normalization block." +) +st.dataframe(base_df, use_container_width=True) + +st.markdown("---") + +# --- SECTION 2: Normalization Parameter Configuration --- +st.subheader("Configure Preprocessing & Scaling Chains") + +# Prepare structural Polars metadata DataFrame required by backend functions +metadata_rows = [{"sample_id": s, "group": group_map[s]} for s in sample_cols] +metadata_pl = pl.DataFrame( + metadata_rows, schema={"sample_id": pl.String, "group": pl.String} +) + +col1, col2, col3 = st.columns(3) + +with col1: + st.markdown("### 🧬 1. Mathematical Transformation") + transform_strategy = st.selectbox( + "Select Transformation", + options=["None", "log2", "log10", "square_root", "cube_root"], + index=0, + help="Compress data dynamic range and stabilize heteroscedastic variance profiles.", + ) + +with col2: + st.markdown("### πŸ§ͺ 2. Sample Normalization") + norm_strategy = st.selectbox( + "Select Normalization", + options=["None", "sum", "median", "pqn", "reference_feature", "quantile"], + index=0, + help="Perform column-wise corrections to account for variable sample loading concentrations.", + ) + + # Conditionally display target input field for reference feature matching + ref_feature_input = None + if norm_strategy == "reference_feature": + ref_feature_input = st.text_input( + "Reference Protein Name (ID)", + value="", + placeholder="e.g., P01234 or GAPDH", + help="Enter the exact unique identifier string matching a key inside the 'ProteinName' column.", + ) + +with col3: + st.markdown("### πŸ“Š 3. Row Scaling") + scaling_strategy = st.selectbox( + "Select Scaling Mode", + options=["None", "mean_centering", "auto_scaling", "pareto_scaling", "range_scaling"], + index=0, + help="Adjust individual feature weights to make low and high abundance proteins comparable.", + ) + + +# --- SECTION 3: Normalization Pipe Sequential Execution --- +st.markdown("
", unsafe_allow_html=True) +if st.button("Apply Normalization Pipelines", type="primary"): + + # Validate reference feature selection if active before hitting polars execution layers + if norm_strategy == "reference_feature" and not ref_feature_input: + st.error( + "❌ Validation Error: Please provide a valid Reference Protein Name to use the 'reference_feature' strategy." + ) + st.stop() + + # Convert pandas memory buffer into optimization lazy dataframe tree graph + processing_lazy = pl.from_pandas(base_df).lazy() + + # Execute Chain 1: Transform Matrix Data + try: + processing_lazy = transform_data( + quantification_data=processing_lazy, + metadata=metadata_pl, + strategy=transform_strategy, + ) + + # Execute Chain 2: Normalize Sample Intensities (Columns) + processing_lazy = normalize_samples( + quantification_data=processing_lazy, + metadata=metadata_pl, + strategy=norm_strategy, + id_col="ProteinName", + reference_feature=ref_feature_input if norm_strategy == "reference_feature" else None, + ) + + # Execute Chain 3: Scale Individual Features (Rows) + processing_lazy = scale_data( + quantification_data=processing_lazy, + metadata=metadata_pl, + strategy=scaling_strategy, + ) + + # Finalize and collect pipeline query graph optimizations + normalized_df = processing_lazy.collect().to_pandas() + + # πŸ’Ύ Save processing checkpoint inside Session State for Downstream (Statistics Block) + st.session_state["normalized_df"] = normalized_df + + st.success("Successfully executed all selected normalization pipelines!") + + # Display the finalized transformation matrix view + st.subheader("Normalized Abundance Table") + st.dataframe(normalized_df, use_container_width=True) + + except ValueError as val_err: + # Gracefully handle validation failures raised from the engine layers (e.g., missing reference protein) + st.error(f"Engine Configuration Error: {str(val_err)}") + except Exception as e: + st.error(f"An unexpected pipeline error occurred: {str(e)}") \ No newline at end of file diff --git a/content/results_abundance.py b/content/results_abundance.py index a7ff453..b391fc2 100644 --- a/content/results_abundance.py +++ b/content/results_abundance.py @@ -59,25 +59,33 @@ st.page_link("content/workflow_configure.py", label="Go to Configure", icon="βš™οΈ") st.stop() - pivot_df, expr_df, group_map = result + pivot_df, group_map = result - # Display group comparison info - groups = sorted(set(group_map.values())) - if len(groups) >= 2: - group1, group2 = sorted(groups)[:2] - st.info(f"Statistical comparison: **{group2} vs {group1}**") + # st.write("------------ pivot_df -------------") + # st.write(pivot_df) - # Get sample columns (between stats and PeptideSequence) - sample_cols = [c for c in pivot_df.columns if c not in ["ProteinName", "log2FC", "p-value", "PeptideSequence"]] + # st.write("----------group_map-------------") + # st.write(group_map) + # 1. Dynamically extract actual sample columns, excluding ProteinName and PeptideSequence + sample_cols = [ + c + for c in pivot_df.columns + if c not in ["ProteinName", "PeptideSequence"] + ] + + # 2. Combine values from sample columns to create an 'Intensity' list column for the bar chart pivot_df["Intensity"] = pivot_df[sample_cols].apply(list, axis=1) - # Reorder columns: place Intensity after p-value - display_cols = ["ProteinName", "log2FC", "p-value", "Intensity"] + sample_cols + ["PeptideSequence"] + # 3. Reorder columns: [ProteinName, Intensity(chart), sample_cols..., PeptideSequence] + display_cols = ( + ["ProteinName", "Intensity"] + sample_cols + ["PeptideSequence"] + ) display_df = pivot_df[display_cols] + # 4. Display the dataframe as is without sorting, since statistical columns are removed st.dataframe( - display_df.sort_values("p-value"), + display_df, column_config={ "Intensity": st.column_config.BarChartColumn( "Intensity", @@ -108,4 +116,4 @@ with col2: st.page_link("content/results_pca.py", label="PCA", icon="πŸ“Š") with col3: - st.page_link("content/results_heatmap.py", label="Heatmap", icon="πŸ”₯") + st.page_link("content/results_heatmap.py", label="Heatmap", icon="πŸ”₯") \ No newline at end of file diff --git a/content/results_heatmap.py b/content/results_heatmap.py index 4ece3f4..1106377 100644 --- a/content/results_heatmap.py +++ b/content/results_heatmap.py @@ -1,19 +1,21 @@ """Heatmap Results Page.""" import streamlit as st import numpy as np +import polars as pl import plotly.express as px from scipy.cluster.hierarchy import linkage, leaves_list from scipy.spatial.distance import pdist from src.common.common import page_setup from src.common.results_helpers import get_abundance_data +from openms_insight import Heatmap params = page_setup() st.title("Heatmap") st.markdown( """ -Hierarchically clustered heatmap of protein-level abundance (Z-score normalized). -Proteins and samples are ordered by similarity. +Interactive hierarchically clustered heatmap of protein-level abundance (Z-score normalized). +Powered by OpenMS-Insight multi-resolution engine. """ ) @@ -21,49 +23,79 @@ st.warning("Please initialize your workspace first.") st.stop() +# 1. Use the refactored get_abundance_data function (returns only pivot_df and group_map) result = get_abundance_data(st.session_state["workspace"]) if result is None: st.info("Abundance data not available. Please run the workflow and configure sample groups first.") st.page_link("content/results_abundance.py", label="Go to Abundance", icon="πŸ“‹") st.stop() -pivot_df, expr_df, group_map = result +pivot_df, group_map = result -top_n = st.slider("Number of proteins", 20, 200, 50, key="heatmap_top_n") +if pivot_df.empty: + st.info("No data available for heatmap.") + st.stop() + +# 2. Compute expr_df directly and derive sample columns internally +# Select only the actual sample columns, excluding metadata fields like ProteinName. +sample_cols = [c for c in pivot_df.columns if c not in ["ProteinName", "PeptideSequence", "log2FC", "p-adj", "stat", "p-value"]] +expr_df = pivot_df.set_index("ProteinName")[sample_cols] +# 3. UI settings (number of top variance proteins) +top_n = st.slider("Number of proteins (Highest Variance)", 20, 200, 50, key="heatmap_top_n") + +# 4. Process data (variance selection -> Z-score normalization) var_series = expr_df.var(axis=1) top_proteins = var_series.sort_values(ascending=False).head(top_n).index heatmap_df = expr_df.loc[top_proteins] + +# Compute Z-scores and clean missing/invalid values heatmap_z = heatmap_df.sub(heatmap_df.mean(axis=1), axis=0).div(heatmap_df.std(axis=1), axis=0) heatmap_z = heatmap_z.replace([np.inf, -np.inf], np.nan).dropna() if not heatmap_z.empty: - row_linkage = linkage(pdist(heatmap_z.values), method="average") - row_order = leaves_list(row_linkage) - - col_linkage = linkage(pdist(heatmap_z.T.values), method="average") - col_order = leaves_list(col_linkage) - - heatmap_clustered = heatmap_z.iloc[row_order, col_order] - - fig_heatmap = px.imshow( - heatmap_clustered, - labels=dict(x="Sample", y="Protein", color="Z-score"), - aspect="auto", - color_continuous_scale=[[0.0, "#3b6fb6"], [0.5, "white"], [1.0, "#b40426"]], - zmin=-3, zmax=3 + # 5. Melt and convert data to Polars to satisfy OpenMS-Insight component requirements + # Restore the ProteinName row index as a column + heatmap_z_reset = heatmap_z.reset_index() + + # Unpivot the wide-format matrix into long-format (X, Y, Intensity) + melted_df = heatmap_z_reset.melt( + id_vars=["ProteinName"], + value_vars=sample_cols, + var_name="Sample", + value_name="Z_score" ) + + # Add sample group mapping if available for heatmap categories + if group_map: + melted_df["Group"] = melted_df["Sample"].map(group_map) + + # Pack the Pandas DataFrame into a Polars LazyFrame + heatmap_pl_lazy = pl.from_pandas(melted_df).lazy() - fig_heatmap.update_layout( - height=700, - xaxis={'side': 'bottom'}, - yaxis={'side': 'left'} + # 6. Initialize the OpenMS-Insight Heatmap component and map attributes + # Component spec: X axis (Sample), Y axis (ProteinName), color intensity (Z_score) + heatmap_component = Heatmap( + cache_id="quantms_protein_heatmap", + x_column="Sample", + y_column="ProteinName", + data=heatmap_pl_lazy, + intensity_column="Z_score", # πŸ”΄ 이 컬럼 수치둜 색상이 μΉ ν•΄μ Έμ•Ό ν•©λ‹ˆλ‹€. + title="Protein Abundance Heatmap (Z-score)", + x_label="Samples", + y_label="Proteins", + colorscale="RdBu", # Red-Blue μŠ€μΌ€μΌ + reversescale=True, + log_scale=False, # Z-scoreλŠ” μŒμˆ˜κ°€ μžˆμœΌλ―€λ‘œ False μœ μ§€ + intensity_label="Z-score", # λ²”λ‘€ 제λͺ©μ„ Z-score둜 μ§€μ • + category_column=None, + min_points=10000, # κ²©μžκ°€ 잘 ν‘œν˜„λ˜λ„λ‘ 점 개수 μƒν•œμ„ λ„‰λ„‰νžˆ μ§€μ • ) - fig_heatmap.update_xaxes(tickfont=dict(size=10)) - fig_heatmap.update_yaxes(tickfont=dict(size=8)) + # 7. Render the component + state_manager = st.session_state.get("state") + heatmap_component(state_manager=state_manager) - st.plotly_chart(fig_heatmap, use_container_width=True) else: st.warning("Insufficient data to generate the heatmap.") @@ -73,4 +105,4 @@ with col1: st.page_link("content/results_volcano.py", label="Volcano Plot", icon="πŸŒ‹") with col2: - st.page_link("content/results_pca.py", label="PCA", icon="πŸ“Š") + st.page_link("content/results_pca.py", label="PCA", icon="πŸ“Š") \ No newline at end of file diff --git a/content/results_pca.py b/content/results_pca.py index 45ea8eb..6f8ebce 100644 --- a/content/results_pca.py +++ b/content/results_pca.py @@ -6,6 +6,7 @@ from sklearn.preprocessing import StandardScaler from src.common.common import page_setup from src.common.results_helpers import get_abundance_data +from openms_insight.components.pca import run_and_plot_pca params = page_setup() st.title("PCA Analysis") @@ -21,67 +22,72 @@ st.warning("Please initialize your workspace first.") st.stop() +# 1. λ³€κ²½λœ get_abundance_data 적용 (λ°˜ν™˜κ°’ 2개: pivot_df, group_map) result = get_abundance_data(st.session_state["workspace"]) if result is None: st.info("Abundance data not available. Please run the workflow and configure sample groups first.") st.page_link("content/results_abundance.py", label="Go to Abundance", icon="πŸ“‹") st.stop() -pivot_df, expr_df, group_map = result +_, group_map = result + +if "statistics_df" not in st.session_state or st.session_state["statistics_df"] is None: + st.info("Statistical analysis data not found. Please run the statistical inference first to obtain p-adj values.") + # st.page_link("content/results_statistical.py", label="Go to Statistical Inference", icon="πŸ“Š") + st.stop() + +target_df = st.session_state["statistics_df"] + +# 2. 이 νŽ˜μ΄μ§€μ—μ„œ 직접 expr_df(λ°œν˜„λŸ‰ 맀트릭슀) κ΅¬μΆ•ν•˜κΈ° +# group_map의 ν‚€(μƒ˜ν”Œλͺ…λ“€)λ₯Ό 컬럼으둜 μ‚¬μš©ν•˜μ—¬ λ°œν˜„λŸ‰ λ°μ΄ν„°λ§Œ μΆ”μΆœν•©λ‹ˆλ‹€. +sample_columns = list(group_map.keys()) + +# pivot_df에 λ‹¨λ°±μ§ˆ μ‹λ³„μž(예: ProteinName)와 μƒ˜ν”Œ μ»¬λŸΌλ“€μ΄ ν¬ν•¨λ˜μ–΄ μžˆμ–΄μ•Ό ν•©λ‹ˆλ‹€. +if "ProteinName" in target_df.columns: + expr_df = target_df.set_index("ProteinName")[sample_columns] +elif target_df.index.name == "ProteinName": + expr_df = target_df[sample_columns] +else: + # μ˜ˆμ™Έ λ°©μ§€: ProteinName이 μ»¬λŸΌμ— μ—†κ³  인덱슀 이름도 μ§€μ •λ˜μ§€ μ•Šμ€ 경우 첫 번째 μ»¬λŸΌμ„ 인덱슀둜 κ°€μ • + expr_df = target_df.set_index(target_df.columns[0])[sample_columns] top_n = 500 +# 3. p-value κΈ°μ€€ μƒμœ„ n개 λ‹¨λ°±μ§ˆ 필터링 +# pivot_dfμ—μ„œ μœ μ˜λ―Έν•œ λ‹¨λ°±μ§ˆ 탐색 top_proteins = ( - pivot_df + target_df .dropna(subset=["p-adj"]) .sort_values("p-adj", ascending=True) - .head(top_n)["ProteinName"] + .head(top_n) ) +# λ§Œμ•½ μœ„μ—μ„œ 인덱슀λ₯Ό λ°”κΏ¨λ‹€λ©΄ pivot_df ꡬ쑰에 맞게 λ‹¨λ°±μ§ˆ 이름을 κ°€μ Έμ˜΅λ‹ˆλ‹€. +if "ProteinName" in top_proteins.columns: + top_protein_names = top_proteins["ProteinName"] +else: + top_protein_names = top_proteins.index + expr_df_pca = expr_df.loc[ - expr_df.index.intersection(top_proteins) + expr_df.index.intersection(top_protein_names) ] if expr_df_pca.shape[0] < 2: st.info("Not enough proteins after p-value filtering for PCA.") st.stop() -X = expr_df_pca.T -X_scaled = StandardScaler().fit_transform(X) - -pca = PCA(n_components=2) -pcs = pca.fit_transform(X_scaled) - -pca_df = pd.DataFrame( - pcs, - columns=["PC1", "PC2"], - index=X.index -) - -norm_map = { - k.replace(".mzML", ""): v - for k, v in group_map.items() -} -pca_df["Group"] = pca_df.index.map(norm_map) - -fig_pca = px.scatter( - pca_df, - x="PC1", - y="PC2", - color="Group", - text=pca_df.index, -) - -fig_pca.update_traces(textposition="top center") -fig_pca.update_layout( - xaxis_title=f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}%)", - yaxis_title=f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}%)", - height=600, -) +# 4. OpenMS-Insight λͺ¨λ“ˆ 호좜 파트 +# 이 μ•„λž˜μ˜ μ§€μ €λΆ„ν•œ 계산 및 Plotly μ‹œκ°ν™” μ½”λ“œλ₯Ό μ™ΈλΆ€ λͺ¨λ“ˆλ‘œ μΊ‘μŠν™”ν•˜μ—¬ ν˜ΈμΆœν•©λ‹ˆλ‹€. +try: + # μ •μ˜λœ 뢄석 및 μ‹œκ°ν™” ν•¨μˆ˜ 호좜 + fig_pca, num_proteins = run_and_plot_pca(expr_df_pca, group_map) + + st.plotly_chart(fig_pca, use_container_width=True) + st.markdown(f"**Proteins used:** {num_proteins} (top {top_n} by p-adj)") -st.plotly_chart(fig_pca, use_container_width=True) +except Exception as e: + st.error(f"PCA μ‹œκ°ν™” 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {e}") -st.markdown(f"**Proteins used:** {expr_df_pca.shape[0]} (top {top_n} by p-adj)") st.markdown("---") st.markdown("**Other visualizations:**") @@ -89,4 +95,4 @@ with col1: st.page_link("content/results_volcano.py", label="Volcano Plot", icon="πŸŒ‹") with col2: - st.page_link("content/results_heatmap.py", label="Heatmap", icon="πŸ”₯") + st.page_link("content/results_heatmap.py", label="Heatmap", icon="πŸ”₯") \ No newline at end of file diff --git a/content/results_volcano.py b/content/results_volcano.py index 8502489..1a4a1e9 100644 --- a/content/results_volcano.py +++ b/content/results_volcano.py @@ -1,9 +1,11 @@ """Volcano Plot Results Page.""" import streamlit as st +import polars as pl import plotly.express as px import numpy as np from src.common.common import page_setup from src.common.results_helpers import get_abundance_data +from openms_insight import VolcanoPlot params = page_setup() st.title("Volcano Plot") @@ -19,23 +21,27 @@ st.warning("Please initialize your workspace first.") st.stop() -result = get_abundance_data(st.session_state["workspace"]) -if result is None: - st.info("Abundance data not available. Please run the workflow and configure sample groups first.") - st.page_link("content/results_abundance.py", label="Go to Abundance", icon="πŸ“‹") +# πŸ” 1. Check if statistical analysis results are available in the session state +if "statistics_df" not in st.session_state or st.session_state["statistics_df"] is None: + st.info("Statistical analysis data not found. Please run the statistical engine first.") + # Set the icon link to the actual statistics page file path (e.g. statistical.py). + st.page_link("content/statistical.py", label="Go to Statistical Inference", icon="πŸ”¬") st.stop() -pivot_df, expr_df, group_map = result +# Retrieve the completed statistical analysis DataFrame +statistics_df = st.session_state["statistics_df"] -if pivot_df.empty: +if statistics_df.empty: st.info("No data available for volcano plot.") st.stop() -volcano_df = pivot_df.copy() -volcano_df = volcano_df.dropna(subset=["log2FC", "p-adj"]) - -volcano_df["neg_log10_padj"] = -np.log10(volcano_df["p-adj"]) +# 2. Clean data and convert to Polars for component input +# Drop missing values from the required 'log2FC' and 'p-adj' columns in `statistics_df`. +volcano_df = statistics_df.dropna(subset=["log2FC", "p-adj"]).copy() +# Convert the Pandas DataFrame to a Polars LazyFrame for component injection. +volcano_pl_lazy = pl.from_pandas(volcano_df).lazy() +# 3. Configure UI sliders (changing thresholds does not invalidate cache) fc_thresh = st.slider( "log2 Fold Change threshold", min_value=0.5, @@ -52,49 +58,34 @@ step=0.001, ) -volcano_df["Significance"] = "Not significant" -volcano_df.loc[ - (volcano_df["p-adj"] <= p_thresh) & (volcano_df["log2FC"] >= fc_thresh), - "Significance", -] = "Up-regulated" - -volcano_df.loc[ - (volcano_df["p-adj"] <= p_thresh) & (volcano_df["log2FC"] <= -fc_thresh), - "Significance", -] = "Down-regulated" - -fig_volcano = px.scatter( - volcano_df, - x="log2FC", - y="neg_log10_padj", - color="Significance", - hover_data=["ProteinName", "log2FC", "p-value", "p-adj"], - color_discrete_map={ - "Up-regulated": "red", - "Down-regulated": "blue", - "Not significant": "lightgrey", - } +# 4. Initialize the OpenMS-Insight VolcanoPlot component +volcano_plot_component = VolcanoPlot( + cache_id="quantms_volcano_plot", + data=volcano_pl_lazy, + log2fc_column="log2FC", + pvalue_column="p-adj", + label_column="ProteinName", + up_color="#E74C3C", + down_color="#3498DB", + ns_color="#95A5A6", + show_threshold_lines=True, + threshold_line_style="dash", ) -fig_volcano.add_vline(x=fc_thresh, line_dash="dash") -fig_volcano.add_vline(x=-fc_thresh, line_dash="dash") -fig_volcano.add_hline(y=-np.log10(p_thresh), line_dash="dash") +# 5. Render the component +state_manager = st.session_state.get("state") # Inject the project state management object -# Make x-axis symmetric around zero -max_abs_fc = volcano_df["log2FC"].abs().max() -x_range = [-max_abs_fc * 1.1, max_abs_fc * 1.1] # 10% padding - -fig_volcano.update_layout( - xaxis_title="log2 Fold Change", - yaxis_title="-log10(p-adj)", - xaxis_range=x_range, +volcano_plot_component( + state_manager=state_manager, + fc_threshold=fc_thresh, + p_threshold=p_thresh, + max_labels=10, # Display labels for the top N significant proteins height=600, ) -st.plotly_chart(fig_volcano, use_container_width=True) - -up_count = (volcano_df["Significance"] == "Up-regulated").sum() -down_count = (volcano_df["Significance"] == "Down-regulated").sum() +# 6. Keep the existing statistical summary and bottom links +up_count = ((volcano_df["p-adj"] <= p_thresh) & (volcano_df["log2FC"] >= fc_thresh)).sum() +down_count = ((volcano_df["p-adj"] <= p_thresh) & (volcano_df["log2FC"] <= -fc_thresh)).sum() st.markdown(f"**Up-regulated:** {up_count} | **Down-regulated:** {down_count}") st.markdown("---") @@ -103,4 +94,4 @@ with col1: st.page_link("content/results_pca.py", label="PCA", icon="πŸ“Š") with col2: - st.page_link("content/results_heatmap.py", label="Heatmap", icon="πŸ”₯") + st.page_link("content/results_heatmap.py", label="Heatmap", icon="πŸ”₯") \ No newline at end of file diff --git a/content/statistical.py b/content/statistical.py new file mode 100644 index 0000000..97ef7ff --- /dev/null +++ b/content/statistical.py @@ -0,0 +1,163 @@ +"""Statistical Inference Page.""" + +from pathlib import Path +import pandas as pd +import polars as pl +import streamlit as st +from src.common.common import page_setup +from src.common.results_helpers import get_abundance_data +# Import statistics engine functions from openms_insight +from openms_insight.analysis.statistics import calculate_statistical_tests, adjust_fdr_lazy + +params = page_setup() +st.title("Statistical Inference") + +st.markdown( + """ +Run differential expression analysis to identify statistically significant proteins across your biological groups. +""" +) + +if "workspace" not in st.session_state: + st.warning("Please initialize your workspace first.") + st.stop() + +# Load primary database assets +result = get_abundance_data(st.session_state["workspace"]) +if result is None: + st.info( + "Abundance data not available. Please run the workflow and configure sample groups first." + ) + st.page_link( + "content/results_abundance.py", label="Go to Abundance", icon="πŸ“‹" + ) + st.stop() + +pivot_df, group_map = result + +# --- STEP 1: Upstream Pipeline Tracker (Fallback Architecture) --- +if ( + "normalized_df" in st.session_state + and st.session_state["normalized_df"] is not None +): + base_df = st.session_state["normalized_df"] + st.info( + "πŸ”„ **Upstream Pipeline Detected**: Using data processed from the **Normalization** step." + ) +elif ( + "imputed_df" in st.session_state + and st.session_state["imputed_df"] is not None +): + base_df = st.session_state["imputed_df"] + st.warning( + "⚠️ **Normalization Skipped**: Using data processed from the **Imputation** step." + ) +elif ( + "filtered_df" in st.session_state + and st.session_state["filtered_df"] is not None +): + base_df = st.session_state["filtered_df"] + st.warning( + "⚠️ **Preprocessing Skipped**: Using data processed from the **Filtering** step." + ) +else: + base_df = pivot_df + st.warning( + "⚠️ **Raw Input Active**: No preprocessing history found. Operating on the original table." + ) + +# 2. Extract actual active sample columns and detect unique biological groups +sample_cols = [ + c for c in base_df.columns if c not in ["ProteinName", "PeptideSequence"] +] +unique_groups = sorted(list(set([group_map[s] for s in sample_cols]))) +group_count = len(unique_groups) + +# --- SECTION 1: Active Input Table Preview --- +st.subheader("Input Table Overview") +st.markdown( + f"Currently analyzing **{base_df.shape[0]}** rows across **{len(sample_cols)}** samples belonging to **{group_count} groups** ({', '.join(unique_groups)})." +) +st.dataframe(base_df, use_container_width=True) + +st.markdown("---") + +# --- SECTION 2: Dynamic Statistical Parameter Configuration --- +st.subheader("Configure Statistical Engine") + +# Prepare structural Polars metadata DataFrame required by backend functions +metadata_rows = [{"sample_id": s, "group": group_map[s]} for s in sample_cols] +metadata_pl = pl.DataFrame( + metadata_rows, schema={"sample_id": pl.String, "group": pl.String} +) + +col1, col2 = st.columns(2) + +with col1: + st.markdown("### πŸ”¬ 1. Hypothesis Testing Method") + + # Route available method options dynamically based on the group count + if group_count == 2: + method_options = ["limma_like", "welch", "paired"] + help_text = "'limma_like' uses Empirical Bayes variance shrinking. 'welch' is for unequal variances. 'paired' is for dependent samples." + elif group_count >= 3: + method_options = ["limma_like", "anova"] + help_text = "'limma_like' supports multi-group design matrices. 'anova' computes standard row-wise One-way ANOVA." + else: + st.error("❌ Statistical testing requires at least 2 unique sample groups.") + st.stop() + + selected_method = st.selectbox( + "Select Statistical Test", + options=method_options, + index=0, + help=help_text + ) + +with col2: + st.markdown("### πŸ›‘οΈ 2. Multiple Testing Correction (FDR)") + selected_fdr = st.selectbox( + "Select FDR Adjustment Strategy", + options=["BH", "Bonferroni", "None"], + index=0, + help="'BH' (Benjamini-Hochberg) controls False Discovery Rate. 'Bonferroni' is strict Family-Wise Error Rate control." + ) + +# --- SECTION 3: Statistical Query Execution --- +st.markdown("
", unsafe_allow_html=True) +if st.button("Run Statistical Analysis", type="primary"): + + # Convert active pandas dataframe into polars lazyframe graph + stats_lazy = pl.from_pandas(base_df).lazy() + + try: + # Execute Chain 1: Calculate core statistics (Adds log2FC, stat, p-value) + stats_lazy = calculate_statistical_tests( + quantification_data=stats_lazy, + metadata=metadata_pl, + method=selected_method + ) + + # Execute Chain 2: Adjust Multiple Testing (Adds p-adj) + stats_lazy = adjust_fdr_lazy( + quantification_data=stats_lazy, + strategy=selected_fdr + ) + + # Resolve lazy graph optimization tree and bring back to pandas memory + statistics_df = stats_lazy.collect().to_pandas() + + # πŸ’Ύ Save processing checkpoint inside Session State for Downstream (e.g., Volcano plot, Volcano/Heatmap UI) + st.session_state["statistics_df"] = statistics_df + + st.success(f"Successfully calculated **{selected_method}** test with **{selected_fdr}** FDR correction!") + + # Display the finalized statistics table view + st.subheader("Statistical Analysis Results") + st.markdown(f"Generated framework containing columns: `ProteinName`, `log2FC`, `stat`, `p-value`, `p-adj`, `PeptideSequence`") + st.dataframe(statistics_df, use_container_width=True) + + except ValueError as val_err: + st.error(f"Engine Validation Fallure: {str(val_err)}") + except Exception as e: + st.error(f"An unexpected pipeline error occurred: {str(e)}") \ No newline at end of file diff --git a/quantms_protein_heatmap/manifest.json b/quantms_protein_heatmap/manifest.json new file mode 100644 index 0000000..c896f9f --- /dev/null +++ b/quantms_protein_heatmap/manifest.json @@ -0,0 +1,49 @@ +{ + "version": 3, + "component_type": "heatmap", + "created_at": "2026-06-25T14:51:08.657547", + "config_hash": "17538bbd7c8bb0b83ef303c424c0f51e7dccd7b704b8b4d6f2f9b91555db0343", + "config": { + "x_column": "Sample", + "y_column": "ProteinName", + "intensity_column": "Z_score", + "min_points": 10000, + "display_aspect_ratio": 1.7777777777777777, + "x_bins": 188, + "y_bins": 106, + "use_simple_downsample": false, + "use_streaming": true, + "categorical_filters": [], + "zoom_identifier": "heatmap_zoom", + "title": "Protein Abundance Heatmap (Z-score)", + "x_label": "Samples", + "y_label": "Proteins", + "colorscale": "RdBu", + "category_column": null, + "log_scale": false, + "low_values_on_top": false, + "intensity_label": "Z-score" + }, + "filters": {}, + "filter_defaults": {}, + "interactivity": {}, + "data_files": { + "level_0": "level_0.parquet", + "level_1": "level_1.parquet" + }, + "data_values": { + "x_range": [ + "01_1", + "10_3" + ], + "y_range": [ + "sp|Biognosys|iRT-Kit_WR_fusion", + "sp|Q12496|YO098_YEAST" + ], + "total": 300, + "level_sizes": [ + 300 + ], + "num_levels": 2 + } +} \ No newline at end of file diff --git a/quantms_protein_heatmap/preprocessed/level_0.parquet b/quantms_protein_heatmap/preprocessed/level_0.parquet new file mode 100644 index 0000000..845428e Binary files /dev/null and b/quantms_protein_heatmap/preprocessed/level_0.parquet differ diff --git a/quantms_protein_heatmap/preprocessed/level_1.parquet b/quantms_protein_heatmap/preprocessed/level_1.parquet new file mode 100644 index 0000000..845428e Binary files /dev/null and b/quantms_protein_heatmap/preprocessed/level_1.parquet differ diff --git a/quantms_volcano_plot/manifest.json b/quantms_volcano_plot/manifest.json new file mode 100644 index 0000000..d572f61 --- /dev/null +++ b/quantms_volcano_plot/manifest.json @@ -0,0 +1,26 @@ +{ + "version": 3, + "component_type": "volcanoplot", + "created_at": "2026-06-25T14:41:42.791178", + "config_hash": "b06bb8d4c9dc289910868a8f1d6fdb18e1deb7feaa21962787f7d334e30f5608", + "config": { + "log2fc_column": "log2FC", + "pvalue_column": "p-adj", + "label_column": "ProteinName", + "title": null, + "x_label": "log2 Fold Change", + "y_label": "-log10(p-value)", + "up_color": "#E74C3C", + "down_color": "#3498DB", + "ns_color": "#95A5A6", + "show_threshold_lines": true, + "threshold_line_style": "dash" + }, + "filters": {}, + "filter_defaults": {}, + "interactivity": {}, + "data_files": { + "volcanoData": "volcanoData.parquet" + }, + "data_values": {} +} \ No newline at end of file diff --git a/quantms_volcano_plot/preprocessed/volcanoData.parquet b/quantms_volcano_plot/preprocessed/volcanoData.parquet new file mode 100644 index 0000000..6e2d7a2 Binary files /dev/null and b/quantms_volcano_plot/preprocessed/volcanoData.parquet differ diff --git a/requirements.txt b/requirements.txt index aac2879..a6b90e5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -149,4 +149,5 @@ mygene # Redis Queue dependencies (for online mode) redis>=5.0.0 rq>=1.16.0 -statsmodels \ No newline at end of file +statsmodels +polars \ No newline at end of file diff --git a/src/common/results_helpers.py b/src/common/results_helpers.py index db3e103..d28fe57 100644 --- a/src/common/results_helpers.py +++ b/src/common/results_helpers.py @@ -9,6 +9,7 @@ from pyopenms import IdXMLFile, MSExperiment, MzMLFile from src.workflow.ParameterManager import ParameterManager from statsmodels.stats.multitest import multipletests +from openms_insight.analysis.statistics import calculate_statistical_tests, adjust_fdr_lazy def get_workflow_dir(workspace): """Get the workflow directory path.""" @@ -184,15 +185,14 @@ def build_spectra_cache(mzml_dir: Path, filename_to_index: dict) -> tuple[pl.Dat @st.cache_data -def load_abundance_data(workspace_path: str, csv_mtime: float) -> tuple | None: - """Load CSV, compute stats (log2FC, p-value), build pivot_df and expr_df. - - Args: - workspace_path: Path to the workspace directory - csv_mtime: Modification time of CSV file (used as cache key) - - Returns: - Tuple of (pivot_df, expr_df, group_map) or None if data unavailable +def load_abundance_data( + workspace_path: str, + csv_mtime: float, +) -> tuple | None: + """Load a long-format CSV, pivot it into a standard wide-format table + + with sample intensity columns, and return the table along with the group + mapping. """ workflow_dir = get_workflow_dir(Path(workspace_path)) quant_dir = workflow_dir / "results" / "quant_results" @@ -214,7 +214,7 @@ def load_abundance_data(workspace_path: str, csv_mtime: float) -> tuple | None: if df.empty: return None - # Get group mapping from parameters + # 1. Extract group mapping information from the parameter JSON param_manager = ParameterManager(workflow_dir) params = param_manager.get_parameters_from_json() group_map = { @@ -223,67 +223,41 @@ def load_abundance_data(workspace_path: str, csv_mtime: float) -> tuple | None: if key.startswith("mzML-group-") and value } + st.write(f"group_map: {group_map}") + if not group_map: return None + # 2. Extract sample names and map to groups df["Sample"] = df["Reference"].str.replace(".mzML", "", regex=False) df["Group"] = df["Reference"].map(group_map) df = df.dropna(subset=["Group"]) groups = sorted(df["Group"].unique()) - if len(groups) < 2: return None - group1, group2 = groups[:2] - - # Compute statistics per protein - stats_rows = [] - for protein, protein_df in df.groupby("ProteinName"): - g1_vals = protein_df[protein_df["Group"] == group1]["Intensity"].values - g2_vals = protein_df[protein_df["Group"] == group2]["Intensity"].values - - if len(g1_vals) < 2 or len(g2_vals) < 2: - pval = np.nan - else: - _, pval = ttest_ind(g1_vals, g2_vals, equal_var=False) - - mean_g1 = np.mean(g1_vals) if len(g1_vals) > 0 else np.nan - mean_g2 = np.mean(g2_vals) if len(g2_vals) > 0 else np.nan - - log2fc = np.log2(mean_g2 / mean_g1) if mean_g1 > 0 else np.nan - - stats_rows.append({ - "ProteinName": protein, - "log2FC": log2fc, - "p-value": pval, - }) - - stats_df = pd.DataFrame(stats_rows) - - if not stats_df.empty: - mask = stats_df["p-value"].notna() - if mask.any(): - _, p_adj, _, _ = multipletests(stats_df.loc[mask, "p-value"], method="fdr_bh") - stats_df.loc[mask, "p-adj"] = p_adj - else: - stats_df["p-adj"] = np.nan - - # Order samples by group (group2 first, then group1) + # 3. Define ordering and build sample arrays sample_group_df = df[["Sample", "Group"]].drop_duplicates() - group2_samples = sample_group_df[sample_group_df["Group"] == group2]["Sample"].tolist() - group1_samples = sample_group_df[sample_group_df["Group"] == group1]["Sample"].tolist() - all_samples = group2_samples + group1_samples - - # Build pivot table + group1_samples = sample_group_df[sample_group_df["Group"] == groups[0]][ + "Sample" + ].tolist() + group2_samples = sample_group_df[sample_group_df["Group"] == groups[1]][ + "Sample" + ].tolist() + all_samples = group1_samples + group2_samples + + # 4. Convert from long to wide format (Pivot) and fill missing values with 0.0 pivot_list = [] for protein, group_df in df.groupby("ProteinName"): peptides = ";".join(group_df["PeptideSequence"].unique()) intensity_dict = group_df.groupby("Sample")["Intensity"].sum().to_dict() + + # Fill sample columns (use 0.0 if missing) intensity_dict_complete = { - sample: intensity_dict.get(sample, 0) - for sample in all_samples + sample: intensity_dict.get(sample, 0.0) for sample in all_samples } + row = { "ProteinName": protein, **intensity_dict_complete, @@ -292,16 +266,20 @@ def load_abundance_data(workspace_path: str, csv_mtime: float) -> tuple | None: pivot_list.append(row) pivot_df = pd.DataFrame(pivot_list) - pivot_df = pivot_df.merge(stats_df, on="ProteinName", how="left") - pivot_df = pivot_df[["ProteinName", "log2FC", "p-value", "p-adj"] + all_samples + ["PeptideSequence"]] - # Build expression matrix (log2-transformed) - expr_df = pivot_df.set_index("ProteinName")[all_samples] - expr_df = expr_df.replace(0, np.nan) - expr_df = np.log2(expr_df + 1) - expr_df = expr_df.dropna() + # 5. Reorder columns to match the required standard format + # Structure: [ProteinName, Sample_1, Sample_2, ..., PeptideSequence] + columns_order = ["ProteinName"] + all_samples + ["PeptideSequence"] + pivot_df = pivot_df[columns_order] + + # 6. Clean up the group_map keys right before returning to the caller + clean_group_map = {} + for k, v in group_map.items(): + clean_key = k[:-5] if k.endswith(".mzML") else k + clean_group_map[clean_key] = v - return pivot_df, expr_df, group_map + # Return final results with the clean group map + return pivot_df, clean_group_map def get_abundance_data(workspace: Path) -> tuple | None: @@ -324,4 +302,4 @@ def get_abundance_data(workspace: Path) -> tuple | None: return None csv_mtime = csv_files[0].stat().st_mtime - return load_abundance_data(str(workspace), csv_mtime) + return load_abundance_data(str(workspace), csv_mtime) \ No newline at end of file