diff --git a/app.py b/app.py
index 194d857..e5e4d0e 100644
--- a/app.py
+++ b/app.py
@@ -23,12 +23,18 @@
st.Page(Path("content", "results_rescoring.py"), title="Rescoring", icon="π"),
st.Page(Path("content", "results_filtered.py"), title="Filtered PSMs", icon="π―"),
st.Page(Path("content", "results_abundance.py"), title="Abundance", icon="π"),
+ st.Page(Path("content", "results_library.py"), title="Spectral Library", icon="π"),
+ st.Page(Path("content", "enrichment.py"), title="Pathway Analysis", icon="π"),
+ ],
+ "Differential Protein Analysis": [
+ st.Page(Path("content", "filtering.py"), title="Filtering", icon="π§Ή"),
+ st.Page(Path("content", "imputation.py"), title="Imputation", icon="π©Ή"),
+ st.Page(Path("content", "normalization.py"), title="Normalization", icon="βοΈ"),
+ st.Page(Path("content", "statistical.py"), title="Statistical", icon="π’"),
st.Page(Path("content", "results_volcano.py"), title="Volcano", icon="π"),
st.Page(Path("content", "results_pca.py"), title="PCA", icon="π"),
st.Page(Path("content", "results_heatmap.py"), title="Heatmap", icon="π₯"),
- st.Page(Path("content", "results_library.py"), title="Spectral Library", icon="π"),
- st.Page(Path("content", "results_proteomicslfq.py"), title="Proteomics LFQ", icon="π§ͺ"),
- ],
+ ]
}
pg = st.navigation(pages)
diff --git a/content/enrichment.py b/content/enrichment.py
new file mode 100644
index 0000000..d8c5f51
--- /dev/null
+++ b/content/enrichment.py
@@ -0,0 +1,140 @@
+"""Pathway Analysis Page."""
+
+from pathlib import Path
+import pandas as pd
+import polars as pl
+import streamlit as st
+from src.common.common import page_setup
+from src.common.results_helpers import get_abundance_data
+# Import GO Enrichment modules from openms_insight engine
+from openms_insight.analysis.enrichment import calculate_go_enrichment
+
+params = page_setup()
+st.title("GO Enrichment Analysis")
+
+st.markdown(
+ """
+Identify overrepresented biological themes (BP, CC, MF) within your differentially expressed protein features using MyGene.info and Fisher's Exact Test.
+"""
+)
+
+if "workspace" not in st.session_state:
+ st.warning("Please initialize your workspace first.")
+ st.stop()
+
+# --- STEP 1: Upstream Statistics Checkpoint ---
+if (
+ "statistics_df" in st.session_state
+ and st.session_state["statistics_df"] is not None
+):
+ final_statistics_report = st.session_state["statistics_df"]
+ st.info(
+ "π **Upstream Pipeline Detected**: Using analyzed matrices from the **Statistical Inference** step."
+ )
+else:
+    st.warning(
+        "⚠️ **Missing Prerequisites**: Statistical inference data not detected. Please run hypothesis testing first."
+    )
+    st.page_link(  # target must be a page registered in app.py's st.navigation()
+        "content/statistical.py", label="Go to Statistical Inference", icon="🔬"
+    )
+    st.stop()  # nothing below can run without the statistics table
+
+# --- STEP 2: Preprocessing Mapping Key Configuration ---
+# Identify target identifier columns dynamically
+id_col = "ProteinName"
+if id_col not in final_statistics_report.columns:
+ st.error(f"β Structural Error: Column '{id_col}' is missing from the active matrix context.")
+ st.stop()
+
+# --- SECTION 1: Parameter Setup & Dynamic Cutoff Labels ---
+st.subheader("Configure Enrichment Thresholds")
+
+# Check if target p-value should be adjusted or raw based on previous selections (Fallback safely to 'p-adj')
+target_p_col = "p-adj" if "p-adj" in final_statistics_report.columns else "p-value"
+p_label = (
+ "Adjusted P-value (p-adj) Cutoff"
+ if target_p_col == "p-adj"
+ else "Raw P-value (p-value) Cutoff"
+)
+
+ui_go_col1, ui_go_col2 = st.columns(2)
+
+with ui_go_col1:
+ p_cutoff = st.number_input(
+ f"π¬ {p_label}",
+ min_value=0.0001,
+ max_value=1.0,
+ value=0.05,
+ step=0.01,
+ format="%.4f",
+ help="Proteins with significance metrics below this value are mapped to the foreground cohort.",
+ )
+
+with ui_go_col2:
+ fc_cutoff = st.number_input(
+ "π Absolute Difference Cutoff (|log2FC|)",
+ min_value=0.0,
+ max_value=10.0,
+ value=1.0,
+ step=0.1,
+ format="%.2f",
+ help="Proteins with absolute log2 fold change greater than or equal to this threshold will be selected.",
+ )
+
+# --- SECTION 2: Execution and Interactive View Charts ---
+st.markdown("<br>", unsafe_allow_html=True)  # vertical spacer above the run button
+if st.button("π Run GO Enrichment Analysis", type="primary", key="run_go_analysis"):
+
+ with st.spinner("Querying MyGene.info API & executing hyper-geometric calculation loops..."):
+ # Convert internal pandas DataFrame to openms_insight Polars DataFrame expectation
+ stats_pl = pl.from_pandas(final_statistics_report)
+
+ status, output = calculate_go_enrichment(
+ final_report=stats_pl,
+ id_col=id_col,
+ target_p_col=target_p_col,
+ p_cutoff=p_cutoff,
+ fc_cutoff=fc_cutoff,
+ )
+
+ # Route response structures based on analysis output status code
+ if status == "empty_data":
+ st.error("β No valid statistical rows found containing standard columns to run GO alignment.")
+
+ elif status == "insufficient_proteins":
+ st.warning(
+ f"β οΈ Not enough significant proteins found to construct target datasets. "
+ f"(Criteria: {target_p_col} < {p_cutoff:.4f}, |log2FC| β₯ {fc_cutoff:.2f})."
+ )
+ st.info(f"π‘ Found significant proteins count: **{output}**. Try relaxing your p-value or log2FC filters.")
+
+ elif status == "success":
+ st.success("β GO Enrichment Analysis completed successfully!")
+
+ # Display operational matrix scale
+ st.markdown(
+ f"π **Analysis Profile Scope**: Mapped **{output['fg_count']}** significant foreground profiles out of **{output['bg_count']}** reference background items."
+ )
+
+ # Build multi-tab interface layer for ontology subcategories
+ tabs = st.tabs([
+ "𧬠Biological Process (BP)",
+ "π¬ Cellular Component (CC)",
+ "π§ͺ Molecular Function (MF)"
+ ])
+ categories_data = output["categories"]
+
+ for idx, go_type in enumerate(["BP", "CC", "MF"]):
+ with tabs[idx]:
+ fig = categories_data[go_type]["fig"]
+ df_go = categories_data[go_type]["df"]
+
+ if fig is not None and df_go is not None:
+ # Render plotly bar figures generated straight from backend engine
+ st.plotly_chart(fig, use_container_width=True)
+
+ st.subheader(f"π {go_type} Results Dataframe")
+ st.dataframe(df_go, use_container_width=True)
+ else:
+ st.info(f"No statistically overrepresented terms identified for Category: **{go_type}**")
\ No newline at end of file
diff --git a/content/filtering.py b/content/filtering.py
new file mode 100644
index 0000000..402c29b
--- /dev/null
+++ b/content/filtering.py
@@ -0,0 +1,163 @@
+"""Filtering Page."""
+
+from pathlib import Path
+import pandas as pd
+import polars as pl
+import streamlit as st
+from src.common.common import page_setup
+from src.common.results_helpers import get_abundance_data
+
+# Import filtering functions from openms_insight package
+from openms_insight.analysis.filter import (
+ filter_low_abundance,
+ filter_low_repeatability,
+ filter_low_variance,
+)
+
+params = page_setup()
+st.title("Data Filtering")
+
+st.markdown(
+ """
+Filter out low-quality proteins from your dataset based on abundance, repeatability, or variance thresholds.
+"""
+)
+
+if "workspace" not in st.session_state:
+ st.warning("Please initialize your workspace first.")
+ st.stop()
+
+result = get_abundance_data(st.session_state["workspace"])
+if result is None:
+ st.info(
+ "Abundance data not available. Please run the workflow and configure sample groups first."
+ )
+ st.page_link(
+ "content/results_abundance.py", label="Go to Abundance", icon="π"
+ )
+ st.stop()
+
+pivot_df, group_map = result
+
+# 1. Identify actual sample columns dynamically
+sample_cols = [
+ c
+ for c in pivot_df.columns
+ if c not in ["ProteinName", "PeptideSequence"]
+]
+
+# --- SECTION 1: Original Data View ---
+st.subheader("Original Abundance Table")
+st.markdown(
+ f"Currently displaying **{pivot_df.shape[0]}** proteins and **{len(sample_cols)}** samples before filtering."
+)
+st.dataframe(pivot_df, use_container_width=True)
+
+st.markdown("---")
+
+# --- SECTION 2: Filter Configuration ---
+st.subheader("Configure Filter Engine")
+
+# Prepare Polars Metadata DataFrame required by openms_insight functions
+metadata_rows = [{"sample_id": s, "group": group_map[s]} for s in sample_cols]
+metadata_pl = pl.DataFrame(
+ metadata_rows, schema={"sample_id": pl.String, "group": pl.String}
+)
+
+# User selection for filtering strategy
+filter_method = st.selectbox(
+ "Select Filtering Method",
+ options=["Low Abundance", "Low Repeatability", "Low Variance"],
+ index=0,
+ help="Choose the statistical criteria to prune unreliable protein entries.",
+)
+
+# Render threshold sliders dynamically based on the selected filter method
+if filter_method == "Low Abundance":
+ st.markdown(
+ "**Low Abundance Filter**: Keeps rows where at least one group's median is above the selected percentile threshold."
+ )
+ threshold = st.slider(
+ "Threshold Percentile (%)",
+ min_value=0.0,
+ max_value=100.0,
+ value=10.0,
+ step=5.0,
+ )
+
+elif filter_method == "Low Repeatability":
+    st.markdown(
+        "**Low Repeatability Filter**: Keeps rows where at least one group has a missing value ratio within the allowed maximum."
+    )
+    threshold = st.slider(
+        "Max Missing Ratio (%)",  # entered in percent; divided by 100 when the filter is applied
+        min_value=0.0,
+        max_value=100.0,
+        value=50.0,
+        step=5.0,
+        help="Allowed missing value (zero or null) percentage per group (0-100).",
+    )
+
+elif filter_method == "Low Variance":
+ st.markdown(
+ "**Low Variance Filter**: Keeps rows where at least one group's variance is above the selected percentile threshold."
+ )
+ threshold = st.slider(
+ "Threshold Percentile (%)",
+ min_value=0.0,
+ max_value=100.0,
+ value=10.0,
+ step=5.0,
+ )
+
+# --- SECTION 3: Filter Execution and Collected Results View ---
+if st.button("Apply Filter", type="primary"):
+ # Convert the original Pandas DataFrame into a Polars LazyFrame graph
+ quant_lazy = pl.from_pandas(pivot_df).lazy()
+
+ # Route execution to the chosen openms_insight engine function
+ if filter_method == "Low Abundance":
+ filtered_lazy = filter_low_abundance(
+ quantification_data=quant_lazy,
+ metadata=metadata_pl,
+ group_column="group",
+ threshold_percentile=threshold,
+ )
+ elif filter_method == "Low Repeatability":
+ # Convert percent slider input to ratio expected by the function (e.g., 50.0% -> 0.5)
+ filtered_lazy = filter_low_repeatability(
+ quantification_data=quant_lazy,
+ metadata=metadata_pl,
+ group_column="group",
+ max_missing_ratio=threshold / 100.0,
+ )
+ elif filter_method == "Low Variance":
+ filtered_lazy = filter_low_variance(
+ quantification_data=quant_lazy,
+ metadata=metadata_pl,
+ group_column="group",
+ threshold_percentile=threshold,
+ )
+
+ # Collect the evaluated lazy graph and convert back to Pandas for visualization
+ filtered_df = filtered_lazy.collect().to_pandas()
+ st.session_state["filtered_df"] = filtered_df
+
+ # Layout response metrics and the filtered matrix
+ st.success(f"Successfully applied **{filter_method}** filter!")
+
+ # Display dataset scale compression stats
+ col1, col2, col3 = st.columns(3)
+ col1.metric("Original Proteins", pivot_df.shape[0])
+ col2.metric("Filtered Proteins", filtered_df.shape[0])
+ col3.metric(
+ "Removed Proteins", pivot_df.shape[0] - filtered_df.shape[0], delta=None
+ )
+
+ st.subheader("Filtered Abundance Table")
+ if filtered_df.empty:
+ st.warning(
+ "The filtered table is empty. Try relaxing the threshold constraints."
+ )
+ else:
+ st.dataframe(filtered_df, use_container_width=True)
\ No newline at end of file
diff --git a/content/imputation.py b/content/imputation.py
new file mode 100644
index 0000000..1cbbb5e
--- /dev/null
+++ b/content/imputation.py
@@ -0,0 +1,134 @@
+"""Imputation Page."""
+
+from pathlib import Path
+import pandas as pd
+import polars as pl
+import streamlit as st
+from src.common.common import page_setup
+from src.common.results_helpers import get_abundance_data
+
+# Import imputation algorithms from openms_insight engine
+from openms_insight.analysis.imputation import impute_mar, impute_smallest_value
+
+params = page_setup()
+st.title("Missing Value Imputation")
+
+st.markdown(
+ """
+Handle missing values (zeros or nulls) in your quantification matrix using biological group-aware (MAR) or absolute lowest limit (MNAR) techniques.
+"""
+)
+
+if "workspace" not in st.session_state:
+ st.warning("Please initialize your workspace first.")
+ st.stop()
+
+# Load the abundance pivot table and the sample-to-group mapping
+result = get_abundance_data(st.session_state["workspace"])
+if result is None:
+ st.info(
+ "Abundance data not available. Please run the workflow and configure sample groups first."
+ )
+ st.page_link(
+ "content/results_abundance.py", label="Go to Abundance", icon="π"
+ )
+ st.stop()
+
+pivot_df, group_map = result
+
+# 1. Pipeline Checkpoint: Fetch upstream filtered data if available, fallback to raw pivot matrix
+if "filtered_df" in st.session_state and st.session_state["filtered_df"] is not None:
+ base_df = st.session_state["filtered_df"]
+ st.info(
+ "π **Upstream Pipeline Detected**: Using data processed from the **Filtering** step."
+ )
+else:
+ base_df = pivot_df
+ st.warning(
+ "β οΈ **Raw Input Active**: No filtering history found. Operating on the original unfiltered table."
+ )
+
+# 2. Identify actual sample columns dynamically based on the current active matrix
+sample_cols = [
+ c for c in base_df.columns if c not in ["ProteinName", "PeptideSequence"]
+]
+
+# --- SECTION 1: Input Matrix Summary ---
+st.subheader("Input Matrix Overview")
+st.markdown(
+ f"Currently analyzing **{base_df.shape[0]}** rows across **{len(sample_cols)}** samples before imputation."
+)
+st.dataframe(base_df, use_container_width=True)
+
+st.markdown("---")
+
+# --- SECTION 2: Imputation Configuration ---
+st.subheader("Configure Imputation Engine")
+
+# Build Polars structural metadata DataFrame
+metadata_rows = [{"sample_id": s, "group": group_map[s]} for s in sample_cols]
+metadata_pl = pl.DataFrame(
+ metadata_rows, schema={"sample_id": pl.String, "group": pl.String}
+)
+
+# User selection for core missingness assumption strategy
+impute_category = st.selectbox(
+ "Select Imputation Class",
+ options=["MAR (Missing At Random)", "MNAR (Missing Not At Random)"],
+ index=0,
+ help="MAR uses group metrics (Mean/Median). MNAR shifts values below the limit of detection.",
+)
+
+# Render algorithmic options sub-menus based on the parent selection
+if impute_category == "MAR (Missing At Random)":
+ st.markdown(
+ "**Group Character Imputation**: Fills missing metrics leveraging sample properties belonging to the same group."
+ )
+ strategy_opt = st.radio(
+ "Mathematical Strategy",
+ options=["median", "mean"],
+ index=0,
+ horizontal=True,
+ )
+
+elif impute_category == "MNAR (Missing Not At Random)":
+ st.markdown(
+ "**Smallest Value Imputation**: Replaces missing items with the minimum values detected to reflect technical dropout limits."
+ )
+ scope_opt = st.radio(
+ "Detection Minimum Scope",
+ options=["row", "global"],
+ index=0,
+ horizontal=True,
+ help="'row' targets current protein minimum; 'global' searches the entire mass spectrometry matrix profile.",
+ )
+
+# --- SECTION 3: Imputation Execution ---
+if st.button("Apply Imputation", type="primary"):
+ # Initialize optimization pipeline graph via lazy loading conversion
+ quant_lazy = pl.from_pandas(base_df).lazy()
+
+ # Route configuration matrix parameters to designated engine function channels
+ if impute_category == "MAR (Missing At Random)":
+ imputed_lazy = impute_mar(
+ quantification_data=quant_lazy,
+ metadata=metadata_pl,
+ group_column="group",
+ strategy=strategy_opt,
+ )
+ elif impute_category == "MNAR (Missing Not At Random)":
+ imputed_lazy = impute_smallest_value(
+ quantification_data=quant_lazy, metadata=metadata_pl, scope=scope_opt
+ )
+
+ # Resolve lazy graph optimization tree and push to display data frame structure
+ imputed_df = imputed_lazy.collect().to_pandas()
+
+ # Save the imputed table in session state for downstream steps (Normalization, Statistics)
+ st.session_state["imputed_df"] = imputed_df
+
+ st.success(f"Successfully finalized **{impute_category}** imputation step!")
+
+ # Show the imputed table
+ st.subheader("Imputed Result Table")
+ st.dataframe(imputed_df, use_container_width=True)
\ No newline at end of file
diff --git a/content/normalization.py b/content/normalization.py
new file mode 100644
index 0000000..a064598
--- /dev/null
+++ b/content/normalization.py
@@ -0,0 +1,182 @@
+"""Normalization Page."""
+
+from pathlib import Path
+import pandas as pd
+import polars as pl
+import streamlit as st
+from src.common.common import page_setup
+from src.common.results_helpers import get_abundance_data
+# Import normalization engine functions from openms_insight
+from openms_insight.analysis.normalization import (
+ normalize_samples,
+ scale_data,
+ transform_data,
+)
+
+params = page_setup()
+st.title("Data Normalization & Scaling")
+
+st.markdown(
+ """
+Standardize and transform your protein abundance profiles to correct for technical variations and optimize statistical distributions.
+"""
+)
+
+if "workspace" not in st.session_state:
+ st.warning("Please initialize your workspace first.")
+ st.stop()
+
+# Load the abundance pivot table and the sample-to-group mapping
+result = get_abundance_data(st.session_state["workspace"])
+if result is None:
+ st.info(
+ "Abundance data not available. Please run the workflow and configure sample groups first."
+ )
+ st.page_link(
+ "content/results_abundance.py", label="Go to Abundance", icon="π"
+ )
+ st.stop()
+
+pivot_df, group_map = result
+
+# --- STEP 1: Upstream Pipeline Tracker (Fallback Architecture) ---
+if (
+ "imputed_df" in st.session_state
+ and st.session_state["imputed_df"] is not None
+):
+ base_df = st.session_state["imputed_df"]
+ st.info(
+ "π **Upstream Pipeline Detected**: Using data processed from the **Imputation** step."
+ )
+elif (
+ "filtered_df" in st.session_state
+ and st.session_state["filtered_df"] is not None
+):
+ base_df = st.session_state["filtered_df"]
+ st.warning(
+ "β οΈ **Imputation Skipped**: Using data processed from the **Filtering** step."
+ )
+else:
+ base_df = pivot_df
+ st.warning(
+ "β οΈ **Raw Input Active**: No preprocessing history found. Operating on the original unfiltered table."
+ )
+
+# 2. Extract actual active sample columns dynamically
+sample_cols = [
+ c for c in base_df.columns if c not in ["ProteinName", "PeptideSequence"]
+]
+
+# --- SECTION 1: Active Input Table Preview ---
+st.subheader("Input Table Overview")
+st.markdown(
+ f"Currently displaying **{base_df.shape[0]}** rows and **{len(sample_cols)}** samples entering the normalization block."
+)
+st.dataframe(base_df, use_container_width=True)
+
+st.markdown("---")
+
+# --- SECTION 2: Normalization Parameter Configuration ---
+st.subheader("Configure Preprocessing & Scaling Chains")
+
+# Prepare structural Polars metadata DataFrame required by backend functions
+metadata_rows = [{"sample_id": s, "group": group_map[s]} for s in sample_cols]
+metadata_pl = pl.DataFrame(
+ metadata_rows, schema={"sample_id": pl.String, "group": pl.String}
+)
+
+col1, col2, col3 = st.columns(3)
+
+with col1:
+ st.markdown("### 𧬠1. Mathematical Transformation")
+ transform_strategy = st.selectbox(
+ "Select Transformation",
+ options=["None", "log2", "log10", "square_root", "cube_root"],
+ index=0,
+ help="Compress data dynamic range and stabilize heteroscedastic variance profiles.",
+ )
+
+with col2:
+ st.markdown("### π§ͺ 2. Sample Normalization")
+ norm_strategy = st.selectbox(
+ "Select Normalization",
+ options=["None", "sum", "median", "pqn", "reference_feature", "quantile"],
+ index=0,
+ help="Perform column-wise corrections to account for variable sample loading concentrations.",
+ )
+
+ # Conditionally display target input field for reference feature matching
+ ref_feature_input = None
+ if norm_strategy == "reference_feature":
+ ref_feature_input = st.text_input(
+ "Reference Protein Name (ID)",
+ value="",
+ placeholder="e.g., P01234 or GAPDH",
+ help="Enter the exact unique identifier string matching a key inside the 'ProteinName' column.",
+ )
+
+with col3:
+ st.markdown("### π 3. Row Scaling")
+ scaling_strategy = st.selectbox(
+ "Select Scaling Mode",
+ options=["None", "mean_centering", "auto_scaling", "pareto_scaling", "range_scaling"],
+ index=0,
+ help="Adjust individual feature weights to make low and high abundance proteins comparable.",
+ )
+
+
+# --- SECTION 3: Normalization Pipe Sequential Execution ---
+st.markdown("<br>", unsafe_allow_html=True)  # vertical spacer above the apply button
+if st.button("Apply Normalization Pipelines", type="primary"):
+
+ # Validate reference feature selection if active before hitting polars execution layers
+ if norm_strategy == "reference_feature" and not ref_feature_input:
+ st.error(
+ "β Validation Error: Please provide a valid Reference Protein Name to use the 'reference_feature' strategy."
+ )
+ st.stop()
+
+ # Convert pandas memory buffer into optimization lazy dataframe tree graph
+ processing_lazy = pl.from_pandas(base_df).lazy()
+
+ # Execute Chain 1: Transform Matrix Data
+ try:
+ processing_lazy = transform_data(
+ quantification_data=processing_lazy,
+ metadata=metadata_pl,
+ strategy=transform_strategy,
+ )
+
+ # Execute Chain 2: Normalize Sample Intensities (Columns)
+ processing_lazy = normalize_samples(
+ quantification_data=processing_lazy,
+ metadata=metadata_pl,
+ strategy=norm_strategy,
+ id_col="ProteinName",
+ reference_feature=ref_feature_input if norm_strategy == "reference_feature" else None,
+ )
+
+ # Execute Chain 3: Scale Individual Features (Rows)
+ processing_lazy = scale_data(
+ quantification_data=processing_lazy,
+ metadata=metadata_pl,
+ strategy=scaling_strategy,
+ )
+
+ # Finalize and collect pipeline query graph optimizations
+ normalized_df = processing_lazy.collect().to_pandas()
+
+ # Save the normalized table in session state for the downstream statistics step
+ st.session_state["normalized_df"] = normalized_df
+
+ st.success("Successfully executed all selected normalization pipelines!")
+
+ # Display the finalized transformation matrix view
+ st.subheader("Normalized Abundance Table")
+ st.dataframe(normalized_df, use_container_width=True)
+
+ except ValueError as val_err:
+ # Gracefully handle validation failures raised from the engine layers (e.g., missing reference protein)
+ st.error(f"Engine Configuration Error: {str(val_err)}")
+ except Exception as e:
+ st.error(f"An unexpected pipeline error occurred: {str(e)}")
\ No newline at end of file
diff --git a/content/results_abundance.py b/content/results_abundance.py
index a7ff453..b391fc2 100644
--- a/content/results_abundance.py
+++ b/content/results_abundance.py
@@ -59,25 +59,33 @@
st.page_link("content/workflow_configure.py", label="Go to Configure", icon="βοΈ")
st.stop()
- pivot_df, expr_df, group_map = result
+ pivot_df, group_map = result
- # Display group comparison info
- groups = sorted(set(group_map.values()))
- if len(groups) >= 2:
- group1, group2 = sorted(groups)[:2]
- st.info(f"Statistical comparison: **{group2} vs {group1}**")
+ # st.write("------------ pivot_df -------------")
+ # st.write(pivot_df)
- # Get sample columns (between stats and PeptideSequence)
- sample_cols = [c for c in pivot_df.columns if c not in ["ProteinName", "log2FC", "p-value", "PeptideSequence"]]
+ # st.write("----------group_map-------------")
+ # st.write(group_map)
+ # 1. Dynamically extract actual sample columns, excluding ProteinName and PeptideSequence
+ sample_cols = [
+ c
+ for c in pivot_df.columns
+ if c not in ["ProteinName", "PeptideSequence"]
+ ]
+
+ # 2. Combine values from sample columns to create an 'Intensity' list column for the bar chart
pivot_df["Intensity"] = pivot_df[sample_cols].apply(list, axis=1)
- # Reorder columns: place Intensity after p-value
- display_cols = ["ProteinName", "log2FC", "p-value", "Intensity"] + sample_cols + ["PeptideSequence"]
+ # 3. Reorder columns: [ProteinName, Intensity(chart), sample_cols..., PeptideSequence]
+ display_cols = (
+ ["ProteinName", "Intensity"] + sample_cols + ["PeptideSequence"]
+ )
display_df = pivot_df[display_cols]
+ # 4. Display the dataframe as is without sorting, since statistical columns are removed
st.dataframe(
- display_df.sort_values("p-value"),
+ display_df,
column_config={
"Intensity": st.column_config.BarChartColumn(
"Intensity",
@@ -108,4 +116,4 @@
with col2:
st.page_link("content/results_pca.py", label="PCA", icon="π")
with col3:
- st.page_link("content/results_heatmap.py", label="Heatmap", icon="π₯")
+ st.page_link("content/results_heatmap.py", label="Heatmap", icon="π₯")
\ No newline at end of file
diff --git a/content/results_heatmap.py b/content/results_heatmap.py
index 4ece3f4..1106377 100644
--- a/content/results_heatmap.py
+++ b/content/results_heatmap.py
@@ -1,19 +1,21 @@
"""Heatmap Results Page."""
import streamlit as st
import numpy as np
+import polars as pl
import plotly.express as px
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.spatial.distance import pdist
from src.common.common import page_setup
from src.common.results_helpers import get_abundance_data
+from openms_insight import Heatmap
params = page_setup()
st.title("Heatmap")
st.markdown(
"""
-Hierarchically clustered heatmap of protein-level abundance (Z-score normalized).
-Proteins and samples are ordered by similarity.
+Interactive hierarchically clustered heatmap of protein-level abundance (Z-score normalized).
+Powered by OpenMS-Insight multi-resolution engine.
"""
)
@@ -21,49 +23,79 @@
st.warning("Please initialize your workspace first.")
st.stop()
+# 1. Use the refactored get_abundance_data function (returns only pivot_df and group_map)
result = get_abundance_data(st.session_state["workspace"])
if result is None:
st.info("Abundance data not available. Please run the workflow and configure sample groups first.")
st.page_link("content/results_abundance.py", label="Go to Abundance", icon="π")
st.stop()
-pivot_df, expr_df, group_map = result
+pivot_df, group_map = result
-top_n = st.slider("Number of proteins", 20, 200, 50, key="heatmap_top_n")
+if pivot_df.empty:
+ st.info("No data available for heatmap.")
+ st.stop()
+
+# 2. Compute expr_df directly and derive sample columns internally
+# Select only the actual sample columns, excluding metadata fields like ProteinName.
+sample_cols = [c for c in pivot_df.columns if c not in ["ProteinName", "PeptideSequence", "log2FC", "p-adj", "stat", "p-value"]]
+expr_df = pivot_df.set_index("ProteinName")[sample_cols]
+# 3. UI settings (number of top variance proteins)
+top_n = st.slider("Number of proteins (Highest Variance)", 20, 200, 50, key="heatmap_top_n")
+
+# 4. Process data (variance selection -> Z-score normalization)
var_series = expr_df.var(axis=1)
top_proteins = var_series.sort_values(ascending=False).head(top_n).index
heatmap_df = expr_df.loc[top_proteins]
+
+# Compute Z-scores and clean missing/invalid values
heatmap_z = heatmap_df.sub(heatmap_df.mean(axis=1), axis=0).div(heatmap_df.std(axis=1), axis=0)
heatmap_z = heatmap_z.replace([np.inf, -np.inf], np.nan).dropna()
if not heatmap_z.empty:
- row_linkage = linkage(pdist(heatmap_z.values), method="average")
- row_order = leaves_list(row_linkage)
-
- col_linkage = linkage(pdist(heatmap_z.T.values), method="average")
- col_order = leaves_list(col_linkage)
-
- heatmap_clustered = heatmap_z.iloc[row_order, col_order]
-
- fig_heatmap = px.imshow(
- heatmap_clustered,
- labels=dict(x="Sample", y="Protein", color="Z-score"),
- aspect="auto",
- color_continuous_scale=[[0.0, "#3b6fb6"], [0.5, "white"], [1.0, "#b40426"]],
- zmin=-3, zmax=3
+ # 5. Melt and convert data to Polars to satisfy OpenMS-Insight component requirements
+ # Restore the ProteinName row index as a column
+ heatmap_z_reset = heatmap_z.reset_index()
+
+ # Unpivot the wide-format matrix into long-format (X, Y, Intensity)
+ melted_df = heatmap_z_reset.melt(
+ id_vars=["ProteinName"],
+ value_vars=sample_cols,
+ var_name="Sample",
+ value_name="Z_score"
)
+
+ # Add sample group mapping if available for heatmap categories
+ if group_map:
+ melted_df["Group"] = melted_df["Sample"].map(group_map)
+
+ # Pack the Pandas DataFrame into a Polars LazyFrame
+ heatmap_pl_lazy = pl.from_pandas(melted_df).lazy()
- fig_heatmap.update_layout(
- height=700,
- xaxis={'side': 'bottom'},
- yaxis={'side': 'left'}
+ # 6. Initialize the OpenMS-Insight Heatmap component and map attributes
+ # Component spec: X axis (Sample), Y axis (ProteinName), color intensity (Z_score)
+ heatmap_component = Heatmap(
+ cache_id="quantms_protein_heatmap",
+ x_column="Sample",
+ y_column="ProteinName",
+ data=heatmap_pl_lazy,
+ intensity_column="Z_score", # cell colour is driven by the values in this column
+ title="Protein Abundance Heatmap (Z-score)",
+ x_label="Samples",
+ y_label="Proteins",
+ colorscale="RdBu", # Red-Blue scale
+ reversescale=True,
+ log_scale=False, # Z-scores include negative values, so keep the log scale off
+ intensity_label="Z-score", # legend title
+ category_column=None, # NOTE(review): a "Group" column is added to the data above when group_map is set, but it is not used here - confirm intent
+ min_points=10000, # NOTE(review): original note says this point budget is set generously so the grid is drawn as intended - confirm against the Heatmap API
+ )
- fig_heatmap.update_xaxes(tickfont=dict(size=10))
- fig_heatmap.update_yaxes(tickfont=dict(size=8))
+ # 7. Render the component
+ state_manager = st.session_state.get("state")
+ heatmap_component(state_manager=state_manager)
- st.plotly_chart(fig_heatmap, use_container_width=True)
else:
st.warning("Insufficient data to generate the heatmap.")
@@ -73,4 +105,4 @@
with col1:
st.page_link("content/results_volcano.py", label="Volcano Plot", icon="π")
with col2:
- st.page_link("content/results_pca.py", label="PCA", icon="π")
+ st.page_link("content/results_pca.py", label="PCA", icon="π")
\ No newline at end of file
diff --git a/content/results_pca.py b/content/results_pca.py
index 45ea8eb..6f8ebce 100644
--- a/content/results_pca.py
+++ b/content/results_pca.py
@@ -6,6 +6,7 @@
from sklearn.preprocessing import StandardScaler
from src.common.common import page_setup
from src.common.results_helpers import get_abundance_data
+from openms_insight.components.pca import run_and_plot_pca
params = page_setup()
st.title("PCA Analysis")
@@ -21,67 +22,72 @@
st.warning("Please initialize your workspace first.")
st.stop()
+# 1. Use the updated get_abundance_data (it now returns two values: pivot_df, group_map)
result = get_abundance_data(st.session_state["workspace"])
if result is None:
st.info("Abundance data not available. Please run the workflow and configure sample groups first.")
st.page_link("content/results_abundance.py", label="Go to Abundance", icon="π")
st.stop()
-pivot_df, expr_df, group_map = result
+_, group_map = result
+
+if "statistics_df" not in st.session_state or st.session_state["statistics_df"] is None:
+ st.info("Statistical analysis data not found. Please run the statistical inference first to obtain p-adj values.")
+ # st.page_link("content/results_statistical.py", label="Go to Statistical Inference", icon="π")
+ st.stop()
+
+target_df = st.session_state["statistics_df"]
+
+# 2. Build expr_df (the expression matrix) directly on this page
+# Use the group_map keys (sample names) as columns to extract only the expression data.
+sample_columns = list(group_map.keys())
+
+# target_df must contain a protein identifier (e.g. ProteinName) and the sample columns.
+if "ProteinName" in target_df.columns:
+ expr_df = target_df.set_index("ProteinName")[sample_columns]
+elif target_df.index.name == "ProteinName":
+ expr_df = target_df[sample_columns]
+else:
+ # Fallback: if ProteinName is neither a column nor the index name, treat the first column as the index
+ expr_df = target_df.set_index(target_df.columns[0])[sample_columns]
top_n = 500
+# 3. Keep the top-n proteins ranked by adjusted p-value (p-adj)
+# Pick the most significant proteins from target_df
top_proteins = (
- pivot_df
+ target_df
.dropna(subset=["p-adj"])
.sort_values("p-adj", ascending=True)
- .head(top_n)["ProteinName"]
+ .head(top_n)
)
+# If the index was changed above, take the protein names from wherever they live (column or index).
+if "ProteinName" in top_proteins.columns:
+ top_protein_names = top_proteins["ProteinName"]
+else:
+ top_protein_names = top_proteins.index
+
expr_df_pca = expr_df.loc[
- expr_df.index.intersection(top_proteins)
+ expr_df.index.intersection(top_protein_names)
]
if expr_df_pca.shape[0] < 2:
st.info("Not enough proteins after p-value filtering for PCA.")
st.stop()
-X = expr_df_pca.T
-X_scaled = StandardScaler().fit_transform(X)
-
-pca = PCA(n_components=2)
-pcs = pca.fit_transform(X_scaled)
-
-pca_df = pd.DataFrame(
- pcs,
- columns=["PC1", "PC2"],
- index=X.index
-)
-
-norm_map = {
- k.replace(".mzML", ""): v
- for k, v in group_map.items()
-}
-pca_df["Group"] = pca_df.index.map(norm_map)
-
-fig_pca = px.scatter(
- pca_df,
- x="PC1",
- y="PC2",
- color="Group",
- text=pca_df.index,
-)
-
-fig_pca.update_traces(textposition="top center")
-fig_pca.update_layout(
- xaxis_title=f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}%)",
- yaxis_title=f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}%)",
- height=600,
-)
+# 4. OpenMS-Insight module call
+# The decomposition and Plotly plotting code that used to live here is encapsulated in an external module.
+try:
+ # Call the analysis and plotting function
+ fig_pca, num_proteins = run_and_plot_pca(expr_df_pca, group_map)
+
+ st.plotly_chart(fig_pca, use_container_width=True)
+ st.markdown(f"**Proteins used:** {num_proteins} (top {top_n} by p-adj)")
-st.plotly_chart(fig_pca, use_container_width=True)
+except Exception as e:
+ st.error(f"PCA 시각화 중 오류가 발생했습니다: {e}")
-st.markdown(f"**Proteins used:** {expr_df_pca.shape[0]} (top {top_n} by p-adj)")
st.markdown("---")
st.markdown("**Other visualizations:**")
@@ -89,4 +95,4 @@
with col1:
st.page_link("content/results_volcano.py", label="Volcano Plot", icon="π")
with col2:
- st.page_link("content/results_heatmap.py", label="Heatmap", icon="π₯")
+ st.page_link("content/results_heatmap.py", label="Heatmap", icon="π₯")
\ No newline at end of file
diff --git a/content/results_volcano.py b/content/results_volcano.py
index 8502489..1a4a1e9 100644
--- a/content/results_volcano.py
+++ b/content/results_volcano.py
@@ -1,9 +1,11 @@
"""Volcano Plot Results Page."""
import streamlit as st
+import polars as pl
import plotly.express as px
import numpy as np
from src.common.common import page_setup
from src.common.results_helpers import get_abundance_data
+from openms_insight import VolcanoPlot
params = page_setup()
st.title("Volcano Plot")
@@ -19,23 +21,27 @@
st.warning("Please initialize your workspace first.")
st.stop()
-result = get_abundance_data(st.session_state["workspace"])
-if result is None:
- st.info("Abundance data not available. Please run the workflow and configure sample groups first.")
- st.page_link("content/results_abundance.py", label="Go to Abundance", icon="π")
+# 1. Check if statistical analysis results are available in the session state
+if "statistics_df" not in st.session_state or st.session_state["statistics_df"] is None:
+ st.info("Statistical analysis data not found. Please run the statistical engine first.")
+ # Set the icon link to the actual statistics page file path (e.g. statistical.py).
+ st.page_link("content/statistical.py", label="Go to Statistical Inference", icon="π¬")
st.stop()
-pivot_df, expr_df, group_map = result
+# Retrieve the completed statistical analysis DataFrame
+statistics_df = st.session_state["statistics_df"]
-if pivot_df.empty:
+if statistics_df.empty:
st.info("No data available for volcano plot.")
st.stop()
-volcano_df = pivot_df.copy()
-volcano_df = volcano_df.dropna(subset=["log2FC", "p-adj"])
-
-volcano_df["neg_log10_padj"] = -np.log10(volcano_df["p-adj"])
+# 2. Clean data and convert to Polars for component input
+# Drop missing values from the required 'log2FC' and 'p-adj' columns in `statistics_df`.
+volcano_df = statistics_df.dropna(subset=["log2FC", "p-adj"]).copy()
+# Convert the Pandas DataFrame to a Polars LazyFrame for component injection.
+volcano_pl_lazy = pl.from_pandas(volcano_df).lazy()
+# 3. Configure UI sliders (changing thresholds does not invalidate cache)
fc_thresh = st.slider(
"log2 Fold Change threshold",
min_value=0.5,
@@ -52,49 +58,34 @@
step=0.001,
)
-volcano_df["Significance"] = "Not significant"
-volcano_df.loc[
- (volcano_df["p-adj"] <= p_thresh) & (volcano_df["log2FC"] >= fc_thresh),
- "Significance",
-] = "Up-regulated"
-
-volcano_df.loc[
- (volcano_df["p-adj"] <= p_thresh) & (volcano_df["log2FC"] <= -fc_thresh),
- "Significance",
-] = "Down-regulated"
-
-fig_volcano = px.scatter(
- volcano_df,
- x="log2FC",
- y="neg_log10_padj",
- color="Significance",
- hover_data=["ProteinName", "log2FC", "p-value", "p-adj"],
- color_discrete_map={
- "Up-regulated": "red",
- "Down-regulated": "blue",
- "Not significant": "lightgrey",
- }
+# 4. Initialize the OpenMS-Insight VolcanoPlot component
+volcano_plot_component = VolcanoPlot(
+ cache_id="quantms_volcano_plot",
+ data=volcano_pl_lazy,
+ log2fc_column="log2FC",
+ pvalue_column="p-adj",
+ label_column="ProteinName",
+ up_color="#E74C3C",
+ down_color="#3498DB",
+ ns_color="#95A5A6",
+ show_threshold_lines=True,
+ threshold_line_style="dash",
)
-fig_volcano.add_vline(x=fc_thresh, line_dash="dash")
-fig_volcano.add_vline(x=-fc_thresh, line_dash="dash")
-fig_volcano.add_hline(y=-np.log10(p_thresh), line_dash="dash")
+# 5. Render the component
+state_manager = st.session_state.get("state") # Inject the project state management object
-# Make x-axis symmetric around zero
-max_abs_fc = volcano_df["log2FC"].abs().max()
-x_range = [-max_abs_fc * 1.1, max_abs_fc * 1.1] # 10% padding
-
-fig_volcano.update_layout(
- xaxis_title="log2 Fold Change",
- yaxis_title="-log10(p-adj)",
- xaxis_range=x_range,
+volcano_plot_component(
+ state_manager=state_manager,
+ fc_threshold=fc_thresh,
+ p_threshold=p_thresh,
+ max_labels=10, # Display labels for the top N significant proteins
height=600,
)
-st.plotly_chart(fig_volcano, use_container_width=True)
-
-up_count = (volcano_df["Significance"] == "Up-regulated").sum()
-down_count = (volcano_df["Significance"] == "Down-regulated").sum()
+# 6. Keep the existing statistical summary and bottom links
+up_count = ((volcano_df["p-adj"] <= p_thresh) & (volcano_df["log2FC"] >= fc_thresh)).sum()
+down_count = ((volcano_df["p-adj"] <= p_thresh) & (volcano_df["log2FC"] <= -fc_thresh)).sum()
st.markdown(f"**Up-regulated:** {up_count} | **Down-regulated:** {down_count}")
st.markdown("---")
@@ -103,4 +94,4 @@
with col1:
st.page_link("content/results_pca.py", label="PCA", icon="π")
with col2:
- st.page_link("content/results_heatmap.py", label="Heatmap", icon="π₯")
+ st.page_link("content/results_heatmap.py", label="Heatmap", icon="π₯")
\ No newline at end of file
diff --git a/content/statistical.py b/content/statistical.py
new file mode 100644
index 0000000..97ef7ff
--- /dev/null
+++ b/content/statistical.py
@@ -0,0 +1,163 @@
+"""Statistical Inference Page."""
+
+from pathlib import Path
+import pandas as pd
+import polars as pl
+import streamlit as st
+from src.common.common import page_setup
+from src.common.results_helpers import get_abundance_data
+# Import statistics engine functions from openms_insight
+from openms_insight.analysis.statistics import calculate_statistical_tests, adjust_fdr_lazy
+
+params = page_setup()
+st.title("Statistical Inference")
+
+st.markdown(
+ """
+Run differential expression analysis to identify statistically significant proteins across your biological groups.
+"""
+)
+
+if "workspace" not in st.session_state:
+ st.warning("Please initialize your workspace first.")
+ st.stop()
+
+# Load primary database assets
+result = get_abundance_data(st.session_state["workspace"])
+if result is None:
+ st.info(
+ "Abundance data not available. Please run the workflow and configure sample groups first."
+ )
+ st.page_link(
+ "content/results_abundance.py", label="Go to Abundance", icon="π"
+ )
+ st.stop()
+
+pivot_df, group_map = result
+
+# --- STEP 1: Upstream Pipeline Tracker (Fallback Architecture) ---
+if (
+ "normalized_df" in st.session_state
+ and st.session_state["normalized_df"] is not None
+):
+ base_df = st.session_state["normalized_df"]
+ st.info(
+ "π **Upstream Pipeline Detected**: Using data processed from the **Normalization** step."
+ )
+elif (
+ "imputed_df" in st.session_state
+ and st.session_state["imputed_df"] is not None
+):
+ base_df = st.session_state["imputed_df"]
+ st.warning(
+ "β οΈ **Normalization Skipped**: Using data processed from the **Imputation** step."
+ )
+elif (
+ "filtered_df" in st.session_state
+ and st.session_state["filtered_df"] is not None
+):
+ base_df = st.session_state["filtered_df"]
+ st.warning(
+ "β οΈ **Preprocessing Skipped**: Using data processed from the **Filtering** step."
+ )
+else:
+ base_df = pivot_df
+ st.warning(
+ "β οΈ **Raw Input Active**: No preprocessing history found. Operating on the original table."
+ )
+
+# 2. Extract actual active sample columns and detect unique biological groups
+sample_cols = [
+ c for c in base_df.columns if c not in ["ProteinName", "PeptideSequence"]
+]
+unique_groups = sorted(list(set([group_map[s] for s in sample_cols])))
+group_count = len(unique_groups)
+
+# --- SECTION 1: Active Input Table Preview ---
+st.subheader("Input Table Overview")
+st.markdown(
+ f"Currently analyzing **{base_df.shape[0]}** rows across **{len(sample_cols)}** samples belonging to **{group_count} groups** ({', '.join(unique_groups)})."
+)
+st.dataframe(base_df, use_container_width=True)
+
+st.markdown("---")
+
+# --- SECTION 2: Dynamic Statistical Parameter Configuration ---
+st.subheader("Configure Statistical Engine")
+
+# Prepare structural Polars metadata DataFrame required by backend functions
+metadata_rows = [{"sample_id": s, "group": group_map[s]} for s in sample_cols]
+metadata_pl = pl.DataFrame(
+ metadata_rows, schema={"sample_id": pl.String, "group": pl.String}
+)
+
+col1, col2 = st.columns(2)
+
+with col1:
+ st.markdown("### π¬ 1. Hypothesis Testing Method")
+
+ # Route available method options dynamically based on the group count
+ if group_count == 2:
+ method_options = ["limma_like", "welch", "paired"]
+ help_text = "'limma_like' uses Empirical Bayes variance shrinking. 'welch' is for unequal variances. 'paired' is for dependent samples."
+ elif group_count >= 3:
+ method_options = ["limma_like", "anova"]
+ help_text = "'limma_like' supports multi-group design matrices. 'anova' computes standard row-wise One-way ANOVA."
+ else:
+ st.error("β Statistical testing requires at least 2 unique sample groups.")
+ st.stop()
+
+ selected_method = st.selectbox(
+ "Select Statistical Test",
+ options=method_options,
+ index=0,
+ help=help_text
+ )
+
+with col2:
+ st.markdown("### π‘οΈ 2. Multiple Testing Correction (FDR)")
+ selected_fdr = st.selectbox(
+ "Select FDR Adjustment Strategy",
+ options=["BH", "Bonferroni", "None"],
+ index=0,
+ help="'BH' (Benjamini-Hochberg) controls False Discovery Rate. 'Bonferroni' is strict Family-Wise Error Rate control."
+ )
+
+# --- SECTION 3: Statistical Query Execution ---
+st.markdown("<br>", unsafe_allow_html=True)
+if st.button("Run Statistical Analysis", type="primary"):
+
+ # Convert active pandas dataframe into polars lazyframe graph
+ stats_lazy = pl.from_pandas(base_df).lazy()
+
+ try:
+ # Execute Chain 1: Calculate core statistics (Adds log2FC, stat, p-value)
+ stats_lazy = calculate_statistical_tests(
+ quantification_data=stats_lazy,
+ metadata=metadata_pl,
+ method=selected_method
+ )
+
+ # Execute Chain 2: Adjust Multiple Testing (Adds p-adj)
+ stats_lazy = adjust_fdr_lazy(
+ quantification_data=stats_lazy,
+ strategy=selected_fdr
+ )
+
+ # Resolve lazy graph optimization tree and bring back to pandas memory
+ statistics_df = stats_lazy.collect().to_pandas()
+
+ # Save the result in session state for downstream pages (e.g. the Volcano and PCA pages)
+ st.session_state["statistics_df"] = statistics_df
+
+ st.success(f"Successfully calculated **{selected_method}** test with **{selected_fdr}** FDR correction!")
+
+ # Display the finalized statistics table view
+ st.subheader("Statistical Analysis Results")
+ st.markdown(f"Generated framework containing columns: `ProteinName`, `log2FC`, `stat`, `p-value`, `p-adj`, `PeptideSequence`")
+ st.dataframe(statistics_df, use_container_width=True)
+
+ except ValueError as val_err:
+ st.error(f"Engine Validation Failure: {str(val_err)}")
+ except Exception as e:
+ st.error(f"An unexpected pipeline error occurred: {str(e)}")
\ No newline at end of file
diff --git a/quantms_protein_heatmap/manifest.json b/quantms_protein_heatmap/manifest.json
new file mode 100644
index 0000000..c896f9f
--- /dev/null
+++ b/quantms_protein_heatmap/manifest.json
@@ -0,0 +1,49 @@
+{
+ "version": 3,
+ "component_type": "heatmap",
+ "created_at": "2026-06-25T14:51:08.657547",
+ "config_hash": "17538bbd7c8bb0b83ef303c424c0f51e7dccd7b704b8b4d6f2f9b91555db0343",
+ "config": {
+ "x_column": "Sample",
+ "y_column": "ProteinName",
+ "intensity_column": "Z_score",
+ "min_points": 10000,
+ "display_aspect_ratio": 1.7777777777777777,
+ "x_bins": 188,
+ "y_bins": 106,
+ "use_simple_downsample": false,
+ "use_streaming": true,
+ "categorical_filters": [],
+ "zoom_identifier": "heatmap_zoom",
+ "title": "Protein Abundance Heatmap (Z-score)",
+ "x_label": "Samples",
+ "y_label": "Proteins",
+ "colorscale": "RdBu",
+ "category_column": null,
+ "log_scale": false,
+ "low_values_on_top": false,
+ "intensity_label": "Z-score"
+ },
+ "filters": {},
+ "filter_defaults": {},
+ "interactivity": {},
+ "data_files": {
+ "level_0": "level_0.parquet",
+ "level_1": "level_1.parquet"
+ },
+ "data_values": {
+ "x_range": [
+ "01_1",
+ "10_3"
+ ],
+ "y_range": [
+ "sp|Biognosys|iRT-Kit_WR_fusion",
+ "sp|Q12496|YO098_YEAST"
+ ],
+ "total": 300,
+ "level_sizes": [
+ 300
+ ],
+ "num_levels": 2
+ }
+}
\ No newline at end of file
diff --git a/quantms_protein_heatmap/preprocessed/level_0.parquet b/quantms_protein_heatmap/preprocessed/level_0.parquet
new file mode 100644
index 0000000..845428e
Binary files /dev/null and b/quantms_protein_heatmap/preprocessed/level_0.parquet differ
diff --git a/quantms_protein_heatmap/preprocessed/level_1.parquet b/quantms_protein_heatmap/preprocessed/level_1.parquet
new file mode 100644
index 0000000..845428e
Binary files /dev/null and b/quantms_protein_heatmap/preprocessed/level_1.parquet differ
diff --git a/quantms_volcano_plot/manifest.json b/quantms_volcano_plot/manifest.json
new file mode 100644
index 0000000..d572f61
--- /dev/null
+++ b/quantms_volcano_plot/manifest.json
@@ -0,0 +1,26 @@
+{
+ "version": 3,
+ "component_type": "volcanoplot",
+ "created_at": "2026-06-25T14:41:42.791178",
+ "config_hash": "b06bb8d4c9dc289910868a8f1d6fdb18e1deb7feaa21962787f7d334e30f5608",
+ "config": {
+ "log2fc_column": "log2FC",
+ "pvalue_column": "p-adj",
+ "label_column": "ProteinName",
+ "title": null,
+ "x_label": "log2 Fold Change",
+ "y_label": "-log10(p-value)",
+ "up_color": "#E74C3C",
+ "down_color": "#3498DB",
+ "ns_color": "#95A5A6",
+ "show_threshold_lines": true,
+ "threshold_line_style": "dash"
+ },
+ "filters": {},
+ "filter_defaults": {},
+ "interactivity": {},
+ "data_files": {
+ "volcanoData": "volcanoData.parquet"
+ },
+ "data_values": {}
+}
\ No newline at end of file
diff --git a/quantms_volcano_plot/preprocessed/volcanoData.parquet b/quantms_volcano_plot/preprocessed/volcanoData.parquet
new file mode 100644
index 0000000..6e2d7a2
Binary files /dev/null and b/quantms_volcano_plot/preprocessed/volcanoData.parquet differ
diff --git a/requirements.txt b/requirements.txt
index aac2879..a6b90e5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -149,4 +149,5 @@ mygene
# Redis Queue dependencies (for online mode)
redis>=5.0.0
rq>=1.16.0
-statsmodels
\ No newline at end of file
+statsmodels
+polars
\ No newline at end of file
diff --git a/src/common/results_helpers.py b/src/common/results_helpers.py
index db3e103..d28fe57 100644
--- a/src/common/results_helpers.py
+++ b/src/common/results_helpers.py
@@ -9,6 +9,7 @@
from pyopenms import IdXMLFile, MSExperiment, MzMLFile
from src.workflow.ParameterManager import ParameterManager
from statsmodels.stats.multitest import multipletests
+from openms_insight.analysis.statistics import calculate_statistical_tests, adjust_fdr_lazy
def get_workflow_dir(workspace):
"""Get the workflow directory path."""
@@ -184,15 +185,14 @@ def build_spectra_cache(mzml_dir: Path, filename_to_index: dict) -> tuple[pl.Dat
@st.cache_data
-def load_abundance_data(workspace_path: str, csv_mtime: float) -> tuple | None:
- """Load CSV, compute stats (log2FC, p-value), build pivot_df and expr_df.
-
- Args:
- workspace_path: Path to the workspace directory
- csv_mtime: Modification time of CSV file (used as cache key)
-
- Returns:
- Tuple of (pivot_df, expr_df, group_map) or None if data unavailable
+def load_abundance_data(
+ workspace_path: str,
+ csv_mtime: float,
+) -> tuple | None:
+ """Load a long-format CSV, pivot it into a standard wide-format table
+
+ with sample intensity columns, and return the table along with the group
+ mapping.
"""
workflow_dir = get_workflow_dir(Path(workspace_path))
quant_dir = workflow_dir / "results" / "quant_results"
@@ -214,7 +214,7 @@ def load_abundance_data(workspace_path: str, csv_mtime: float) -> tuple | None:
if df.empty:
return None
- # Get group mapping from parameters
+ # 1. Extract group mapping information from the parameter JSON
param_manager = ParameterManager(workflow_dir)
params = param_manager.get_parameters_from_json()
group_map = {
@@ -223,67 +223,41 @@ def load_abundance_data(workspace_path: str, csv_mtime: float) -> tuple | None:
if key.startswith("mzML-group-") and value
}
+ st.write(f"group_map: {group_map}")
+
if not group_map:
return None
+ # 2. Extract sample names and map to groups
df["Sample"] = df["Reference"].str.replace(".mzML", "", regex=False)
df["Group"] = df["Reference"].map(group_map)
df = df.dropna(subset=["Group"])
groups = sorted(df["Group"].unique())
-
if len(groups) < 2:
return None
- group1, group2 = groups[:2]
-
- # Compute statistics per protein
- stats_rows = []
- for protein, protein_df in df.groupby("ProteinName"):
- g1_vals = protein_df[protein_df["Group"] == group1]["Intensity"].values
- g2_vals = protein_df[protein_df["Group"] == group2]["Intensity"].values
-
- if len(g1_vals) < 2 or len(g2_vals) < 2:
- pval = np.nan
- else:
- _, pval = ttest_ind(g1_vals, g2_vals, equal_var=False)
-
- mean_g1 = np.mean(g1_vals) if len(g1_vals) > 0 else np.nan
- mean_g2 = np.mean(g2_vals) if len(g2_vals) > 0 else np.nan
-
- log2fc = np.log2(mean_g2 / mean_g1) if mean_g1 > 0 else np.nan
-
- stats_rows.append({
- "ProteinName": protein,
- "log2FC": log2fc,
- "p-value": pval,
- })
-
- stats_df = pd.DataFrame(stats_rows)
-
- if not stats_df.empty:
- mask = stats_df["p-value"].notna()
- if mask.any():
- _, p_adj, _, _ = multipletests(stats_df.loc[mask, "p-value"], method="fdr_bh")
- stats_df.loc[mask, "p-adj"] = p_adj
- else:
- stats_df["p-adj"] = np.nan
-
- # Order samples by group (group2 first, then group1)
+ # 3. Define ordering and build sample arrays
sample_group_df = df[["Sample", "Group"]].drop_duplicates()
- group2_samples = sample_group_df[sample_group_df["Group"] == group2]["Sample"].tolist()
- group1_samples = sample_group_df[sample_group_df["Group"] == group1]["Sample"].tolist()
- all_samples = group2_samples + group1_samples
-
- # Build pivot table
+ group1_samples = sample_group_df[sample_group_df["Group"] == groups[0]][
+ "Sample"
+ ].tolist()
+ group2_samples = sample_group_df[sample_group_df["Group"] == groups[1]][
+ "Sample"
+ ].tolist()
+ all_samples = group1_samples + group2_samples
+
+ # 4. Convert from long to wide format (Pivot) and fill missing values with 0.0
pivot_list = []
for protein, group_df in df.groupby("ProteinName"):
peptides = ";".join(group_df["PeptideSequence"].unique())
intensity_dict = group_df.groupby("Sample")["Intensity"].sum().to_dict()
+
+ # Fill sample columns (use 0.0 if missing)
intensity_dict_complete = {
- sample: intensity_dict.get(sample, 0)
- for sample in all_samples
+ sample: intensity_dict.get(sample, 0.0) for sample in all_samples
}
+
row = {
"ProteinName": protein,
**intensity_dict_complete,
@@ -292,16 +266,20 @@ def load_abundance_data(workspace_path: str, csv_mtime: float) -> tuple | None:
pivot_list.append(row)
pivot_df = pd.DataFrame(pivot_list)
- pivot_df = pivot_df.merge(stats_df, on="ProteinName", how="left")
- pivot_df = pivot_df[["ProteinName", "log2FC", "p-value", "p-adj"] + all_samples + ["PeptideSequence"]]
- # Build expression matrix (log2-transformed)
- expr_df = pivot_df.set_index("ProteinName")[all_samples]
- expr_df = expr_df.replace(0, np.nan)
- expr_df = np.log2(expr_df + 1)
- expr_df = expr_df.dropna()
+ # 5. Reorder columns to match the required standard format
+ # Structure: [ProteinName, Sample_1, Sample_2, ..., PeptideSequence]
+ columns_order = ["ProteinName"] + all_samples + ["PeptideSequence"]
+ pivot_df = pivot_df[columns_order]
+
+ # 6. Clean up the group_map keys right before returning to the caller
+ clean_group_map = {}
+ for k, v in group_map.items():
+ clean_key = k[:-5] if k.endswith(".mzML") else k
+ clean_group_map[clean_key] = v
- return pivot_df, expr_df, group_map
+ # Return final results with the clean group map
+ return pivot_df, clean_group_map
def get_abundance_data(workspace: Path) -> tuple | None:
@@ -324,4 +302,4 @@ def get_abundance_data(workspace: Path) -> tuple | None:
return None
csv_mtime = csv_files[0].stat().st_mtime
- return load_abundance_data(str(workspace), csv_mtime)
+ return load_abundance_data(str(workspace), csv_mtime)
\ No newline at end of file