Spaces:

InstaDeepAI
/

InstaNovo

Running on Zero

App Files Community

BioGeek committed on Mar 27

Commit

e38f067

2 Parent(s): 6851f02 44b2355

Merge branch 'citation'

Browse files

Files changed (1) hide show

app.py +140 -63

app.py CHANGED Viewed

@@ -28,13 +28,15 @@ except ImportError as e:
     raise ImportError(f"Failed to import InstaNovo components: {e}")
 # --- Configuration ---
-MODEL_ID = "instanovo-v1.1.0" # Use the desired pretrained model ID
 KNAPSACK_DIR = Path("./knapsack_cache")
-DEFAULT_CONFIG_PATH = Path("./configs/inference/default.yaml") # Assuming instanovo installs configs locally relative to execution
 # Determine device
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-FP16 = DEVICE == "cuda" # Enable FP16 only on CUDA
 # --- Global Variables (Load Model and Knapsack Once) ---
 MODEL: InstaNovo | None = None
@@ -78,9 +80,9 @@ def load_model_and_knapsack():
     # --- Knapsack Handling ---
     knapsack_exists = (
-        (KNAPSACK_DIR / "parameters.pkl").exists() and
-        (KNAPSACK_DIR / "masses.npy").exists() and
-        (KNAPSACK_DIR / "chart.npy").exists()
     )
     if knapsack_exists:
@@ -96,11 +98,15 @@ def load_model_and_knapsack():
     if not knapsack_exists:
         logger.info("Knapsack not found or failed to load. Generating knapsack...")
         if RESIDUE_SET is None:
-             raise gr.Error("Cannot generate knapsack because ResidueSet failed to load.")
         try:
             # Prepare residue masses for knapsack generation (handle negative/zero masses)
             residue_masses_knapsack = dict(RESIDUE_SET.residue_masses.copy())
-            negative_residues = [k for k, v in residue_masses_knapsack.items() if v <= 0]
             if negative_residues:
                 logger.info(f"Warning: Non-positive masses found in residues: {negative_residues}. "
                       "Excluding from knapsack generation.")
@@ -108,19 +114,19 @@ def load_model_and_knapsack():
                     del residue_masses_knapsack[res]
             # Remove special tokens explicitly if they somehow got mass
             for special_token in RESIDUE_SET.special_tokens:
-                 if special_token in residue_masses_knapsack:
-                     del residue_masses_knapsack[special_token]
             # Ensure residue indices used match those without special/negative masses
             valid_residue_indices = {
-                res: idx for res, idx in RESIDUE_SET.residue_to_index.items()
                 if res in residue_masses_knapsack
             }
             KNAPSACK = Knapsack.construct_knapsack(
                 residue_masses=residue_masses_knapsack,
-                residue_indices=valid_residue_indices, # Use only valid indices
                 max_mass=MAX_MASS,
                 mass_scale=MASS_SCALE,
             )
@@ -135,6 +141,7 @@ def load_model_and_knapsack():
 # Load the model and knapsack when the script starts
 load_model_and_knapsack()
 def create_inference_config(
     input_path: str,
     output_path: str,
@@ -143,7 +150,7 @@ def create_inference_config(
     """Creates the OmegaConf DictConfig needed for prediction."""
     # Load default config if available, otherwise create from scratch
     if DEFAULT_CONFIG_PATH.exists():
-         base_cfg = OmegaConf.load(DEFAULT_CONFIG_PATH)
     else:
          logger.info(f"Warning: Default config not found at {DEFAULT_CONFIG_PATH}. Using minimal config.")
          # Create a minimal config if default is missing
@@ -206,7 +213,9 @@ def create_inference_config(
         cfg_overrides["use_knapsack"] = False
     elif "Knapsack" in decoding_method:
         if KNAPSACK is None:
-            raise gr.Error("Knapsack is not available. Cannot use Knapsack Beam Search.")
         cfg_overrides["num_beams"] = 5
         cfg_overrides["use_knapsack"] = True
         cfg_overrides["knapsack_path"] = str(KNAPSACK_DIR)
@@ -223,9 +232,9 @@ def predict_peptides(input_file, decoding_method):
     Main function to load data, run prediction, and return results.
     """
     if MODEL is None or RESIDUE_SET is None or MODEL_CONFIG is None:
-         load_model_and_knapsack() # Attempt to reload if None (e.g., after space restart)
-         if MODEL is None:
-              raise gr.Error("InstaNovo model is not loaded. Cannot perform prediction.")
     if input_file is None:
         raise gr.Error("Please upload a mass spectrometry file.")
@@ -248,17 +257,18 @@ def predict_peptides(input_file, decoding_method):
         try:
             sdf = SpectrumDataFrame.load(
                 config.data_path,
-                lazy=False, # Load eagerly for Gradio simplicity
-                is_annotated=False, # De novo mode
                 column_mapping=config.get("column_map", None),
                 shuffle=False,
-                verbose=True # Print loading logs
             )
             # Apply charge filter like in CLI
             original_size = len(sdf)
             max_charge = config.get("max_charge", 10)
             sdf.filter_rows(
-                lambda row: (row["precursor_charge"] <= max_charge) and (row["precursor_charge"] > 0)
             )
             if len(sdf) < original_size:
                 logger.info(f"Warning: Filtered {original_size - len(sdf)} spectra with charge > {max_charge} or <= 0.")
@@ -275,16 +285,17 @@ def predict_peptides(input_file, decoding_method):
             sdf,
             RESIDUE_SET,
             MODEL_CONFIG.get("n_peaks", 200),
-            return_str=True, # Needed for greedy/beam search targets later (though not used here)
             annotated=False,
-            pad_spectrum_max_length=config.get("compile_model", False) or config.get("use_flash_attention", False),
             bin_spectra=config.get("conv_peak_encoder", False),
         )
         dl = DataLoader(
             ds,
             batch_size=config.batch_size,
-            num_workers=0, # Required by SpectrumDataFrame
-            shuffle=False, # Required by SpectrumDataFrame
             collate_fn=collate_batch,
         )
@@ -293,8 +304,10 @@ def predict_peptides(input_file, decoding_method):
         decoder: Decoder
         if config.use_knapsack:
             if KNAPSACK is None:
-                 # This check should ideally be earlier, but double-check
-                 raise gr.Error("Knapsack is required for Knapsack Beam Search but is not available.")
             # KnapsackBeamSearchDecoder doesn't directly load from path in this version?
             # We load Knapsack globally, so just pass it.
             # If it needed path: decoder = KnapsackBeamSearchDecoder.from_file(model=MODEL, path=config.knapsack_path)
@@ -316,15 +329,22 @@ def predict_peptides(input_file, decoding_method):
         # 5. Run Prediction Loop (Adapted from instanovo/transformer/predict.py)
         logger.info("Starting prediction...")
         start_time = time.time()
-        results_list: list[ScoredSequence | list] = [] # Store ScoredSequence or empty list
         for i, batch in enumerate(dl):
-            spectra, precursors, spectra_mask, _, _ = batch # Ignore peptides/masks for de novo
             spectra = spectra.to(DEVICE)
             precursors = precursors.to(DEVICE)
             spectra_mask = spectra_mask.to(DEVICE)
-            with torch.no_grad(), torch.amp.autocast(DEVICE, dtype=torch.float16, enabled=FP16):
                 # Beam search decoder might return list[list[ScoredSequence]] if return_beam=True
                 # Greedy decoder returns list[ScoredSequence]
                 # KnapsackBeamSearchDecoder returns list[ScoredSequence] or list[list[ScoredSequence]]
@@ -334,9 +354,12 @@ def predict_peptides(input_file, decoding_method):
                     beam_size=config.num_beams,
                     max_length=config.max_length,
                     # Knapsack/Beam Search specific params if needed
-                    mass_tolerance=config.get("filter_precursor_ppm", 20) * 1e-6, # Convert ppm to relative
-                    max_isotope=config.isotope_error_range[1] if config.isotope_error_range else 1,
-                    return_beam=False # Only get the top prediction for simplicity
                 )
             results_list.extend(batch_predictions) # Should be list[ScoredSequence] or list[list]
             logger.info(f"Processed batch {i+1}/{len(dl)}")
@@ -349,26 +372,30 @@ def predict_peptides(input_file, decoding_method):
         output_data = []
         # Use sdf index columns + prediction results
         index_cols = [col for col in config.index_columns if col in sdf.df.columns]
-        base_df_pd = sdf.df.select(index_cols).to_pandas() # Get base info
         metrics_calc = Metrics(RESIDUE_SET, config.isotope_error_range)
         for i, res in enumerate(results_list):
-            row_data = base_df_pd.iloc[i].to_dict() # Get corresponding input data
             if isinstance(res, ScoredSequence) and res.sequence:
                 sequence_str = "".join(res.sequence)
                 row_data["prediction"] = sequence_str
                 row_data["log_probability"] = f"{res.sequence_log_probability:.4f}"
                 # Use metrics to calculate delta mass ppm for the top prediction
                 try:
-                     _, delta_mass_list = metrics_calc.matches_precursor(
-                         res.sequence,
-                         row_data["precursor_mz"],
-                         row_data["precursor_charge"]
-                     )
-                     # Find the smallest absolute ppm error across isotopes
-                     min_abs_ppm = min(abs(p) for p in delta_mass_list) if delta_mass_list else float('nan')
-                     row_data["delta_mass_ppm"] = f"{min_abs_ppm:.2f}"
                 except Exception as e:
                      logger.info(f"Warning: Could not calculate delta mass for prediction {i}: {e}")
                      row_data["delta_mass_ppm"] = "N/A"
@@ -382,7 +409,14 @@ def predict_peptides(input_file, decoding_method):
         output_df = pl.DataFrame(output_data)
         # Ensure specific columns are present and ordered
-        display_cols = ["scan_number", "precursor_mz", "precursor_charge", "prediction", "log_probability", "delta_mass_ppm"]
         final_display_cols = []
         for col in display_cols:
             if col in output_df.columns:
@@ -397,7 +431,6 @@ def predict_peptides(input_file, decoding_method):
         output_df_display = output_df.select(final_display_cols)
         # 7. Save full results to CSV
         logger.info(f"Saving results to {output_csv_path}...")
         output_df.write_csv(output_csv_path)
@@ -413,6 +446,7 @@ def predict_peptides(input_file, decoding_method):
         # Re-raise as Gradio error
         raise gr.Error(f"Prediction failed: {e}")
 # --- Gradio Interface ---
 css = """
 .gradio-container { font-family: sans-serif; }
@@ -422,7 +456,9 @@ footer { display: none !important; }
 .logo-container img { margin-bottom: 1rem; }
 """
-with gr.Blocks(css=css, theme=gr.themes.Default(primary_hue="blue", secondary_hue="blue")) as demo:
     # --- Logo Display ---
     gr.Markdown(
         """
@@ -430,7 +466,7 @@ with gr.Blocks(css=css, theme=gr.themes.Default(primary_hue="blue", secondary_hu
           <img src='/gradio_api/file=assets/instanovo.svg' alt="InstaNovo Logo" width="300" style="display: block; margin: 0 auto;">
         </div>
         """,
-        elem_classes="logo-container" # Optional class for CSS targeting
     )
     # --- App Content ---
@@ -445,38 +481,57 @@ with gr.Blocks(css=css, theme=gr.themes.Default(primary_hue="blue", secondary_hu
         with gr.Column(scale=1):
             input_file = gr.File(
                 label="Upload Mass Spectrometry File (.mgf, .mzml, .mzxml)",
-                file_types=[".mgf", ".mzml", ".mzxml"]
             )
             decoding_method = gr.Radio(
-                ["Greedy Search (Fast, resonably accurate)", "Knapsack Beam Search (More accurate, but slower)"],
                 label="Decoding Method",
-                value="Greedy Search (Fast, resonably accurate)" # Default to fast method
             )
             submit_btn = gr.Button("Predict Sequences", variant="primary")
         with gr.Column(scale=2):
-            output_df = gr.DataFrame(label="Prediction Results", headers=["scan_number", "precursor_mz", "precursor_charge", "prediction", "log_probability", "delta_mass_ppm"], wrap=True)
             output_file = gr.File(label="Download Full Results (CSV)")
     submit_btn.click(
         predict_peptides,
         inputs=[input_file, decoding_method],
-        outputs=[output_df, output_file]
     )
     gr.Examples(
-         [["assets/sample_spectra.mgf", "Greedy Search (Fast, resonably accurate)" ],
-         ["assets/sample_spectra.mgf", "Knapsack Beam Search (More accurate, but slower)" ]],
-         inputs=[input_file, decoding_method],
-         outputs=[output_df, output_file],
-         fn=predict_peptides,
-         cache_examples=False, # Re-run examples if needed
-         label="Example Usage"
     )
     gr.Markdown(
-         """
          **Notes:**
-         *   Predictions are based on the [InstaNovo](https://github.com/instadeepai/InstaNovo) model ({MODEL_ID}).
          *   Knapsack Beam Search uses pre-calculated mass constraints and yields better results but takes longer.
          *   `delta_mass_ppm` shows the lowest absolute precursor mass error (in ppm) across potential isotopes (0-1 neutron).
          *   Ensure your input file format is correctly specified. Large files may take time to process.
@@ -487,10 +542,32 @@ with gr.Blocks(css=css, theme=gr.themes.Default(primary_hue="blue", secondary_hu
     with gr.Accordion("Application Logs", open=True):
         log_display = Log(log_file, dark=True, height=300)
 # --- Launch the App ---
 if __name__ == "__main__":
     # Set share=True for temporary public link if running locally
     # Set server_name="0.0.0.0" to allow access from network if needed
     # demo.launch(server_name="0.0.0.0", server_port=7860)
     # For Hugging Face Spaces, just demo.launch() is usually sufficient
-    demo.launch(share=True) # For local testing with public URL

     raise ImportError(f"Failed to import InstaNovo components: {e}")
 # --- Configuration ---
+MODEL_ID = "instanovo-v1.1.0"  # Use the desired pretrained model ID
 KNAPSACK_DIR = Path("./knapsack_cache")
+DEFAULT_CONFIG_PATH = Path(
+    "./configs/inference/default.yaml"
+)  # Assuming instanovo installs configs locally relative to execution
 # Determine device
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+FP16 = DEVICE == "cuda"  # Enable FP16 only on CUDA
 # --- Global Variables (Load Model and Knapsack Once) ---
 MODEL: InstaNovo | None = None
     # --- Knapsack Handling ---
     knapsack_exists = (
+        (KNAPSACK_DIR / "parameters.pkl").exists()
+        and (KNAPSACK_DIR / "masses.npy").exists()
+        and (KNAPSACK_DIR / "chart.npy").exists()
     )
     if knapsack_exists:
     if not knapsack_exists:
         logger.info("Knapsack not found or failed to load. Generating knapsack...")
         if RESIDUE_SET is None:
+            raise gr.Error(
+                "Cannot generate knapsack because ResidueSet failed to load."
+            )
         try:
             # Prepare residue masses for knapsack generation (handle negative/zero masses)
             residue_masses_knapsack = dict(RESIDUE_SET.residue_masses.copy())
+            negative_residues = [
+                k for k, v in residue_masses_knapsack.items() if v <= 0
+            ]
             if negative_residues:
                 logger.info(f"Warning: Non-positive masses found in residues: {negative_residues}. "
                       "Excluding from knapsack generation.")
                     del residue_masses_knapsack[res]
             # Remove special tokens explicitly if they somehow got mass
             for special_token in RESIDUE_SET.special_tokens:
+                if special_token in residue_masses_knapsack:
+                    del residue_masses_knapsack[special_token]
             # Ensure residue indices used match those without special/negative masses
             valid_residue_indices = {
+                res: idx
+                for res, idx in RESIDUE_SET.residue_to_index.items()
                 if res in residue_masses_knapsack
             }
             KNAPSACK = Knapsack.construct_knapsack(
                 residue_masses=residue_masses_knapsack,
+                residue_indices=valid_residue_indices,  # Use only valid indices
                 max_mass=MAX_MASS,
                 mass_scale=MASS_SCALE,
             )
 # Load the model and knapsack when the script starts
 load_model_and_knapsack()
 def create_inference_config(
     input_path: str,
     output_path: str,
     """Creates the OmegaConf DictConfig needed for prediction."""
     # Load default config if available, otherwise create from scratch
     if DEFAULT_CONFIG_PATH.exists():
+        base_cfg = OmegaConf.load(DEFAULT_CONFIG_PATH)
     else:
          logger.info(f"Warning: Default config not found at {DEFAULT_CONFIG_PATH}. Using minimal config.")
          # Create a minimal config if default is missing
         cfg_overrides["use_knapsack"] = False
     elif "Knapsack" in decoding_method:
         if KNAPSACK is None:
+            raise gr.Error(
+                "Knapsack is not available. Cannot use Knapsack Beam Search."
+            )
         cfg_overrides["num_beams"] = 5
         cfg_overrides["use_knapsack"] = True
         cfg_overrides["knapsack_path"] = str(KNAPSACK_DIR)
     Main function to load data, run prediction, and return results.
     """
     if MODEL is None or RESIDUE_SET is None or MODEL_CONFIG is None:
+        load_model_and_knapsack()  # Attempt to reload if None (e.g., after space restart)
+        if MODEL is None:
+            raise gr.Error("InstaNovo model is not loaded. Cannot perform prediction.")
     if input_file is None:
         raise gr.Error("Please upload a mass spectrometry file.")
         try:
             sdf = SpectrumDataFrame.load(
                 config.data_path,
+                lazy=False,  # Load eagerly for Gradio simplicity
+                is_annotated=False,  # De novo mode
                 column_mapping=config.get("column_map", None),
                 shuffle=False,
+                verbose=True,  # Print loading logs
             )
             # Apply charge filter like in CLI
             original_size = len(sdf)
             max_charge = config.get("max_charge", 10)
             sdf.filter_rows(
+                lambda row: (row["precursor_charge"] <= max_charge)
+                and (row["precursor_charge"] > 0)
             )
             if len(sdf) < original_size:
                 logger.info(f"Warning: Filtered {original_size - len(sdf)} spectra with charge > {max_charge} or <= 0.")
             sdf,
             RESIDUE_SET,
             MODEL_CONFIG.get("n_peaks", 200),
+            return_str=True,  # Needed for greedy/beam search targets later (though not used here)
             annotated=False,
+            pad_spectrum_max_length=config.get("compile_model", False)
+            or config.get("use_flash_attention", False),
             bin_spectra=config.get("conv_peak_encoder", False),
         )
         dl = DataLoader(
             ds,
             batch_size=config.batch_size,
+            num_workers=0,  # Required by SpectrumDataFrame
+            shuffle=False,  # Required by SpectrumDataFrame
             collate_fn=collate_batch,
         )
         decoder: Decoder
         if config.use_knapsack:
             if KNAPSACK is None:
+                # This check should ideally be earlier, but double-check
+                raise gr.Error(
+                    "Knapsack is required for Knapsack Beam Search but is not available."
+                )
             # KnapsackBeamSearchDecoder doesn't directly load from path in this version?
             # We load Knapsack globally, so just pass it.
             # If it needed path: decoder = KnapsackBeamSearchDecoder.from_file(model=MODEL, path=config.knapsack_path)
         # 5. Run Prediction Loop (Adapted from instanovo/transformer/predict.py)
         logger.info("Starting prediction...")
         start_time = time.time()
+        results_list: list[
+            ScoredSequence | list
+        ] = []  # Store ScoredSequence or empty list
         for i, batch in enumerate(dl):
+            spectra, precursors, spectra_mask, _, _ = (
+                batch  # Ignore peptides/masks for de novo
+            )
             spectra = spectra.to(DEVICE)
             precursors = precursors.to(DEVICE)
             spectra_mask = spectra_mask.to(DEVICE)
+            with (
+                torch.no_grad(),
+                torch.amp.autocast(DEVICE, dtype=torch.float16, enabled=FP16),
+            ):
                 # Beam search decoder might return list[list[ScoredSequence]] if return_beam=True
                 # Greedy decoder returns list[ScoredSequence]
                 # KnapsackBeamSearchDecoder returns list[ScoredSequence] or list[list[ScoredSequence]]
                     beam_size=config.num_beams,
                     max_length=config.max_length,
                     # Knapsack/Beam Search specific params if needed
+                    mass_tolerance=config.get("filter_precursor_ppm", 20)
+                    * 1e-6,  # Convert ppm to relative
+                    max_isotope=config.isotope_error_range[1]
+                    if config.isotope_error_range
+                    else 1,
+                    return_beam=False,  # Only get the top prediction for simplicity
                 )
             results_list.extend(batch_predictions) # Should be list[ScoredSequence] or list[list]
             logger.info(f"Processed batch {i+1}/{len(dl)}")
         output_data = []
         # Use sdf index columns + prediction results
         index_cols = [col for col in config.index_columns if col in sdf.df.columns]
+        base_df_pd = sdf.df.select(index_cols).to_pandas()  # Get base info
         metrics_calc = Metrics(RESIDUE_SET, config.isotope_error_range)
         for i, res in enumerate(results_list):
+            row_data = base_df_pd.iloc[i].to_dict()  # Get corresponding input data
             if isinstance(res, ScoredSequence) and res.sequence:
                 sequence_str = "".join(res.sequence)
                 row_data["prediction"] = sequence_str
                 row_data["log_probability"] = f"{res.sequence_log_probability:.4f}"
                 # Use metrics to calculate delta mass ppm for the top prediction
                 try:
+                    _, delta_mass_list = metrics_calc.matches_precursor(
+                        res.sequence,
+                        row_data["precursor_mz"],
+                        row_data["precursor_charge"],
+                    )
+                    # Find the smallest absolute ppm error across isotopes
+                    min_abs_ppm = (
+                        min(abs(p) for p in delta_mass_list)
+                        if delta_mass_list
+                        else float("nan")
+                    )
+                    row_data["delta_mass_ppm"] = f"{min_abs_ppm:.2f}"
                 except Exception as e:
                      logger.info(f"Warning: Could not calculate delta mass for prediction {i}: {e}")
                      row_data["delta_mass_ppm"] = "N/A"
         output_df = pl.DataFrame(output_data)
         # Ensure specific columns are present and ordered
+        display_cols = [
+            "scan_number",
+            "precursor_mz",
+            "precursor_charge",
+            "prediction",
+            "log_probability",
+            "delta_mass_ppm",
+        ]
         final_display_cols = []
         for col in display_cols:
             if col in output_df.columns:
         output_df_display = output_df.select(final_display_cols)
         # 7. Save full results to CSV
         logger.info(f"Saving results to {output_csv_path}...")
         output_df.write_csv(output_csv_path)
         # Re-raise as Gradio error
         raise gr.Error(f"Prediction failed: {e}")
 # --- Gradio Interface ---
 css = """
 .gradio-container { font-family: sans-serif; }
 .logo-container img { margin-bottom: 1rem; }
 """
+with gr.Blocks(
+    css=css, theme=gr.themes.Default(primary_hue="blue", secondary_hue="blue")
+) as demo:
     # --- Logo Display ---
     gr.Markdown(
         """
           <img src='/gradio_api/file=assets/instanovo.svg' alt="InstaNovo Logo" width="300" style="display: block; margin: 0 auto;">
         </div>
         """,
+        elem_classes="logo-container",  # Optional class for CSS targeting
     )
     # --- App Content ---
         with gr.Column(scale=1):
             input_file = gr.File(
                 label="Upload Mass Spectrometry File (.mgf, .mzml, .mzxml)",
+                file_types=[".mgf", ".mzml", ".mzxml"],
             )
             decoding_method = gr.Radio(
+                [
+                    "Greedy Search (Fast, resonably accurate)",
+                    "Knapsack Beam Search (More accurate, but slower)",
+                ],
                 label="Decoding Method",
+                value="Greedy Search (Fast, resonably accurate)",  # Default to fast method
             )
             submit_btn = gr.Button("Predict Sequences", variant="primary")
         with gr.Column(scale=2):
+            output_df = gr.DataFrame(
+                label="Prediction Results",
+                headers=[
+                    "scan_number",
+                    "precursor_mz",
+                    "precursor_charge",
+                    "prediction",
+                    "log_probability",
+                    "delta_mass_ppm",
+                ],
+                wrap=True,
+            )
             output_file = gr.File(label="Download Full Results (CSV)")
     submit_btn.click(
         predict_peptides,
         inputs=[input_file, decoding_method],
+        outputs=[output_df, output_file],
     )
     gr.Examples(
+        [
+            ["assets/sample_spectra.mgf", "Greedy Search (Fast, resonably accurate)"],
+            [
+                "assets/sample_spectra.mgf",
+                "Knapsack Beam Search (More accurate, but slower)",
+            ],
+        ],
+        inputs=[input_file, decoding_method],
+        outputs=[output_df, output_file],
+        fn=predict_peptides,
+        cache_examples=False,  # Re-run examples if needed
+        label="Example Usage",
     )
     gr.Markdown(
+        """
          **Notes:**
+         *   Predictions are based on the [InstaNovo](https://github.com/instadeepai/InstaNovo) model `{MODEL_ID}`.
          *   Knapsack Beam Search uses pre-calculated mass constraints and yields better results but takes longer.
          *   `delta_mass_ppm` shows the lowest absolute precursor mass error (in ppm) across potential isotopes (0-1 neutron).
          *   Ensure your input file format is correctly specified. Large files may take time to process.
     with gr.Accordion("Application Logs", open=True):
         log_display = Log(log_file, dark=True, height=300)
+    gr.Textbox(
+        value="""
+@article{eloff_kalogeropoulos_2025_instanovo,
+	title        = {InstaNovo enables diffusion-powered de novo peptide sequencing in large-scale proteomics experiments},
+	author       = {Kevin Eloff and Konstantinos Kalogeropoulos and Amandla Mabona and Oliver Morell and Rachel Catzel and
+                    Esperanza Rivera-de-Torre and Jakob Berg Jespersen and Wesley Williams and Sam P. B. van Beljouw and
+                    Marcin J. Skwark and Andreas Hougaard Laustsen and Stan J. J. Brouns and Anne Ljungars and Erwin M.
+                    Schoof and Jeroen Van Goey and Ulrich auf dem Keller and Karim Beguir and Nicolas Lopez Carranza and
+                    Timothy P. Jenkins},
+	year         = 2025,
+	month        = {Mar},
+	day          = 31,
+	journal      = {Nature Machine Intelligence},
+	doi          = {10.1038/s42256-025-01019-5},
+	url          = {https://www.nature.com/articles/s42256-025-01019-5}
+}
+""",
+        show_copy_button=True,
+        label="If you use InstaNovo in your research, please cite:",
+        interactive=False,
+    )
 # --- Launch the App ---
 if __name__ == "__main__":
     # Set share=True for temporary public link if running locally
     # Set server_name="0.0.0.0" to allow access from network if needed
     # demo.launch(server_name="0.0.0.0", server_port=7860)
     # For Hugging Face Spaces, just demo.launch() is usually sufficient
+    demo.launch(share=True)  # For local testing with public URL