Christina Theodoris committed on
Commit
3072225
1 Parent(s): 98ce6d7

Add explanation of output columns and sort by largest shift

Browse files
geneformer/in_silico_perturber_stats.py CHANGED
@@ -142,12 +142,12 @@ def isp_stats_to_goal_state(cos_sims_df, dict_list, cell_states_to_model):
142
  names=["Gene",
143
  "Gene_name",
144
  "Ensembl_ID",
145
- "Shift_from_goal_end",
146
- "Shift_from_alt_end",
147
  "Goal_end_vs_random_pval",
148
  "Alt_end_vs_random_pval"]
149
  if alt_end_state_exists == False:
150
- names.remove("Shift_from_alt_end")
151
  names.remove("Alt_end_vs_random_pval")
152
  cos_sims_full_df = pd.DataFrame(columns=names)
153
 
@@ -197,8 +197,9 @@ def isp_stats_to_goal_state(cos_sims_df, dict_list, cell_states_to_model):
197
  cos_sims_full_df["N_Detections"] = [n_detections(i, dict_list, "cell", None) for i in cos_sims_full_df["Gene"]]
198
 
199
  # sort by shift to desired state
200
- cos_sims_full_df = cos_sims_full_df.sort_values(by=["Shift_from_goal_end",
201
- "Goal_end_FDR"])
 
202
 
203
  return cos_sims_full_df
204
 
@@ -208,9 +209,9 @@ def isp_stats_vs_null(cos_sims_df, dict_list, null_dict_list):
208
 
209
  cos_sims_full_df["Test_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
210
  cos_sims_full_df["Null_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
211
- cos_sims_full_df["Test_v_null_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
212
- cos_sims_full_df["Test_v_null_pval"] = np.zeros(cos_sims_df.shape[0], dtype=float)
213
- cos_sims_full_df["Test_v_null_FDR"] = np.zeros(cos_sims_df.shape[0], dtype=float)
214
  cos_sims_full_df["N_Detections_test"] = np.zeros(cos_sims_df.shape[0], dtype="uint32")
215
  cos_sims_full_df["N_Detections_null"] = np.zeros(cos_sims_df.shape[0], dtype="uint32")
216
 
@@ -227,17 +228,18 @@ def isp_stats_vs_null(cos_sims_df, dict_list, null_dict_list):
227
 
228
  cos_sims_full_df.loc[i, "Test_avg_shift"] = np.mean(test_shifts)
229
  cos_sims_full_df.loc[i, "Null_avg_shift"] = np.mean(null_shifts)
230
- cos_sims_full_df.loc[i, "Test_v_null_avg_shift"] = np.mean(test_shifts)-np.mean(null_shifts)
231
- cos_sims_full_df.loc[i, "Test_v_null_pval"] = ranksums(test_shifts,
232
  null_shifts, nan_policy="omit").pvalue
233
 
234
  cos_sims_full_df.loc[i, "N_Detections_test"] = len(test_shifts)
235
  cos_sims_full_df.loc[i, "N_Detections_null"] = len(null_shifts)
236
 
237
- cos_sims_full_df["Test_v_null_FDR"] = get_fdr(cos_sims_full_df["Test_v_null_pval"])
238
 
239
- cos_sims_full_df = cos_sims_full_df.sort_values(by=["Test_v_null_avg_shift",
240
- "Test_v_null_FDR"])
 
241
  return cos_sims_full_df
242
 
243
  # stats for identifying perturbations with largest effect within a given set of cells
@@ -498,6 +500,46 @@ class InSilicoPerturberStats:
498
  Path to directory where perturbation data will be saved as .csv
499
  output_prefix : str
500
  Prefix for output .dataset
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
501
  """
502
 
503
  if self.mode not in ["goal_state_shift", "vs_null", "mixture_model"]:
 
142
  names=["Gene",
143
  "Gene_name",
144
  "Ensembl_ID",
145
+ "Shift_to_goal_end",
146
+ "Shift_to_alt_end",
147
  "Goal_end_vs_random_pval",
148
  "Alt_end_vs_random_pval"]
149
  if alt_end_state_exists == False:
150
+ names.remove("Shift_to_alt_end")
151
  names.remove("Alt_end_vs_random_pval")
152
  cos_sims_full_df = pd.DataFrame(columns=names)
153
 
 
197
  cos_sims_full_df["N_Detections"] = [n_detections(i, dict_list, "cell", None) for i in cos_sims_full_df["Gene"]]
198
 
199
  # sort by shift to desired state
200
+ cos_sims_full_df = cos_sims_full_df.sort_values(by=["Shift_to_goal_end",
201
+ "Goal_end_FDR"],
202
+ ascending=[False,True])
203
 
204
  return cos_sims_full_df
205
 
 
209
 
210
  cos_sims_full_df["Test_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
211
  cos_sims_full_df["Null_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
212
+ cos_sims_full_df["Test_vs_null_avg_shift"] = np.zeros(cos_sims_df.shape[0], dtype=float)
213
+ cos_sims_full_df["Test_vs_null_pval"] = np.zeros(cos_sims_df.shape[0], dtype=float)
214
+ cos_sims_full_df["Test_vs_null_FDR"] = np.zeros(cos_sims_df.shape[0], dtype=float)
215
  cos_sims_full_df["N_Detections_test"] = np.zeros(cos_sims_df.shape[0], dtype="uint32")
216
  cos_sims_full_df["N_Detections_null"] = np.zeros(cos_sims_df.shape[0], dtype="uint32")
217
 
 
228
 
229
  cos_sims_full_df.loc[i, "Test_avg_shift"] = np.mean(test_shifts)
230
  cos_sims_full_df.loc[i, "Null_avg_shift"] = np.mean(null_shifts)
231
+ cos_sims_full_df.loc[i, "Test_vs_null_avg_shift"] = np.mean(test_shifts)-np.mean(null_shifts)
232
+ cos_sims_full_df.loc[i, "Test_vs_null_pval"] = ranksums(test_shifts,
233
  null_shifts, nan_policy="omit").pvalue
234
 
235
  cos_sims_full_df.loc[i, "N_Detections_test"] = len(test_shifts)
236
  cos_sims_full_df.loc[i, "N_Detections_null"] = len(null_shifts)
237
 
238
+ cos_sims_full_df["Test_vs_null_FDR"] = get_fdr(cos_sims_full_df["Test_vs_null_pval"])
239
 
240
+ cos_sims_full_df = cos_sims_full_df.sort_values(by=["Test_vs_null_avg_shift",
241
+ "Test_vs_null_FDR"],
242
+ ascending=[False,True])
243
  return cos_sims_full_df
244
 
245
  # stats for identifying perturbations with largest effect within a given set of cells
 
500
  Path to directory where perturbation data will be saved as .csv
501
  output_prefix : str
502
  Prefix for output .csv
503
+
504
+ Outputs
505
+ ----------
506
+ Definition of possible columns in .csv output file.
507
+
508
+ Of note, not all columns will be present in all output files.
509
+ Some columns are specific to particular perturbation modes.
510
+
511
+ "Gene": gene token
512
+ "Gene_name": gene name
513
+ "Ensembl_ID": gene Ensembl ID
514
+ "N_Detections": number of cells in which each gene or gene combination was detected in the input dataset
515
+
516
+ "Shift_to_goal_end": cosine shift from start state towards goal end state in response to given perturbation
517
+ "Shift_to_alt_end": cosine shift from start state towards alternate end state in response to given perturbation
518
+ "Goal_end_vs_random_pval": pvalue of cosine shift from start state towards goal end state by Wilcoxon
519
+ pvalue compares shift caused by perturbing given gene to shift caused by perturbing random genes
520
+ "Alt_end_vs_random_pval": pvalue of cosine shift from start state towards alternate end state by Wilcoxon
521
+ pvalue compares shift caused by perturbing given gene to shift caused by perturbing random genes
522
+ "Goal_end_FDR": Benjamini-Hochberg correction of "Goal_end_vs_random_pval"
523
+ "Alt_end_FDR": Benjamini-Hochberg correction of "Alt_end_vs_random_pval"
524
+
525
+ "Test_avg_shift": cosine shift in response to given perturbation in cells from test distribution
526
+ "Null_avg_shift": cosine shift in response to given perturbation in cells from null distribution (e.g. random cells)
527
+ "Test_vs_null_avg_shift": difference in cosine shift in cells from test vs. null distribution
528
+ (i.e. "Test_avg_shift" minus "Null_avg_shift")
529
+ "Test_vs_null_pval": pvalue of cosine shift in test vs. null distribution
530
+ "Test_vs_null_FDR": Benjamini-Hochberg correction of "Test_vs_null_pval"
531
+ "N_Detections_test": "N_Detections" in cells from test distribution
532
+ "N_Detections_null": "N_Detections" in cells from null distribution
533
+
534
+ "Anchor_shift": cosine shift in response to given perturbation of anchor gene
535
+ "Test_token_shift": cosine shift in response to given perturbation of test gene
536
+ "Sum_of_indiv_shifts": sum of cosine shifts in response to individually perturbing test and anchor genes
537
+ "Combo_shift": cosine shift in response to given perturbation of both anchor and test gene(s) in combination
538
+ "Combo_minus_sum_shift": difference of cosine shifts in response to combo perturbation vs. sum of individual perturbations
539
+ (i.e. "Combo_shift" minus "Sum_of_indiv_shifts")
540
+ "Impact_component": whether the given perturbation was modeled to be within the impact component by the mixture model
541
+ 1: within impact component; 0: not within impact component
542
+ "Impact_component_percent": percent of cells in which given perturbation was modeled to be within impact component
543
  """
544
 
545
  if self.mode not in ["goal_state_shift", "vs_null", "mixture_model"]: