Hack90 committed on
Commit
d1f4671
·
verified ·
1 Parent(s): d9d2b55

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +114 -0
utils.py CHANGED
@@ -300,6 +300,120 @@ def wens_method_heatmap(df, virus_species):
300
 
301
 
302
  return fig
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
303
 
304
 
305
  ############################################################# ColorSquare ########################################################
 
300
 
301
 
302
  return fig
303
+ ############################################################# Sub-Species ########################################################
304
+ import numpy as np
305
+ from scipy.interpolate import interp1d, CubicSpline
306
+ import pandas as pd
307
+ from tqdm import tqdm
308
+
309
# Define constants

# NOTE(review): MIN_DISTANCE is not referenced by any code visible in this
# section; its meaning and units cannot be determined from here - confirm
# against its callers before changing or removing it.
MIN_DISTANCE = 2581

# Per-nucleotide 2-D step vectors, stored as [x, y].
# 0.8660254 ~= sqrt(3)/2, so every vector has unit length: A/T point 60 degrees
# below/above the x axis and G/C point 30 degrees below/above it.
# Every x component is positive, so the cumulative sum of any sequence of
# these steps has a strictly increasing x coordinate.
# Only upper-case A/T/G/C are keys; any other character has no vector.
VECTORS = {
    'A': [0.5, -0.8660254],
    'T': [0.5, 0.8660254],
    'G': [0.8660254, -0.5],
    'C': [0.8660254, 0.5]
}
317
+
318
def create_dna_representation_ew_subs(seq, n_points=2048):
    """Create a fixed-size 2D walk representation of a DNA sequence.

    Each base found in ``VECTORS`` is mapped to its 2-D step vector, the
    steps are accumulated into a walk, and the walk's x and y coordinates
    are each resampled with a cubic spline over the base index.

    Args:
        seq: Iterable of single-character bases. Characters that are not
            keys of ``VECTORS`` (including lower-case letters) are dropped.
        n_points: Number of evenly spaced samples to return. Defaults to
            2048, the value that was previously hard-coded.

    Returns:
        A list of ``n_points`` ``[x, y]`` pairs (plain Python floats).

    Raises:
        ValueError: If fewer than two valid bases remain after cleaning;
            a spline cannot be fitted through fewer than two points.
    """
    # Convert the cleaned sequence straight to its step vectors.
    steps = np.array(
        [VECTORS[base] for base in seq if base in VECTORS], dtype=float
    )

    # Without this guard an empty sequence yields a 1-D array and the column
    # indexing below fails with an unhelpful IndexError.
    if len(steps) < 2:
        raise ValueError(
            "Sequence must contain at least 2 valid bases (A, T, G, C) "
            f"to build a 2D representation; found {len(steps)}."
        )

    # Cumulative sum turns the per-base steps into walk coordinates.
    walk = steps.cumsum(axis=0)

    # Fit one spline per coordinate, parameterised by base index.
    base_index = np.arange(len(walk))
    spline_x = CubicSpline(base_index, walk[:, 0])
    spline_y = CubicSpline(base_index, walk[:, 1])

    # Resample the whole walk at n_points evenly spaced positions.
    sample_at = np.linspace(0, len(walk) - 1, n_points)
    return np.column_stack([spline_x(sample_at), spline_y(sample_at)]).tolist()
337
+
338
def create_dna_representation_for_subs(row):
    """Create a fixed-length 1D profile of a DNA sequence's 2D walk.

    The sequence is cleaned, truncated to ``min_distance`` bases and turned
    into a cumulative 2-D walk. The walk's y coordinate is then expressed as
    a function of its x coordinate using cubic interpolation and sampled at
    x = 0, 1, ..., n - 1 where ``n = int(int(min_distance) * 0.66)``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing ``'seq'``
            (the base characters) and ``'min_distance'`` (the number of
            cleaned bases to keep).

    Returns:
        A 1-D NumPy array with ``int(int(row['min_distance']) * 0.66)``
        values.

    Raises:
        ValueError: If fewer than four valid bases remain; cubic
            interpolation needs at least four points.
    """
    max_bases = int(row['min_distance'])
    bases = ''.join(char for char in row['seq'] if char in VECTORS)[:max_bases]

    # Fail with a clear message instead of an IndexError (no bases) or an
    # interpolation error (too few bases) further down.
    if len(bases) < 4:
        raise ValueError(
            "Sequence must contain at least 4 valid bases (A, T, G, C) "
            f"for cubic interpolation; found {len(bases)}."
        )

    n_samples = int(max_bases * 0.66)

    # Convert sequence to numerical representation
    steps = np.array([VECTORS[char] for char in bases], dtype=float)

    # Calculate cumulative sum
    walk = steps.cumsum(axis=0)

    # Perform cubic interpolation of y over x. Every step has a positive x
    # component, so the walk's x values are strictly increasing and valid as
    # interpolation abscissae. Sample points outside the walk's x range
    # (before its first point, or past its last one when x grows slowly) are
    # extrapolated rather than rejected.
    interpolate_y = interp1d(
        walk[:, 0], walk[:, 1], kind='cubic', fill_value='extrapolate'
    )
    sample_x = np.linspace(0, n_samples - 1, n_samples)
    return interpolate_y(sample_x)
354
+
355
def create_groups_subs(closest_matches):
    """Create groups of mutually reachable items from closest-match lists.

    Items are visited in index order. From each not-yet-visited item, every
    item reachable by following ``closest_matches`` links through unvisited
    items is collected into one group.

    The traversal uses an explicit stack instead of recursion so that very
    large groups cannot exceed the interpreter's recursion limit.

    Args:
        closest_matches: Sequence where ``closest_matches[i]`` is an iterable
            of the indices considered close to item ``i``.

    Returns:
        Dict mapping ``"group_1"``, ``"group_2"``, ... (numbered in order of
        discovery) to the sorted list of member indices. Items that end up
        alone are not included in any group.
    """
    groups = {}
    visited = set()

    for start in range(len(closest_matches)):
        if start in visited:
            continue

        # Iterative depth-first traversal from `start`.
        visited.add(start)
        members = {start}
        pending = [start]
        while pending:
            node = pending.pop()
            for neighbor in closest_matches[node]:
                if neighbor not in visited:
                    visited.add(neighbor)
                    members.add(neighbor)
                    pending.append(neighbor)

        if len(members) > 1:  # Ignore elements with no closest match
            groups[f"group_{len(groups) + 1}"] = sorted(members)

    return groups
376
+
377
def process_data_sub_specie(df, species):
    """Group the sequences of one species by similarity of their DNA walks.

    Steps:
      1. Keep rows whose ``organism_name`` equals ``species``.
      2. Compute a length cutoff of 0.8 * median ``seq_len``, store it in
         ``min_distance`` and drop rows that are not longer than it.
      3. Build a fixed-length 1-D profile per row and compute the pairwise
         sum of absolute differences between profiles.
      4. Link rows whose distance is below 3 * cutoff and collect linked
         rows into groups.
      5. Replace the ``two_d`` column with the 2-D walk of each sequence.

    Args:
        df: DataFrame with ``organism_name``, ``seq_len`` and ``seq`` columns.
        species: Value of ``organism_name`` to select.

    Returns:
        A new DataFrame (index reset) with added columns ``min_distance``,
        ``two_d`` (list of ``[x, y]`` pairs) and ``group`` (a group name, or
        ``'No Group'`` for rows that are not close to any other row). If no
        row survives filtering, an empty frame with those columns is returned.
    """
    # Filter data for the given species
    df_plot = df[df['organism_name'] == species].reset_index(drop=True).copy()

    # Calculate the length cutoff (80% of the median) and filter sequences
    length_cutoff = df_plot['seq_len'].median() * 0.8
    df_plot['min_distance'] = length_cutoff
    df_plot = df_plot[df_plot['seq_len'] > length_cutoff].reset_index(drop=True)

    # Nothing to compare: return the expected columns instead of failing in
    # the row-wise apply / distance-matrix code below.
    if df_plot.empty:
        df_plot['two_d'] = None
        df_plot['group'] = 'No Group'
        return df_plot

    # Create the 1-D profiles used for comparison. They are stored in 'two_d'
    # temporarily; the column is overwritten with the 2-D walk at the end.
    df_plot['two_d'] = df_plot.apply(create_dna_representation_for_subs, axis=1)
    profiles = np.array(df_plot['two_d'].tolist())

    # Pairwise distance: sum of absolute differences between profiles,
    # computed one row at a time to bound memory use.
    n_rows = profiles.shape[0]
    distances = np.array([
        np.abs(profiles[i:i + 1, :] - profiles).sum(axis=1)
        for i in tqdm(range(n_rows))
    ])

    # A row must never count as its own neighbour. Infinity stays above the
    # threshold for any cutoff, unlike a fixed finite sentinel.
    np.fill_diagonal(distances, np.inf)
    threshold = length_cutoff * 3
    neighbours = [np.where(row < threshold)[0] for row in distances]

    # Create groups
    groups = create_groups_subs(neighbours)

    # Add group information to dataframe
    df_plot['group'] = 'No Group'
    for group_name, group_indices in groups.items():
        df_plot.loc[group_indices, 'group'] = group_name

    # Create 2D representations
    df_plot['two_d'] = df_plot['seq'].apply(create_dna_representation_ew_subs)

    return df_plot
417
 
418
 
419
  ############################################################# ColorSquare ########################################################