File size: 3,320 Bytes
7c46397
 
 
 
 
 
 
 
a1af661
 
7c46397
 
 
 
a1af661
7c46397
a1af661
 
 
 
 
7c46397
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a1af661
7c46397
 
a1af661
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# -*- coding: utf-8 -*-
import gradio as gr
import pandas as pd
import os
import subprocess

def predict_top_100_genes(disease_id):
    """Return the first 100 predicted genes for a disease ID.

    Model results are cached on disk as
    ``data/downstream/<disease_id>_top100.csv``. When that file is missing,
    a model input table is built from ``data/pretrain/disgenet_latest.csv``,
    written to ``data/downstream/<disease_id>_disease.csv``, and ``model.sh``
    is run with the input and output paths as its two arguments.

    Args:
        disease_id: Disease identifier as typed by the user. Surrounding
            whitespace is ignored.

    Returns:
        A DataFrame with columns ``Gene`` and ``Score`` holding the first
        100 rows of the model output, or an error message string when the
        ID is empty, contains path characters, or is absent from the
        database.

    Raises:
        subprocess.CalledProcessError: If ``model.sh`` exits with a non-zero
            status.
    """
    # The value comes straight from a free-text box: tolerate None and
    # accidental whitespace around a pasted ID.
    disease_id = (disease_id or '').strip()

    # NOTE(review): the Gradio output component is a Dataframe; confirm that
    # a plain string is displayed as intended, or consider raising gr.Error.
    not_found_message = f"Error: Disease ID '{disease_id}' not found in the database. Please check the ID and try again."

    # The ID is interpolated into file paths below, so refuse anything that
    # could point outside data/downstream before touching the filesystem.
    if not disease_id or any(ch in disease_id for ch in ('/', '\\', '\0')):
        return not_found_message

    input_csv_path = 'data/downstream/{}_disease.csv'.format(disease_id)
    output_csv_path = 'data/downstream/{}_top100.csv'.format(disease_id)

    # Only run the model when no cached output exists for this disease.
    if not os.path.exists(output_csv_path):
        df = pd.read_csv('data/pretrain/disgenet_latest.csv')
        df = df[df['proteinSeq'].notna()]

        if disease_id not in df['diseaseId'].values:
            return not_found_message

        desired_diseaseDes = df.loc[df['diseaseId'] == disease_id, 'diseaseDes'].iloc[0]
        related_proteins = df.loc[df['diseaseDes'] == desired_diseaseDes, 'proteinSeq'].unique()

        # Label every row: 1 if its protein is linked to the requested
        # disease description, else 0. Kept as a separate Series so the
        # filtered frame is never assigned into (avoids
        # SettingWithCopyWarning).
        score = df['proteinSeq'].isin(related_proteins).astype(int)

        new_df = pd.DataFrame({
            'diseaseId': disease_id,
            'diseaseDes': desired_diseaseDes,
            'geneSymbol': df['geneSymbol'],
            'proteinSeq': df['proteinSeq'],
            'score': score,
        }).drop_duplicates().reset_index(drop=True)

        os.makedirs(os.path.dirname(input_csv_path), exist_ok=True)
        new_df.to_csv(input_csv_path, index=False)

        # Argument list (no shell string), so the ID is never shell-parsed.
        script_path = 'model.sh'
        subprocess.run(['bash', script_path, input_csv_path, output_csv_path], check=True)

    # Read the freshly produced or previously cached model output.
    output_df = pd.read_csv(output_csv_path)
    result_df = (
        output_df[['geneSymbol', 'Prediction_score']]
        .rename(columns={'geneSymbol': 'Gene', 'Prediction_score': 'Score'})
        .head(100)
    )

    return result_df

# Markdown fragments rendered under the page title. They are concatenated
# with no separator, so each fragment carries its own spacing and newlines.
_description_parts = (
    "This AI model predicts the top 100 genes associated with a given disease based on 16,733 genes.",
    " To get started, you need a Disease ID (UMLS CUI), which can be obtained from the DisGeNET database. ",
    "\n\n**Steps to Obtain a Disease ID from DisGeNET:**\n",
    "1. Visit the DisGeNET website: [https://www.disgenet.org/search](https://www.disgenet.org/search).\n",
    "2. Use the search bar to enter your disease of interest. For instance, if you're interested in 'Alzheimer's Disease', type 'Alzheimer's Disease' into the search bar.\n",
    "3. From the search results, identify the disease you're researching. The Disease ID (UMLS CUI) is listed alongside each disease name, e.g. C0002395.\n",
    "4. Enter the Disease ID into the input box below and submit.\n\n",
    "The DisGeNET database contains all known gene-disease associations and associated evidence. In addition, it is able to find the corresponding diseases based on a gene.\n",
    "\n**The model will take about 18 minutes to inference a new disease.**\n",
)

# Single-line text field for the disease identifier.
_disease_id_box = gr.Textbox(
    lines=1,
    placeholder="Enter Disease ID Here...",
    label="Disease ID",
)

# Table component that displays whatever the prediction function returns.
_genes_table = gr.Dataframe(label="Predicted Top 100 Related Genes")

# Wire the prediction function to its input and output components.
iface = gr.Interface(
    fn=predict_top_100_genes,
    inputs=_disease_id_box,
    outputs=_genes_table,
    title="Gene Disease Association Prediction",
    description="".join(_description_parts),
)

# Start the web app as soon as the module runs; share=True requests a
# shareable link from Gradio.
iface.launch(share=True)