Spaces:
Sleeping
Sleeping
File size: 3,320 Bytes
7c46397 a1af661 7c46397 a1af661 7c46397 a1af661 7c46397 a1af661 7c46397 a1af661 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
# -*- coding: utf-8 -*-
import gradio as gr
import pandas as pd
import os
import subprocess
def predict_top_100_genes(disease_id):
    """Return the top 100 predicted genes for a disease as a DataFrame.

    Results are cached on disk: if ``data/downstream/<id>_top100.csv``
    already exists it is read directly. Otherwise a model input CSV is built
    from ``data/pretrain/disgenet_latest.csv`` and ``model.sh`` is invoked
    with the input and output paths; the output file is then read back
    (the script is presumably what writes it -- confirm against model.sh).

    Args:
        disease_id: Disease identifier typed by the user (the UI describes
            it as a UMLS CUI such as ``C0002395``). Surrounding whitespace
            is ignored.

    Returns:
        A DataFrame with columns ``Gene`` and ``Score`` holding at most the
        first 100 rows of the model output.

    Raises:
        gr.Error: If the ID is empty, contains a path separator, or is not
            present in the ``diseaseId`` column of the database.
        subprocess.CalledProcessError: If ``model.sh`` exits non-zero.
    """
    # The ID comes straight from a web text box and is embedded in file
    # names below, so normalise it and refuse anything that could escape
    # the data directory.
    disease_id = (disease_id or '').strip()
    if not disease_id:
        raise gr.Error("Error: Please enter a Disease ID.")
    if '/' in disease_id or '\\' in disease_id:
        raise gr.Error(
            f"Error: Disease ID '{disease_id}' is not valid. "
            "Please check the ID and try again."
        )

    input_csv_path = f'data/downstream/{disease_id}_disease.csv'
    output_csv_path = f'data/downstream/{disease_id}_top100.csv'

    # Only build the model input and run the model when no cached
    # prediction exists for this disease.
    if not os.path.exists(output_csv_path):
        df = pd.read_csv('data/pretrain/disgenet_latest.csv')
        df = df[df['proteinSeq'].notna()]

        if disease_id not in df['diseaseId'].values:
            # Raise instead of returning a string: the Gradio output
            # component is a Dataframe and cannot display a bare message.
            raise gr.Error(
                f"Error: Disease ID '{disease_id}' not found in the database. "
                "Please check the ID and try again."
            )

        desired_disease_des = df.loc[
            df['diseaseId'] == disease_id, 'diseaseDes'
        ].iloc[0]
        related_proteins = df.loc[
            df['diseaseDes'] == desired_disease_des, 'proteinSeq'
        ].unique()

        # Label every protein: 1 if it is associated with this disease
        # description, 0 otherwise. Kept as a separate Series so the
        # filtered frame is not mutated.
        score = df['proteinSeq'].isin(related_proteins).astype(int)

        new_df = pd.DataFrame({
            'diseaseId': disease_id,
            'diseaseDes': desired_disease_des,
            'geneSymbol': df['geneSymbol'],
            'proteinSeq': df['proteinSeq'],
            'score': score,
        }).drop_duplicates().reset_index(drop=True)

        # Make sure the target directory exists before writing into it.
        os.makedirs(os.path.dirname(input_csv_path), exist_ok=True)
        new_df.to_csv(input_csv_path, index=False)

        # Arguments are passed as a list (no shell), so the ID is never
        # interpreted by a shell. check=True surfaces a failing script.
        script_path = 'model.sh'
        subprocess.run(
            ['bash', script_path, input_csv_path, output_csv_path],
            check=True,
        )

    # Read the freshly produced (or previously cached) model output and
    # expose only the two columns shown in the UI, renamed for display.
    output_df = pd.read_csv(output_csv_path)
    result_df = (
        output_df[['geneSymbol', 'Prediction_score']]
        .rename(columns={'geneSymbol': 'Gene', 'Prediction_score': 'Score'})
        .head(100)
    )
    return result_df
# Input widget: a single-line text box where the user types the disease ID.
_disease_id_box = gr.Textbox(
    lines=1,
    placeholder="Enter Disease ID Here...",
    label="Disease ID",
)

# Output widget: a table fed with the value returned by predict_top_100_genes.
_genes_table = gr.Dataframe(label="Predicted Top 100 Related Genes")

# Description text for the interface (contains Markdown markup). The pieces
# are joined verbatim, in order, with nothing inserted between them.
_description = "".join((
    "This AI model predicts the top 100 genes associated with a given disease based on 16,733 genes.",
    " To get started, you need a Disease ID (UMLS CUI), which can be obtained from the DisGeNET database. ",
    "\n\n**Steps to Obtain a Disease ID from DisGeNET:**\n",
    "1. Visit the DisGeNET website: [https://www.disgenet.org/search](https://www.disgenet.org/search).\n",
    "2. Use the search bar to enter your disease of interest. For instance, if you're interested in 'Alzheimer's Disease', type 'Alzheimer's Disease' into the search bar.\n",
    "3. From the search results, identify the disease you're researching. The Disease ID (UMLS CUI) is listed alongside each disease name, e.g. C0002395.\n",
    "4. Enter the Disease ID into the input box below and submit.\n\n",
    "The DisGeNET database contains all known gene-disease associations and associated evidence. In addition, it is able to find the corresponding diseases based on a gene.\n",
    "\n**The model will take about 18 minutes to inference a new disease.**\n",
))

# Wire the prediction function to the widgets above.
iface = gr.Interface(
    fn=predict_top_100_genes,
    inputs=_disease_id_box,
    outputs=_genes_table,
    title="Gene Disease Association Prediction",
    description=_description,
)

# Start the app; share=True asks Gradio for a publicly shareable link
# (see Gradio's launch() documentation).
iface.launch(share=True)
|