Spaces:

itskdhere
/

quickaid

Runtime error

File size: 4,974 Bytes

aec98fe

import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sentence_transformers import SentenceTransformer, util
import json
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from symspellpy import SymSpell, Verbosity

# All inference in this module runs on the CPU: the checkpoint is mapped onto
# this device when loaded and input tensors are moved to it before prediction.
device = torch.device("cpu")

class DiseaseClassifier(nn.Module):
    """Feed-forward network that maps a feature vector to per-class logits.

    Three hidden ``nn.Linear`` layers, each followed by ``LeakyReLU`` and a
    shared ``nn.Dropout``, feed a final linear layer that returns raw logits
    (no softmax is applied).  The layers are always registered as ``fc1`` ..
    ``fc4`` so the parameter names in a saved state dict do not depend on the
    chosen widths.

    Args:
        input_size: Number of input features per sample.
        num_classes: Number of logits produced per sample.
        dropout_rate: Drop probability handed to ``nn.Dropout``.
        hidden_sizes: Output widths of the three hidden layers, in order.
            The default reproduces the original fixed architecture.

    Raises:
        ValueError: If ``hidden_sizes`` does not hold exactly three widths.
    """

    def __init__(self, input_size, num_classes, dropout_rate=0.35665610394511454,
                 hidden_sizes=(382, 389, 433)):
        super().__init__()
        widths = tuple(hidden_sizes)
        if len(widths) != 3:
            raise ValueError(
                f"hidden_sizes must contain exactly 3 widths, got {len(widths)}"
            )
        first, second, third = widths

        # Creation order (fc1 -> fc4) is kept so seeded initialisation and
        # state-dict key order match the original layout.
        self.fc1 = nn.Linear(input_size, first)
        self.fc2 = nn.Linear(first, second)
        self.fc3 = nn.Linear(second, third)
        self.fc4 = nn.Linear(third, num_classes)
        self.activation = nn.LeakyReLU()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Return the logits for ``x``; dropout is only active in train mode."""
        for hidden in (self.fc1, self.fc2, self.fc3):
            x = self.dropout(self.activation(hidden(x)))
        return self.fc4(x)  # Logits


class DiseasePredictionModel:
    """Pipeline that turns free text into a predicted disease label.

    The instance bundles everything the individual steps need:

    * a SymSpell dictionary for spelling correction (``correct_text``),
    * a token-classification pipeline whose ``DISEASE`` entities are treated
      as candidate symptoms (``extract_symptoms``),
    * a sentence-embedding model that maps each candidate onto the closest
      entry of the known symptom list (``match_symptoms``),
    * a ``DiseaseClassifier`` plus the scaler / label encoder fitted on the
      CSV data (``predict_disease``).
    """

    def __init__(
        self,
        ai_model_name="model.pth",
        data_file="data.csv",
        symptom_json="symptoms.json",
        dictionary_file="frequency_dictionary_en_82_765.txt",
    ):
        """Load the data, the classifier checkpoint and the NLP models.

        Args:
            ai_model_name: Path of the classifier state dict.
            data_file: CSV whose first column holds the labels and whose
                remaining columns hold the features the scaler is fitted on.
            symptom_json: JSON file holding the known symptom names.
            dictionary_file: SymSpell frequency dictionary (term in column 0,
                count in column 1).
        """
        self.df = pd.read_csv(data_file)
        self.symptom_columns = self.load_symptoms(symptom_json)
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(self.df.iloc[:, 0])
        self.scaler = StandardScaler()
        self.scaler.fit(self.df.iloc[:, 1:].values)
        # NOTE(review): the classifier input width comes from the symptom JSON
        # while the scaler is fitted on the CSV feature columns; the two must
        # have the same length or ``predict_disease`` fails in the scaler.
        self.input_size = len(self.symptom_columns)
        self.num_classes = len(self.label_encoder.classes_)
        self.model = self._load_model(ai_model_name)
        # Loaded a second time so SYMPTOM_LIST stays an object independent of
        # symptom_columns, exactly as before.
        self.SYMPTOM_LIST = self.load_symptoms(symptom_json)
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        self.sym_spell.load_dictionary(dictionary_file, term_index=0, count_index=1)
        self.tokenizer = AutoTokenizer.from_pretrained("alvaroalon2/biobert_diseases_ner")
        self.nlp_model = AutoModelForTokenClassification.from_pretrained("alvaroalon2/biobert_diseases_ner")
        self.ner_pipeline = pipeline("ner", model=self.nlp_model, tokenizer=self.tokenizer, aggregation_strategy="simple")
        self.semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

    def _load_model(self, ai_model_name):
        """Build the classifier, load its weights and switch it to eval mode."""
        model = DiseaseClassifier(self.input_size, self.num_classes).to(device)
        model.load_state_dict(torch.load(ai_model_name, map_location=device, weights_only=True))
        model.eval()
        return model

    def predict_disease(self, symptoms):
        """Return the label predicted for an iterable of symptom names.

        Every name found in ``self.symptom_columns`` sets its position in a
        0/1 vector; unknown names are ignored.  The vector is scaled, run
        through the classifier, and the highest-scoring class is decoded with
        the label encoder.
        """
        # Map each known symptom to its first position once, instead of
        # scanning the column list again for every input symptom.
        position_of = {}
        for position, name in enumerate(self.symptom_columns):
            position_of.setdefault(name, position)

        input_vector = np.zeros(len(self.symptom_columns))
        for symptom in symptoms:
            position = position_of.get(symptom)
            if position is not None:
                input_vector[position] = 1

        input_vector = self.scaler.transform([input_vector])

        input_tensor = torch.tensor(input_vector, dtype=torch.float32).to(device)

        with torch.no_grad():
            outputs = self.model(input_tensor)
            _, predicted_class = torch.max(outputs, 1)

        predicted_disease = self.label_encoder.inverse_transform([predicted_class.cpu().numpy()[0]])[0]
        return predicted_disease

    def load_symptoms(self, json_file):
        """Return the parsed content of the UTF-8 JSON file ``json_file``."""
        with open(json_file, "r", encoding="utf-8") as f:
            return json.load(f)

    def correct_text(self, text):
        """Spell-correct ``text`` word by word and re-join with single spaces.

        Words that match a known symptom (case-insensitively) are kept as
        typed.  Any other word is replaced by the closest SymSpell suggestion
        within edit distance 2, or kept unchanged when there is none.
        """
        # Built once per call; the original rebuilt this for every word.
        known_terms = {symptom.lower() for symptom in self.SYMPTOM_LIST}

        corrected_words = []
        for word in text.split():
            if word.lower() in known_terms:
                corrected_words.append(word)
                continue
            suggestions = self.sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
            corrected_words.append(suggestions[0].term if suggestions else word)
        return ' '.join(corrected_words)

    def extract_symptoms(self, text):
        """Return the distinct, lower-cased ``DISEASE`` entities found in ``text``.

        Entities are returned in order of first appearance so the result is
        deterministic for a given NER output.
        """
        ner_results = self.ner_pipeline(text)
        # dict keys give de-duplication while preserving insertion order.
        ordered_unique = dict.fromkeys(
            entity["word"].lower()
            for entity in ner_results
            if entity["entity_group"] == "DISEASE"
        )
        return list(ordered_unique)

    def match_symptoms(self, extracted_symptoms):
        """Map each extracted symptom to its most similar known symptom.

        Args:
            extracted_symptoms: Iterable of symptom strings.

        Returns:
            A list with one best match (by cosine similarity of sentence
            embeddings) per distinct extracted symptom, in first-seen order.
            An empty list is returned, without running the embedding model,
            when there is nothing to match.
        """
        if not extracted_symptoms:
            return []

        matched = {}

        symptom_embeddings = self.semantic_model.encode(self.SYMPTOM_LIST, convert_to_tensor=True)

        for symptom in extracted_symptoms:
            symptom_embedding = self.semantic_model.encode(symptom, convert_to_tensor=True)

            similarities = util.pytorch_cos_sim(symptom_embedding, symptom_embeddings)[0]

            # Convert the 0-dim tensor to a plain int before indexing.
            most_similar_idx = int(similarities.argmax())
            matched[symptom] = self.SYMPTOM_LIST[most_similar_idx]

        # A real list, not a dict view, so callers can index or serialise it.
        return list(matched.values())