"""Disease prediction from free-text symptom descriptions.

Pipeline: spell-correct the input text (SymSpell), extract candidate
symptom mentions (BioBERT NER), map each mention to the known symptom
vocabulary via sentence-embedding cosine similarity, then classify the
resulting binary symptom vector with a feed-forward network.
"""

import json

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer, util
from sklearn.preprocessing import LabelEncoder, StandardScaler
from symspellpy import SymSpell, Verbosity
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

device = torch.device("cpu")


class DiseaseClassifier(nn.Module):
    """Feed-forward classifier over a binary symptom vector.

    Hidden sizes (382 -> 389 -> 433) and the dropout rate are fixed by the
    pretrained checkpoint loaded in ``DiseasePredictionModel`` — do not
    change them without retraining.
    """

    def __init__(self, input_size, num_classes, dropout_rate=0.35665610394511454):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 382)
        self.fc2 = nn.Linear(382, 389)
        self.fc3 = nn.Linear(389, 433)
        self.fc4 = nn.Linear(433, num_classes)
        self.activation = nn.LeakyReLU()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Return raw class logits for input batch ``x``."""
        x = self.dropout(self.activation(self.fc1(x)))
        x = self.dropout(self.activation(self.fc2(x)))
        x = self.dropout(self.activation(self.fc3(x)))
        return self.fc4(x)  # logits; no softmax — callers take argmax


class DiseasePredictionModel:
    """End-to-end wrapper: raw text -> extracted symptoms -> disease label."""

    def __init__(self, ai_model_name="model.pth", data_file="data.csv",
                 symptom_json="symptoms.json",
                 dictionary_file="frequency_dictionary_en_82_765.txt"):
        self.df = pd.read_csv(data_file)

        # Load the symptom vocabulary ONCE (the original read the JSON twice,
        # once for symptom_columns and once for SYMPTOM_LIST).
        self.symptom_columns = self.load_symptoms(symptom_json)
        self.SYMPTOM_LIST = self.symptom_columns
        # O(1) symptom -> column-index lookup used by predict_disease
        # (replaces a per-symptom list(...).index(...) linear scan).
        self._symptom_index = {s: i for i, s in enumerate(self.symptom_columns)}
        # Lower-cased vocabulary, hoisted out of correct_text's per-word loop
        # (was rebuilt as a list comprehension for every input word).
        self._symptom_lower = {s.lower() for s in self.symptom_columns}

        # First CSV column is the disease label; remaining columns are the
        # per-symptom features the scaler was fit on.
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(self.df.iloc[:, 0])
        self.scaler = StandardScaler()
        self.scaler.fit(self.df.iloc[:, 1:].values)

        self.input_size = len(self.symptom_columns)
        self.num_classes = len(self.label_encoder.classes_)
        self.model = self._load_model(ai_model_name)

        # Spell correction over a frequency dictionary.
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        self.sym_spell.load_dictionary(dictionary_file, term_index=0, count_index=1)

        # Biomedical NER used to extract disease/symptom mentions.
        self.tokenizer = AutoTokenizer.from_pretrained(
            "alvaroalon2/biobert_diseases_ner")
        self.nlp_model = AutoModelForTokenClassification.from_pretrained(
            "alvaroalon2/biobert_diseases_ner")
        self.ner_pipeline = pipeline("ner", model=self.nlp_model,
                                     tokenizer=self.tokenizer,
                                     aggregation_strategy="simple")

        # Semantic matcher. The vocabulary embeddings are deterministic for a
        # fixed model, so encode them once here instead of on every
        # match_symptoms call.
        self.semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
        self._symptom_embeddings = self.semantic_model.encode(
            self.SYMPTOM_LIST, convert_to_tensor=True)

    def _load_model(self, ai_model_name):
        """Instantiate the classifier, load its checkpoint, switch to eval."""
        model = DiseaseClassifier(self.input_size, self.num_classes).to(device)
        model.load_state_dict(
            torch.load(ai_model_name, map_location=device, weights_only=True))
        model.eval()
        return model

    def predict_disease(self, symptoms):
        """Return the predicted disease label for an iterable of symptom names.

        Symptom names not present in the vocabulary are silently ignored,
        matching the original behavior (they have no feature column).
        """
        input_vector = np.zeros(len(self.symptom_columns))
        for symptom in symptoms:
            idx = self._symptom_index.get(symptom)
            if idx is not None:
                input_vector[idx] = 1
        input_vector = self.scaler.transform([input_vector])
        input_tensor = torch.tensor(input_vector, dtype=torch.float32).to(device)
        with torch.no_grad():
            outputs = self.model(input_tensor)
            predicted_class = outputs.argmax(dim=1).item()
        return self.label_encoder.inverse_transform([predicted_class])[0]

    def load_symptoms(self, json_file):
        """Read the symptom vocabulary (a JSON array) from ``json_file``."""
        with open(json_file, "r", encoding="utf-8") as f:
            return json.load(f)

    def correct_text(self, text):
        """Spell-correct ``text`` word by word, leaving known symptoms intact."""
        corrected_words = []
        for word in text.split():
            # Known symptom names are never "corrected" away.
            if word.lower() in self._symptom_lower:
                corrected_words.append(word)
                continue
            suggestions = self.sym_spell.lookup(word, Verbosity.CLOSEST,
                                                max_edit_distance=2)
            corrected_words.append(suggestions[0].term if suggestions else word)
        return ' '.join(corrected_words)

    def extract_symptoms(self, text):
        """Return unique lower-cased DISEASE-entity mentions found in ``text``."""
        ner_results = self.ner_pipeline(text)
        symptoms = {entity["word"].lower()
                    for entity in ner_results
                    if entity["entity_group"] == "DISEASE"}
        return list(symptoms)

    def match_symptoms(self, extracted_symptoms):
        """Map each extracted mention to its closest vocabulary symptom.

        Returns a list (previously a ``dict_values`` view) of best-matching
        vocabulary symptoms, one per unique extracted mention, chosen by
        cosine similarity against the cached vocabulary embeddings.
        """
        if not extracted_symptoms:
            return []
        matched = {}
        for symptom in extracted_symptoms:
            emb = self.semantic_model.encode(symptom, convert_to_tensor=True)
            similarities = util.pytorch_cos_sim(emb, self._symptom_embeddings)[0]
            matched[symptom] = self.SYMPTOM_LIST[similarities.argmax()]
        return list(matched.values())