"""Document ingestion and retrieval-augmented question answering helpers.

This module
  * extracts text from uploaded files (PDF, text, Word, CSV, Excel, images),
  * stores the resulting chunks in a per-user Chroma vector store under
    ``<cwd>/db/<email>``, and
  * answers questions by retrieving the most similar chunks and passing them
    to a locally loaded chat model.

NOTE: importing this module loads the tokenizer and model from disk and
constructs the OpenAI clients, so the import itself is slow and requires the
``OPENAI_API_KEY`` environment variable to be set.
"""
import logging
import os
import sqlite3
import tempfile
import uuid

import langchain
import openai
import pandas as pd
import PyPDF2
import pytesseract
import torch
from langchain.chains import ConversationalRetrievalChain, RetrievalQA
from langchain.chains.summarize import load_summarize_chain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import (
    CSVLoader,
    Docx2txtLoader,
    PyPDFLoader,
    TextLoader,
    UnstructuredExcelLoader,
    UnstructuredPDFLoader,
)
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain import OpenAI, PromptTemplate
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.schema.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from PIL import Image
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation.utils import GenerationConfig

from utils import get_completion, model_info, model_load


def get_text_chunks_langchain(text):
    """Split a raw string into LangChain ``Document`` chunks.

    Args:
        text: The plain text to split.

    Returns:
        A list of ``Document`` objects, one per chunk produced by a
        ``CharacterTextSplitter`` with ``chunk_size=500`` and
        ``chunk_overlap=100``.
    """
    text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    return [Document(page_content=chunk) for chunk in text_splitter.split_text(text)]


def get_text_img(path):
    """Run OCR on an image and return the recognised text.

    Args:
        path: Anything ``PIL.Image.open`` accepts.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # The context manager releases the image's file handle once OCR is done.
    with Image.open(path) as image:
        return pytesseract.image_to_string(image)


logging.basicConfig()
logging.getLogger('langchain.retrievers.multi_query').setLevel(logging.INFO)

# Root directory that holds one persisted Chroma store per user.
base_path = os.path.join(os.getcwd(), "db")

# The API key must come from the environment; secrets do not belong in source
# control. When it is unset, LangChain raises a validation error as soon as
# the clients below are constructed.
key_openai = os.environ.get("OPENAI_API_KEY")
embedding = OpenAIEmbeddings(openai_api_key=key_openai)

# Local checkpoint directory used for both the tokenizer and the model.
_MODEL_PATH = "/home/ubuntu/LLM/text-generation-webui/models/Flmc_DISC-MedLLM"

tokenizer = AutoTokenizer.from_pretrained(
    _MODEL_PATH,
    use_fast=False,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    _MODEL_PATH,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
model.generation_config = GenerationConfig.from_pretrained("Flmc/DISC-MedLLM")

data_llm_16k = ChatOpenAI(
    model_name="gpt-3.5-turbo-16k",
    temperature=0,
    openai_api_key=key_openai,
)

data_llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=key_openai,
)

chain = load_summarize_chain(data_llm_16k, chain_type="stuff")


def get_qa_chain_answers_llm(question, email):
    """Answer ``question`` using the user's stored documents and the local model.

    The three chunks most similar to ``question`` are fetched from the user's
    Chroma store and passed, together with the question, to ``model.chat``.

    Args:
        question: The user's question.
        email: Identifies the user; ``str(email)`` is the name of the store
            directory under ``base_path``.

    Returns:
        Whatever ``model.chat`` returns for the final turn.
    """
    # NOTE(review): ``email`` is used verbatim as a directory name. If it can
    # come from untrusted input it should be validated by the caller, since a
    # value containing path separators would escape ``base_path``.
    title = str(email)
    persist_directory = os.path.join(base_path, title)
    db = Chroma(persist_directory=persist_directory, embedding_function=embedding)
    k_tops = db.similarity_search(question, k=3)
    print(k_tops)
    #question_new = f" 'context' {k_tops}: '{question}'"
    #res = get_completion(question_new, 300, 0)
    print("LLM MODEL------------------------------")
    messages = [
        {
            "role": "user",
            "content": "Hello the patient will provide you with the reports & other information regarding the paitent. You have to answer the questions based on the information provided and your knowledge in simplier langauge. Next you will talk with the paitent",
        }
    ]
    # The conversation state lives only in ``messages``, so the reply to the
    # priming turn has to be recorded there. Previously it was discarded,
    # which wasted a full generation and left two consecutive user turns.
    priming_reply = model.chat(tokenizer, messages)
    messages.append({"role": "assistant", "content": priming_reply})
    messages.append({"role": "user", "content": f" Detials {k_tops} : & User Question {question}"})
    return model.chat(tokenizer, messages)


# Legacy OpenAI-based implementation, retained for reference only.
# def get_qa_chain_answers(question,email,history=[]):
#     title = str(email)
#     persist_directory = os.path.join(base_path,title)
#     db = Chroma(persist_directory=persist_directory, embedding_function=embedding)
#     # retriever_from_llm = MultiQueryRetriever.from_llm(retriever=db.as_retriever(),llm=data_llm)
#     # unique_docs = retriever_from_llm.get_relevant_documents(query=question)
#     qa_chain = RetrievalQA.from_chain_type(data_llm_16k,retriever=db.as_retriever())
#     question_updated = "Act Like a Medical doctor and give suggestions based on the context given or your own knwoelege and question asked" + question
#     answers = qa_chain({"query": question_updated})
#     return answers['result']


def _split_documents(documents, chunk_overlap):
    """Split loaded documents into 1000-character chunks with the given overlap."""
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(documents)


def _load_excel_documents(doc):
    """Load a spreadsheet as documents, via a temporary CSV when possible."""
    # The intermediate CSV goes to the temp directory and is always removed,
    # rather than accumulating uuid-named files in the working directory.
    csv_path = os.path.join(tempfile.gettempdir(), f"{str(uuid.uuid1())}.csv")
    try:
        df = pd.read_excel(doc, engine='openpyxl')
        df.to_csv(csv_path)
        return CSVLoader(file_path=csv_path).load()
    except Exception as e:
        # Deliberate best effort: anything that stops the pandas route falls
        # back to the unstructured loader.
        print(e)
        return UnstructuredExcelLoader(doc, mode="elements").load()
    finally:
        try:
            os.remove(csv_path)
        except FileNotFoundError:
            pass


def get_text(doc, file_name):
    """Extract the content of an uploaded file.

    The handler is chosen from the (lower-cased) extension of ``file_name``.

    Args:
        doc: The file to read: a path or whatever the selected reader accepts.
        file_name: The file's name; only its extension is used.

    Returns:
        For ``.pdf``: a single ``str`` holding the text of all pages.
        For every other supported type: a list of ``Document`` chunks.

    Raises:
        ValueError: If the extension is not supported.
    """
    file_extension = os.path.splitext(file_name)[1].lower()
    print(file_extension)

    if file_extension == ".pdf":
        pdf = PyPDF2.PdfReader(doc)
        # ``or ""`` guards against pages for which no text is returned.
        return "".join(page.extract_text() or "" for page in pdf.pages)

    if file_extension in (".xls", ".xlsx"):
        return _split_documents(_load_excel_documents(doc), chunk_overlap=100)

    if file_extension in (".png", ".jpg", ".jpeg"):
        return get_text_chunks_langchain(get_text_img(doc))

    if file_extension in (".md", ".txt"):
        loader = TextLoader(doc)
    elif file_extension in (".docx", ".doc"):
        loader = Docx2txtLoader(doc)
    elif file_extension == ".csv":
        loader = CSVLoader(file_path=doc)
    else:
        raise ValueError(f"Unsupported file extension: {file_extension}")

    return _split_documents(loader.load(), chunk_overlap=0)


def upload_chroma(book_file, filename, email):
    """Extract a file's content and persist it in the user's Chroma store.

    Args:
        book_file: The file to ingest; forwarded to ``get_text``.
        filename: The file's name, used to pick the extraction method.
        email: Identifies the user; ``str(email)`` is the name of the store
            directory under ``base_path``.

    Raises:
        ValueError: If the file extension is not supported.
    """
    title = str(email)
    persist_directory = os.path.join(base_path, title)

    # The context manager closes the progress bar even if a step fails.
    with tqdm(total=100) as pbar:
        final_texts = get_text(book_file, filename)
        # ``get_text`` returns a plain string for PDFs, but
        # ``Chroma.from_documents`` needs Document objects.
        if isinstance(final_texts, str):
            final_texts = get_text_chunks_langchain(final_texts)
        pbar.update(40)

        db = Chroma.from_documents(final_texts, embedding, persist_directory=persist_directory)
        pbar.update(40)

        db.persist()
        logging.info(f"Successfully uploaded the PDF of the book: {title}")
        print(f"Successfully uploaded the PDF of the book: {title}")
        pbar.update(20)