import os

import gradio as gr
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.vectorstores import Chroma  # Lightweight, in-memory vector store
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
#from langchain.chains import ConversationalRetrievalChain
#from langchain.memory import ConversationBufferMemory

# Authentication for the Hugging Face API
HF_TOKEN = os.getenv("HF_TOKEN")
os.environ["HF_TOKEN"] = HF_TOKEN
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HF_TOKEN

# Initialize the LLM served via the Hugging Face Inference Endpoint
llm = HuggingFaceEndpoint(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    task="text-generation",
    max_new_tokens=512,
    top_k=30,
    temperature=0.1,
    repetition_penalty=1.03,
)

## Embeddings
#modelPath = "sentence-transformers/all-MiniLM-l6-v2"
modelPath = "mixedbread-ai/mxbai-embed-large-v1"

# Model configuration options: run the embedding model on the CPU (use 'cuda' for GPU)
model_kwargs = {'device': 'cpu'}

# Encoding options: keep raw (unnormalized) embeddings
encode_kwargs = {'normalize_embeddings': False}

embedding = HuggingFaceEmbeddings(
    model_name=modelPath,        # Pre-trained embedding model
    model_kwargs=model_kwargs,   # Model configuration options
    encode_kwargs=encode_kwargs  # Encoding options
)

# Load the vector DB persisted (and unzipped) in the previous step
persist_directory = 'docs/chroma/'
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding
)

# Adding memory
#memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

title = "Q&A on enterprise data"
description = "Implementation of open-source RAG on private documents"

def quena(question):
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=vectordb.as_retriever(search_type="mmr", search_kwargs={"k": 8, "fetch_k": 12}),
        return_source_documents=True
    )
    #qa_chain = ConversationalRetrievalChain.from_llm(
    #    llm,
    #    retriever=vectordb.as_retriever(search_type="mmr", search_kwargs={"k": 10, "fetch_k": 15}),
    #    memory=memory
    #)
    result = qa_chain.invoke({"query": question})  # For ConversationalRetrievalChain: qa_chain.invoke({"question": question})
    return result["result"]  # For ConversationalRetrievalChain: result['answer']

demo = gr.Interface(
    fn=quena,
    inputs=gr.Textbox(
        lines=10,
        placeholder='''Write your question inside double quotation marks. Sample question:
What are the procedures to move from the research to the production environment? Reply in step-wise pointers.'''
    ),
    outputs="text",
    title=title,
    description=description,
)

# Launch the demo!
demo.launch(share=True)
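
# Note: the Chroma index loaded above is assumed to have been built in an earlier step
# and unzipped into docs/chroma/. A minimal, commented-out sketch of that step follows;
# the source document path and chunking parameters are hypothetical, not taken from this script.
#
# from langchain_community.document_loaders import PyPDFLoader
# from langchain.text_splitter import RecursiveCharacterTextSplitter
#
# docs = PyPDFLoader("docs/enterprise_handbook.pdf").load()  # hypothetical source document
# splits = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150).split_documents(docs)
# Chroma.from_documents(documents=splits, embedding=embedding, persist_directory=persist_directory)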