import os

import spaces
import gradio as gr
import torch

torch.jit.script = lambda f: f  # Avoid script error in lambda

# Initialize non-GPU components first
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline  # needed in respond(), so import at module level

# System prompts
DEFAULT_SYSTEM_PROMPT = """
Based on the information in this document provided in context, answer the question as accurately as possible in 1 or 2 lines. If the information is not in the context, respond with "I don't know" or a similar acknowledgment that the answer is not available.
""".strip()

SYSTEM_PROMPT = "Use the following pieces of context to answer the question at the end. Do not provide commentary or elaboration of more than 1 or 2 lines."


def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    # Wrap the system prompt and user prompt in the Llama instruction format
    return f"""
[INST] <<SYS>>
{system_prompt}
<</SYS>>

{prompt} [/INST]
""".strip()


template = generate_prompt(
    """
{context}

Question: {question}
""",
    system_prompt=SYSTEM_PROMPT,
)

prompt_template = PromptTemplate(template=template, input_variables=["context", "question"])

# Initialize database and embeddings
embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-base",
    model_kwargs={"device": "cpu"},
)

db = Chroma(
    persist_directory="db",
    embedding_function=embeddings,
)


def initialize_model():
    # Import transformers lazily so the heavy model load happens inside the GPU context
    from transformers import AutoTokenizer, AutoModelForCausalLM

    model_id = "meta-llama/Llama-3.2-3B-Instruct"
    token = os.environ.get("HF_TOKEN")

    tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=token,
    )

    if torch.cuda.is_available():
        model = model.to("cuda")

    return model, tokenizer


@spaces.GPU
def respond(message, history, system_message, max_tokens, temperature, top_p):
    try:
        # Initialize model components inside the GPU context
        model, tokenizer = initialize_model()

        from transformers import TextStreamer, pipeline

        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

        text_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=1.15,
            streamer=streamer,
        )

        llm = HuggingFacePipeline(pipeline=text_pipeline)

        # Retrieval-augmented QA over the Chroma store, stuffing the top-2 chunks into the prompt
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=db.as_retriever(search_kwargs={"k": 2}),
            return_source_documents=False,
            chain_type_kwargs={"prompt": prompt_template},
        )

        response = qa_chain.invoke({"query": message})
        yield response["result"]

    except Exception as e:
        yield f"An error occurred: {str(e)}"


# Create Gradio interface
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(
            value=DEFAULT_SYSTEM_PROMPT,
            label="System Message",
            lines=3,
            visible=False,
        ),
        gr.Slider(
            minimum=1,
            maximum=2048,
            value=500,
            step=1,
            label="Max new tokens",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=4.0,
            value=0.1,
            step=0.1,
            label="Temperature",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
    title="ROS2 Expert Assistant",
    description="Ask questions about ROS2, navigation, and robotics. I'll provide concise answers based on the available documentation.",
)

if __name__ == "__main__":
    demo.launch()
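
# --- Usage note (sketch; the file name and token value below are assumptions) ---
# Running locally requires a populated Chroma store in ./db and a Hugging Face
# token with access to meta-llama/Llama-3.2-3B-Instruct:
#   export HF_TOKEN=<your token>
#   python app.py
# On Hugging Face Spaces, the @spaces.GPU decorator allocates a GPU for respond()
# while it runs; everything outside it stays on CPU.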