from unsloth import FastLanguageModel import torch max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally! dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+ load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False. from langchain_community.llms import CTransformers from langchain.chains import RetrievalQA from langchain.prompts import PromptTemplate from langchain_community.embeddings import GPT4AllEmbeddings from langchain_community.vectorstores import FAISS from langchain_community.llms import HuggingFacePipeline from langchain.callbacks.base import BaseCallbackHandler from transformers import pipeline # 4bit pre quantized models we support for 4x faster downloading + no OOMs. fourbit_models = [ "unsloth/mistral-7b-bnb-4bit", "unsloth/mistral-7b-instruct-v0.2-bnb-4bit", "unsloth/llama-2-7b-bnb-4bit", "unsloth/llama-2-13b-bnb-4bit", "unsloth/codellama-34b-bnb-4bit", "unsloth/tinyllama-bnb-4bit", "unsloth/gemma-7b-bnb-4bit", # New Google 6 trillion tokens model 2.5x faster! "unsloth/gemma-2b-bnb-4bit", ] # More models at https://huggingface.co/unsloth template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. ### Instruction: You are ResVuAssist and You are a helpful bot who reads texts and answers questions about them. ### Input: {context} QUESTION: {question} ### Response: """ # Cau hinh vector_db_path = "vectorstores/db_faiss" def initialModelAndTokenizer(): model, tokenizer = FastLanguageModel.from_pretrained( model_name = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B max_seq_length = max_seq_length, dtype = dtype, load_in_4bit = load_in_4bit, # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf ) model = FastLanguageModel.get_peft_model( model, r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128 target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",], lora_alpha = 16, lora_dropout = 0, # Supports any, but = 0 is optimized bias = "none", # Supports any, but = "none" is optimized # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes! use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context random_state = 3407, use_rslora = False, # We support rank stabilized LoRA loftq_config = None, # And LoftQ ) return model, tokenizer def create_pipeline(): model, tokenizer = initialModelAndTokenizer() pipe = pipeline( "text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512, temperature=0.1, top_p=0.95, repetition_penalty=1.15 ) return pipe # Tao prompt template def creat_prompt(template): prompt = PromptTemplate(template = template, input_variables=["context", "question"]) return prompt # Tao simple chain def create_qa_chain(prompt, llm, db): llm_chain = RetrievalQA.from_chain_type( llm = llm, chain_type= "stuff", # retriever = db.as_retriever(search_kwargs = {"k":8}, max_tokens_limit=1024), retriever = db.as_retriever(search_kwargs = {"k": 15}, max_tokens_limit=4096), return_source_documents = False, chain_type_kwargs= {'prompt': prompt}, ) return llm_chain # Read tu VectorDB def read_vectors_db(): # Embeding embedding_model = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf") db = FAISS.load_local(vector_db_path, embedding_model, allow_dangerous_deserialization=True) return db def get_response_value(text): start = text.find('### Response:') if start != -1: return text[start + len('### Response:'):].strip() return None def llm_chain_response(): pipe = create_pipeline() db = read_vectors_db() prompt = creat_prompt(template) llm = HuggingFacePipeline(pipeline=pipe) llm_chain =create_qa_chain(prompt, llm, db) return llm_chain