Spaces:

thinh111
/

ResvuChatbox

Runtime error

App Files Files Community

ResvuChatbox / model.py

thinh111

Update model.py

7e3c692 verified 6 months ago

raw

history blame contribute delete

4.63 kB

	import os
	str_cmd1 = 'pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"'
	str_cmd2 = 'pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes'
	os.system(str_cmd1)
	os.system(str_cmd2)
	#os.environ["CUDA_VISIBLE_DEVICES"] = "0" # or "0,1" for multiple GPUs


	from unsloth import FastLanguageModel
	import torch
	device = torch.device("cpu")
	max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
	dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
	load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

	from langchain_community.llms import CTransformers
	from langchain.chains import RetrievalQA
	from langchain.prompts import PromptTemplate
	from langchain_community.embeddings import GPT4AllEmbeddings
	from langchain_community.vectorstores import FAISS
	from langchain_community.llms import HuggingFacePipeline
	from langchain.callbacks.base import BaseCallbackHandler
	from transformers import pipeline

	# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
	fourbit_models = [
	"unsloth/mistral-7b-bnb-4bit",
	"unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
	"unsloth/llama-2-7b-bnb-4bit",
	"unsloth/llama-2-13b-bnb-4bit",
	"unsloth/codellama-34b-bnb-4bit",
	"unsloth/tinyllama-bnb-4bit",
	"unsloth/gemma-7b-bnb-4bit", # New Google 6 trillion tokens model 2.5x faster!
	"unsloth/gemma-2b-bnb-4bit",
	] # More models at https://huggingface.co/unsloth

	template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

	### Instruction:
	You are ResVuAssist and You are a helpful bot who reads texts and answers questions about them.

	### Input:
	{context}
	QUESTION: {question}

	### Response:
	"""

	# Cau hinh
	vector_db_path = "vectorstores/db_faiss"


	def initialModelAndTokenizer():
	model, tokenizer = FastLanguageModel.from_pretrained(
	model_name = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
	max_seq_length = max_seq_length,
	dtype = dtype,
	load_in_4bit = load_in_4bit,
	# token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
	)
	model = FastLanguageModel.get_peft_model(
	model,
	r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
	target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
	"gate_proj", "up_proj", "down_proj",],
	lora_alpha = 16,
	lora_dropout = 0, # Supports any, but = 0 is optimized
	bias = "none", # Supports any, but = "none" is optimized
	# [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
	use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
	random_state = 3407,
	use_rslora = False, # We support rank stabilized LoRA
	loftq_config = None, # And LoftQ
	)
	return model, tokenizer

	def create_pipeline():
	model, tokenizer = initialModelAndTokenizer()
	pipe = pipeline(
	"text-generation",
	model=model,
	tokenizer=tokenizer,
	max_new_tokens=512,
	temperature=0.1,
	top_p=0.95,
	repetition_penalty=1.15
	)
	return pipe

	# Tao prompt template
	def creat_prompt(template):
	prompt = PromptTemplate(template = template, input_variables=["context", "question"])
	return prompt

	# Tao simple chain
	def create_qa_chain(prompt, llm, db):
	llm_chain = RetrievalQA.from_chain_type(
	llm = llm,
	chain_type= "stuff",
	# retriever = db.as_retriever(search_kwargs = {"k":8}, max_tokens_limit=1024),
	retriever = db.as_retriever(search_kwargs = {"k": 15}, max_tokens_limit=4096),
	return_source_documents = False,
	chain_type_kwargs= {'prompt': prompt},
	)
	return llm_chain

	# Read tu VectorDB
	def read_vectors_db():
	# Embeding
	embedding_model = GPT4AllEmbeddings(model_name="all-MiniLM-L6-v2.gguf2.f16.gguf")
	db = FAISS.load_local(vector_db_path, embedding_model, allow_dangerous_deserialization=True)
	return db

	def get_response_value(text):
	start = text.find('### Response:')
	if start != -1:
	return text[start + len('### Response:'):].strip()
	return None

	def llm_chain_response():
	pipe = create_pipeline()
	db = read_vectors_db()
	prompt = creat_prompt(template)
	llm = HuggingFacePipeline(pipeline=pipe)

	llm_chain =create_qa_chain(prompt, llm, db)
	return llm_chain