import os

from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationChain
import langchain.globals
from langchain.prompts import PromptTemplate
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

# Hugging Face repo id of the chat model to load, e.g. "meta-llama/Llama-2-7b-chat-hf"
my_model_id = os.getenv('MODEL_REPO_ID', 'Default Value')
# Access token for gated repositories such as Llama 2
token = os.getenv('HUGGINGFACEHUB_API_TOKEN')

template = """<<SYS>>
You are an AI having a conversation with a human. Below is an instruction that describes a task.
Write a response that appropriately completes the request.
Reply with the most helpful and logical answer. During the conversation you need to ask the user
the following questions to complete the hotel booking task.

1) Where would you like to stay and when?
2) How many people are staying in the room?
3) Do you prefer any amenities like breakfast included or a gym?
4) What is your name, your email address and phone number?

Make sure you receive a logical answer from the user for every question to complete the hotel
booking process.
<</SYS>>

Previous conversation:
{history}

Human: {input}
AI:"""


def load_model():
    # Load the model in 8-bit precision to reduce GPU memory usage
    quantization_config = BitsAndBytesConfig(
        load_in_8bit=True,
    )
    # The token is forwarded so gated repositories (e.g. Llama 2) can be downloaded
    tokenizer = AutoTokenizer.from_pretrained(my_model_id, token=token)
    model = AutoModelForCausalLM.from_pretrained(
        my_model_id,
        device_map="auto",
        quantization_config=quantization_config,
        token=token,
    )
    return tokenizer, model


def load_pipeline():
    tokenizer, model = load_model()

    # Build a text-generation pipeline around the already-loaded, quantized model
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.3,
        top_p=0.85,
        top_k=30,
        repetition_penalty=1.03,
    )

    # Wrap the existing pipeline directly; HuggingFacePipeline.from_model_id would
    # reload the model from the Hub and ignore the quantized instance created above
    llm = HuggingFacePipeline(pipeline=pipe)
    return llm


# Load the model once at import time so every chain reuses the same pipeline
llm = load_pipeline()


def demo_miny_memory():
    # Buffer memory keeps the raw conversation turns under the "history" key
    memory = ConversationBufferMemory(memory_key="history")
    return memory


def demo_chain(input_text, memory):
    PROMPT = PromptTemplate(input_variables=["history", "input"], template=template)
    conversation = ConversationChain(
        prompt=PROMPT,
        llm=llm,
        verbose=langchain.globals.get_verbose(),
        memory=memory,
    )

    chat_reply = conversation.run(input=input_text)
    return chat_reply
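

# A minimal usage sketch, assuming an interactive console session; the __main__
# guard, the "You:"/"AI:" prompts and the "exit" command are illustrative
# additions, not part of the original booking flow.
if __name__ == "__main__":
    chat_memory = demo_miny_memory()
    while True:
        user_input = input("You: ")
        if user_input.strip().lower() == "exit":
            break
        print("AI:", demo_chain(user_input, chat_memory))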