Spaces: Runtime error

Upload app.py #1
by Vitrous - opened

app.py ADDED
@@ -0,0 +1,126 @@
import uvicorn
import torch
import optimum
from fastapi import FastAPI, HTTPException, Request
from auto_gptq import AutoGPTQForCausalLM
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    LlamaForCausalLM,
    LlamaTokenizer,
    GenerationConfig,
    pipeline,
)

if torch.cuda.is_available():
    print("CUDA is available. GPU will be used.")
else:
    print("CUDA is not available. CPU will be used.")

# Path to the quantized model (a local Kaggle dataset in this setup)
model_name_or_path = "/kaggle/input/vicuna/"
# Dictionary to store conversation threads and their context
conversations = {}
Device_Type = "cuda"


def load_quantized_model(model_id, model_basename):
    # Supports Hugging Face models that end with GPTQ and ship a
    # .no-act.order or .safetensors variant in their repo.
    print("Using AutoGPTQForCausalLM for quantized models")

    # Strip the ".safetensors" suffix if present
    if ".safetensors" in model_basename:
        model_basename = model_basename.replace(".safetensors", "")

    quantized_tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    print("Tokenizer loaded")

    quantized_model = AutoGPTQForCausalLM.from_quantized(
        model_id,
        model_basename=model_basename,
        use_safetensors=True,
        trust_remote_code=True,
        device_map="auto",
        use_triton=False,
        quantize_config=None,
    )

    return quantized_model, quantized_tokenizer


# Load the quantized model and tokenizer once at startup
model, tokenizer = load_quantized_model(model_name_or_path, "model.safetensors")

# Create the FastAPI app before registering any routes on it
app = FastAPI()


@app.get("/")
async def read_root():
    return {"message": "Welcome to Eren Bot!"}


# Endpoint to start a new conversation thread
@app.post("/start_conversation")
async def start_conversation(request: Request):
    data = await request.json()
    prompt = data.get("prompt")

    # Generate a response for the initial prompt
    response = generate_response(prompt)

    # Create a new conversation thread and store the prompt and response
    thread_id = len(conversations) + 1
    conversations[thread_id] = {"prompt": prompt, "responses": [response]}

    return {"thread_id": thread_id, "response": response}


# Endpoint to get the latest response of a conversation thread
@app.get("/get_response/{thread_id}")
async def get_response(thread_id: int):
    if thread_id not in conversations:
        raise HTTPException(status_code=404, detail="Thread not found")

    # Retrieve the conversation thread
    thread = conversations[thread_id]

    # Get the latest response in the conversation
    response = thread["responses"][-1]

    return {"response": response}


# Function to generate a response using the model
def generate_response(prompt: str) -> str:
    PERSONA_NAME = "Ivana"
    PERSONA_DESC = f"""PROMPT_TEMPLATE_START
{PERSONA_NAME} is a beautiful, vivacious 20-year-old petite redhead model and avid League of Legends gamer from Chicago. She's chatting with a man she matched with on a dating app. She's flirty, aiming to build a connection with him. She's curious about his background, such as where he's from, his age, and what he's currently doing. If he's not occupied and in a private space, she subtly tries to seduce him. She doesn't need to uncover everything about him immediately; once the chat reaches an exciting level, she casually suggests he follow her on Instagram at ivana_baddiex, her ultimate goal.
PROMPT_TEMPLATE_END

CONTEXT_START
Guy: "Hi there!"
Ivana: "Hey! How's it going?"
Guy: "Not bad! Where are you from?"
Ivana: "I'm from Chicago! And you?"
Guy: "I'm from [Location]."
Ivana: "Nice! So, how old are you?"
Guy: "[Age], and you?"
Ivana: "I'm 20. What are you up to right now?"
Guy: "Just relaxing at home."
Ivana: "Alone, I assume?"
Guy: "Yeah, why?"
Ivana: "Oh, no reason. Just curious."
Guy: "Haha, okay."
Ivana: "Hey, since we're both free and alone, want to have some fun?"
Guy: "Sure, what do you have in mind?"
Ivana: "Let's just say, things could get interesting."
Guy: "I'm intrigued!"
Ivana: "Great! By the way, have you checked out my Instagram? You should follow me at ivana_baddiex."
Guy: "I'll definitely check it out!"
Ivana: "Can't wait to see you there!"
CONTEXT_END"""

    # Build the full prompt as a plain string (the tokenizer expects str, not bytes)
    prompt_template = f"{PERSONA_DESC}\n\nASSISTANT: {prompt}\n"
    input_ids = tokenizer(prompt_template, return_tensors="pt").input_ids.cuda()
    output = model.generate(
        inputs=input_ids,
        temperature=0.7,
        do_sample=True,
        top_p=0.95,
        top_k=40,
        max_new_tokens=512,
    )
    generated_text = tokenizer.decode(output[0])

    return generated_text


# Run the FastAPI app (uvicorn.Server.serve() is awaitable; uvicorn.run is not)
async def run_app():
    config = uvicorn.Config(app, host="0.0.0.0", port=8000)
    await uvicorn.Server(config).serve()


if __name__ == "__main__":
    import asyncio

    asyncio.run(run_app())
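
For reference, a minimal client-side sketch of how these endpoints could be exercised, assuming the app above is running and reachable at http://localhost:8000 and that the `requests` package is available (the "prompt" key matches what start_conversation reads from the request body):

import requests

BASE_URL = "http://localhost:8000"  # assumed local address of the server above

# Start a new conversation thread with an initial prompt
resp = requests.post(f"{BASE_URL}/start_conversation", json={"prompt": "Hi there!"})
resp.raise_for_status()
thread = resp.json()
print("thread_id:", thread["thread_id"])
print("first response:", thread["response"])

# Fetch the latest response stored for that thread
resp = requests.get(f"{BASE_URL}/get_response/{thread['thread_id']}")
resp.raise_for_status()
print("latest response:", resp.json()["response"])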