Files changed (1) hide show
  1. app.py +126 -0
app.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import uvicorn
2
+ from fastapi import FastAPI, HTTPException, Request
3
+ from auto_gptq import AutoGPTQForCausalLM
4
+ import torch
5
+ import optimum
6
+ from transformers import (AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM, LlamaTokenizer, GenerationConfig, pipeline,)
7
+
8
# Report which device is available, and record it so the rest of the module
# can stay consistent with what is printed here.
if torch.cuda.is_available():
    print("CUDA is available. GPU will be used.")
    Device_Type = "cuda"
else:
    print("CUDA is not available. CPU will be used.")
    Device_Type = "cpu"

# Location of the model checkpoint and tokenizer files.
model_name_or_path = "/kaggle/input/vicuna/"

# In-memory store of conversation threads, keyed by integer thread id.
# Each value is a dict holding the initial 'prompt' and a list of 'responses'.
conversations = {}
17
+
18
+
19
def load_quantized_model(model_id, model_basename):
    """Load a GPTQ-quantized causal LM and its tokenizer.

    Args:
        model_id: Path or repo id passed to both ``AutoTokenizer.from_pretrained``
            and ``AutoGPTQForCausalLM.from_quantized``.
        model_basename: Weight file name; a trailing ``.safetensors``
            extension is removed before it is passed on.

    Returns:
        A ``(quantized_model, quantized_tokenizer)`` tuple.
    """
    # The code supports all huggingface models that ends with GPTQ and have some variation
    # of .no-act.order or .safetensors in their HF repo.
    print("Using AutoGPTQForCausalLM for quantized models")

    # Strip only a trailing extension; occurrences elsewhere in the name are
    # part of the basename and must be preserved.
    suffix = ".safetensors"
    if model_basename.endswith(suffix):
        model_basename = model_basename[: -len(suffix)]

    quantized_tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    print("Tokenizer loaded")

    # NOTE(review): trust_remote_code=True allows code shipped with the
    # checkpoint to run; only point model_id at sources you trust.
    quantized_model = AutoGPTQForCausalLM.from_quantized(
        model_id,
        model_basename=model_basename,
        use_safetensors=True,
        trust_remote_code=True,
        device_map="auto",
        use_triton=False,
        quantize_config=None,
    )

    return quantized_model, quantized_tokenizer
34
+
35
+
36
# Load the quantized model and tokenizer once at import time; every request
# handler below shares these instances.
model, tokenizer = load_quantized_model("/kaggle/input/vicuna/", "model.safetensors")

# The application object must exist before any @app.<method> decorator runs,
# otherwise importing this module fails with NameError.
app = FastAPI()


@app.get("/")
async def read_root():
    """Return a static welcome message."""
    return {"message": "Welcome to Eren Bot!"}


# Endpoint to start a new conversation thread
@app.post('/start_conversation')
async def start_conversation(request: Request):
    """Create a conversation thread from a JSON body containing 'prompt'.

    Returns:
        A dict with the new 'thread_id' and the generated 'response'.

    Raises:
        HTTPException: 400 if the body is not valid JSON or 'prompt' is not a
            non-empty string.
    """
    try:
        data = await request.json()
    except ValueError as err:  # json.JSONDecodeError is a ValueError subclass
        raise HTTPException(status_code=400, detail="Request body must be valid JSON") from err

    prompt = data.get('prompt') if isinstance(data, dict) else None
    if not isinstance(prompt, str) or not prompt.strip():
        # Without this check a missing prompt would be formatted into the
        # template as the literal text "None".
        raise HTTPException(status_code=400, detail="'prompt' must be a non-empty string")

    # Generate a response for the initial prompt
    response = generate_response(prompt)

    # Create a new conversation thread and store the prompt and response
    thread_id = len(conversations) + 1
    conversations[thread_id] = {'prompt': prompt, 'responses': [response]}

    return {'thread_id': thread_id, 'response': response}


# Endpoint to get the response of a conversation thread
@app.get('/get_response/{thread_id}')
async def get_response(thread_id: int):
    """Return the latest stored response of a conversation thread.

    Raises:
        HTTPException: 404 if ``thread_id`` is not a known thread.
    """
    if thread_id not in conversations:
        raise HTTPException(status_code=404, detail="Thread not found")

    # Retrieve the conversation thread
    thread = conversations[thread_id]

    # Get the latest response in the conversation
    response = thread['responses'][-1]

    return {'response': response}


# Persona text placed in front of every prompt sent to the model.
PERSONA_NAME = "Ivana"
PERSONA_DESC = f"""PROMPT_TEMPLATE_START
{PERSONA_NAME} is a beautiful vivacious 20-year-old petite redhead model and avid League of Legends gamer from Chicago. She's chatting with a man she matched with on a dating app. She's flirty, aiming to build a connection with him. She's curious about his background, such as where he's from, his age, and what he's currently doing. If he's not occupied and in a private space, she subtly tries to seduce him. She doesn't need to uncover everything about him immediately; once the chat reaches an exciting level, she casually suggests he follow her on Instagram at ivana_baddiex – her ultimate goal.
PROMPT_TEMPLATE_END

CONTEXT_START
Guy: "Hi there!"
Ivana: "Hey! How's it going?"
Guy: "Not bad! Where are you from?"
Ivana: "I'm from Chicago! And you?"
Guy: "I'm from [Location]."
Ivana: "Nice! So, how old are you?"
Guy: "[Age], and you?"
Ivana: "I'm 20. What are you up to right now?"
Guy: "Just relaxing at home."
Ivana: "Alone, I assume?"
Guy: "Yeah, why?"
Ivana: "Oh, no reason. Just curious. "
Guy: "Haha, okay."
Ivana: "Hey, since we're both free and alone, want to have some fun?"
Guy: "Sure, what do you have in mind?"
Ivana: "Let's just say, things could get interesting. "
Guy: "I'm intrigued!"
Ivana: "Great! By the way, have you checked out my Instagram? You should follow me at ivana_baddiex."
Guy: "I'll definitely check it out!"
Ivana: "Can't wait to see you there! "
CONTEXT_END"""


# Function to generate a response using the model
def generate_response(prompt: str) -> str:
    """Generate a reply to ``prompt`` with the module-level model.

    Args:
        prompt: Text inserted into the prompt template after the persona text.

    Returns:
        The newly generated text only: the prompt tokens and special tokens
        are removed before decoding.
    """
    # The tokenizer takes text, so the template is passed as str, not bytes.
    # NOTE(review): the caller's text is placed after "ASSISTANT:"; confirm
    # this matches the chat format the checkpoint was trained on.
    prompt_template = f'{PERSONA_DESC}\n\nASSISTANT: {prompt}\n'

    # Use the GPU when present instead of an unconditional .cuda(), which
    # raises on CPU-only hosts.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.to(device)

    output = model.generate(
        inputs=input_ids,
        temperature=0.7,
        do_sample=True,
        top_p=0.95,
        top_k=40,
        max_new_tokens=512,
    )

    # generate() returns the prompt followed by the continuation; keep only
    # the continuation so the template is not echoed back to the client.
    new_tokens = output[0][input_ids.shape[-1]:]
    generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    return generated_text


# Run the FastAPI app
async def run_app():
    """Serve ``app`` on 0.0.0.0:8000 from inside the running event loop.

    ``uvicorn.run`` is a blocking call that starts its own event loop and
    cannot be awaited, so the awaitable ``Server.serve()`` is used instead.
    """
    config = uvicorn.Config(app, host="0.0.0.0", port=8000)
    server = uvicorn.Server(config)
    await server.serve()


if __name__ == '__main__':
    import asyncio

    asyncio.run(run_app())