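"""FastAPI + Gradio Space that serves a GGUF model through llama-cpp-python."""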
from llama_cpp import Llama
from concurrent.futures import ThreadPoolExecutor, as_completed
import re
import uvicorn
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import os
from dotenv import load_dotenv
import gradio as gr
import requests
import threading
from pydantic import BaseModel
load_dotenv()
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
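# Shared state: loaded model handles plus placeholder names for special tokens.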
global_data = {'models': {}, 'tokens': {k: k + '_token' for k in ['eos', 'pad', 'padding', 'unk', 'bos', 'sep', 'cls', 'mask']}}
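# GGUF model(s) to pull from the Hugging Face Hub; "name" is the local label used in responses.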
model_configs = [{"repo_id": "Hjgugugjhuhjggg/mergekit-ties-tzamfyy-Q2_K-GGUF", "filename": "mergekit-ties-tzamfyy-q2_k.gguf", "name": "my_model"}]
models = {}
def load_model(model_config):
    # Download the GGUF file from the Hub and initialize a llama.cpp model.
    model_name = model_config['name']
    try:
        model = Llama.from_pretrained(repo_id=model_config['repo_id'], filename=model_config['filename'], use_auth_token=HUGGINGFACE_TOKEN)
        models[model_name] = model
        global_data['models'] = models
        return model
    except Exception as e:
        print(f"Error loading model {model_name}: {e}")
        return None

for config in model_configs:
    model = load_model(config)
    if model is None:
        # Abort startup if any configured model fails to load.
        exit(1)
class ChatRequest(BaseModel):
    message: str

def normalize_input(input_text):
    return input_text.strip()

def remove_duplicates(text):
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    return '\n'.join(dict.fromkeys(lines))
def generate_model_response(model, inputs):
    try:
        if model is None:
            return "Model loading failed."
        response = model(inputs, max_tokens=512)  # max_tokens kept modest for practicality
        return remove_duplicates(response['choices'][0]['text'])
    except Exception as e:
        print(f"Error generating response: {e}")
        return f"Error: {e}"
app = FastAPI()
origins = ["*"]
app.add_middleware(
    CORSMiddleware, allow_origins=origins, allow_credentials=True, allow_methods=["*"], allow_headers=["*"]
)
@app.post("/generate")
async def generate(request: ChatRequest):
    inputs = normalize_input(request.message)
    chunk_size = 400  # keep each prompt chunk small enough for the model's context window
    chunks = [inputs[i:i + chunk_size] for i in range(0, len(inputs), chunk_size)]
    overall_response = ""
    for chunk in chunks:
        with ThreadPoolExecutor() as executor:
            # Map each future back to its model name so results stay paired correctly,
            # since as_completed() yields futures in completion order, not submission order.
            futures = {executor.submit(generate_model_response, model, chunk): name for name, model in models.items()}
            responses = [{'model': futures[future], 'response': future.result()} for future in as_completed(futures)]
        for response in responses:
            overall_response += f"**{response['model']}:**\n{response['response']}\n\n"
    return {"response": overall_response}
def process_message(message, history):
    # Forward the message to the local FastAPI endpoint and append the reply to the chat history.
    try:
        port = os.environ.get("PORT", 7860)
        response = requests.post(f"http://localhost:{port}/generate", json={"message": message}).json()
        formatted_response = response["response"]
        history.append((message, formatted_response))
        return history, history
    except requests.exceptions.RequestException as e:
        history.append((message, f"Error: {e}"))
        return history, history
iface = gr.Interface(
    fn=process_message,
    inputs=[gr.Textbox(lines=2, placeholder="Enter your message here..."), gr.State([])],
    outputs=[gr.Chatbot(), gr.State([])],
    title="Multi-Model LLM API", description="Enter a message and get responses from multiple LLMs."
)
if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    # uvicorn.run() blocks, so run the FastAPI backend in a daemon thread and then
    # launch the Gradio UI (assumes the next port up is free for the UI).
    threading.Thread(target=uvicorn.run, args=(app,), kwargs={"host": "0.0.0.0", "port": port}, daemon=True).start()
    iface.launch(server_port=port + 1)