from llama_cpp import Llama
from concurrent.futures import ThreadPoolExecutor
import uvicorn
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import os
import sys
from dotenv import load_dotenv
import gradio as gr
import requests
from pydantic import BaseModel

# Read the Hugging Face token from .env (if present) so hub downloads can authenticate.
load_dotenv()
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

global_data = {'models': {}, 'tokens': {k: k + '_token' for k in ['eos', 'pad', 'padding', 'unk', 'bos', 'sep', 'cls', 'mask']}}
model_configs = [{"repo_id": "Hjgugugjhuhjggg/mergekit-ties-tzamfyy-Q2_K-GGUF", "filename": "mergekit-ties-tzamfyy-q2_k.gguf", "name": "my_model"}]
models = {}
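
# Download each configured GGUF checkpoint from the Hugging Face Hub and wrap it in a llama.cpp model.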
def load_model(model_config):
    model_name = model_config['name']
    try:
        model = Llama.from_pretrained(repo_id=model_config['repo_id'], filename=model_config['filename'], use_auth_token=HUGGINGFACE_TOKEN)
        models[model_name] = model
        global_data['models'] = models
        return model
    except Exception as e:
        print(f"Error loading model {model_name}: {e}")
        return None

for config in model_configs:
    model = load_model(config)
    if model is None:
        sys.exit(1)
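
# Request schema for /generate; the helpers below trim whitespace and drop duplicated lines from model output.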
class ChatRequest(BaseModel):
    message: str

def normalize_input(input_text):
    return input_text.strip()

def remove_duplicates(text):
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    return '\n'.join(dict.fromkeys(lines))
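
# Run a single model on one chunk of input and clean up the raw completion text.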
def generate_model_response(model, inputs):
    try:
        if model is None:
            return "Model loading failed."
        response = model(inputs, max_tokens=512)  # max_tokens adjusted for practicality
        return remove_duplicates(response['choices'][0]['text'])
    except Exception as e:
        print(f"Error generating response: {e}")
        return f"Error: {e}"
app = FastAPI()
origins = ["*"]
app.add_middleware(
    CORSMiddleware, allow_origins=origins, allow_credentials=True, allow_methods=["*"], allow_headers=["*"]
)
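
# POST /generate: split the prompt into chunks and fan each chunk out to every loaded model in parallel.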
@app.post("/generate")
async def generate(request: ChatRequest):
    inputs = normalize_input(request.message)
    chunk_size = 400  # reduced chunk size to keep prompts manageable
    chunks = [inputs[i:i + chunk_size] for i in range(0, len(inputs), chunk_size)]
    overall_response = ""
    for chunk in chunks:
        with ThreadPoolExecutor() as executor:
            # Pair each model name with its own future so responses are attributed correctly.
            futures = {name: executor.submit(generate_model_response, model, chunk) for name, model in models.items()}
            responses = [{'model': name, 'response': future.result()} for name, future in futures.items()]
        for response in responses:
            overall_response += f"**{response['model']}:**\n{response['response']}\n\n"
    return {"response": overall_response}
async def process_message(message, history):
    try:
        port = int(os.environ.get("PORT", 7860))
        response = requests.post(f"http://localhost:{port}/generate", json={"message": message}).json()
        formatted_response = response["response"]
        history.append((message, formatted_response))
        return history, history
    except requests.exceptions.RequestException as e:
        # Report the error in the chat history rather than returning a bare string as the State output.
        history.append((message, f"Error: {e}"))
        return history, history
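
# Gradio chat front end; it forwards each message to the local /generate endpoint and keeps history in State.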
iface = gr.Interface(
    fn=process_message,
    inputs=[gr.Textbox(lines=2, placeholder="Enter your message here..."), gr.State([])],
    outputs=[gr.Chatbot(), gr.State([])],
    title="Multi-Model LLM API", description="Enter a message and get responses from multiple LLMs."
)
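
# uvicorn.run() blocks, so launching Gradio afterwards (and on the same port) would never execute.
# One common pattern is to mount the Gradio UI onto the FastAPI app with gr.mount_gradio_app so a
# single server on one port serves both the API and the UI.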
if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    app = gr.mount_gradio_app(app, iface, path="/")
    uvicorn.run(app, host="0.0.0.0", port=port)