import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

import gradio as gr
import requests
import uvicorn
from dotenv import load_dotenv
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from llama_cpp import Llama
from pydantic import BaseModel
load_dotenv()
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
global_data = {'models': {}, 'tokens': {k: k + '_token' for k in ['eos', 'pad', 'padding', 'unk', 'bos', 'sep', 'cls', 'mask']}}
model_configs = [{"repo_id": "Hjgugugjhuhjggg/mergekit-ties-tzamfyy-Q2_K-GGUF", "filename": "mergekit-ties-tzamfyy-q2_k.gguf", "name": "my_model"}]
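# More models can be listed here; each entry needs the Hub repo_id, the GGUF
# filename inside that repo, and a local display name, e.g. (placeholder values):
#   {"repo_id": "<user>/<repo>", "filename": "<model>.gguf", "name": "model_2"}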
models = {}
def load_model(model_config):
    # Download the GGUF weights from the Hub and register the model by name.
    model_name = model_config['name']
    try:
        model = Llama.from_pretrained(
            repo_id=model_config['repo_id'],
            filename=model_config['filename'],
            use_auth_token=HUGGINGFACE_TOKEN,
        )
        models[model_name] = model
        global_data['models'] = models
        return model
    except Exception as e:
        print(f"Error loading model {model_name}: {e}")
        return None
for config in model_configs:
    model = load_model(config)
    if model is None:
        exit(1)
class ChatRequest(BaseModel):
    message: str
def normalize_input(input_text):
    return input_text.strip()

def remove_duplicates(text):
    # Drop blank lines and keep only the first occurrence of each line, in order.
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    return '\n'.join(dict.fromkeys(lines))
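# Example: remove_duplicates("hi\nhi\nthere") -> "hi\nthere"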
def generate_model_response(model, inputs):
    try:
        if model is None:
            return "Model loading failed."
        response = model(inputs, max_tokens=512)  # max_tokens kept modest for responsiveness
        return remove_duplicates(response['choices'][0]['text'])
    except Exception as e:
        print(f"Error generating response: {e}")
        return f"Error: {e}"
app = FastAPI()
origins = ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
@app.post("/generate")
async def generate(request: ChatRequest):
inputs = normalize_input(request.message)
chunk_size = 400 #Reduced chunk size
chunks = [inputs[i:i + chunk_size] for i in range(0, len(inputs), chunk_size)]
overall_response = ""
for chunk in chunks:
with ThreadPoolExecutor() as executor:
futures = [executor.submit(generate_model_response, model, chunk) for model in models.values()]
responses = [{'model': name, 'response': future.result()} for name, future in zip(models, as_completed(futures))]
for response in responses:
overall_response += f"**{response['model']}:**\n{response['response']}\n\n"
return {"response": overall_response}
def process_message(message, history):
    try:
        port = os.environ.get("PORT", 7860)
        response = requests.post(f"http://localhost:{port}/generate", json={"message": message}).json()
        formatted_response = response["response"]
        history.append((message, formatted_response))
        return history, history
    except requests.exceptions.RequestException as e:
        # Surface the error in the chat history instead of corrupting the state output.
        history.append((message, f"Error: {e}"))
        return history, history
iface = gr.Interface(
    fn=process_message,
    inputs=[gr.Textbox(lines=2, placeholder="Enter your message here..."), gr.State([])],
    outputs=[gr.Chatbot(), gr.State([])],
    title="Multi-Model LLM API",
    description="Enter a message and get responses from multiple LLMs.",
)
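# The Textbox carries the user message and the gr.State list carries the chat
# history between turns; process_message returns the updated history twice so the
# Chatbot display and the stored state stay in sync.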
if __name__ == "__main__":
port = int(os.environ.get("PORT", 7860))
uvicorn.run(app, host="0.0.0.0", port=port)
iface.launch(server_port=7860) |