from llama_cpp import Llama
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import uvicorn
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import os
from dotenv import load_dotenv
import gradio as gr
import requests
from pydantic import BaseModel

load_dotenv()
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

# Shared state: loaded models plus placeholder special-token names.
global_data = {
    'models': {},
    'tokens': {k: k + '_token' for k in ['eos', 'pad', 'padding', 'unk', 'bos', 'sep', 'cls', 'mask']},
}

model_configs = [
    {
        "repo_id": "Hjgugugjhuhjggg/mergekit-ties-tzamfyy-Q2_K-GGUF",
        "filename": "mergekit-ties-tzamfyy-q2_k.gguf",
        "name": "my_model",
    },
]

models = {}


def load_model(model_config):
    """Download and load a GGUF model, registering it in the models dict."""
    model_name = model_config['name']
    try:
        model = Llama.from_pretrained(
            repo_id=model_config['repo_id'],
            filename=model_config['filename'],
            use_auth_token=HUGGINGFACE_TOKEN,
        )
        models[model_name] = model
        global_data['models'] = models
        return model
    except Exception as e:
        print(f"Error loading model {model_name}: {e}")
        return None


for config in model_configs:
    if load_model(config) is None:
        exit(1)


class ChatRequest(BaseModel):
    message: str


def normalize_input(input_text):
    return input_text.strip()


def remove_duplicates(text):
    # Drop blank lines and repeated lines while preserving order.
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    return '\n'.join(dict.fromkeys(lines))


def generate_model_response(model, inputs):
    try:
        if model is None:
            return "Model loading failed."
        response = model(inputs, max_tokens=512)  # max_tokens kept modest for practicality
        return remove_duplicates(response['choices'][0]['text'])
    except Exception as e:
        print(f"Error generating response: {e}")
        return f"Error: {e}"


app = FastAPI()
origins = ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.post("/generate")
async def generate(request: ChatRequest):
    inputs = normalize_input(request.message)
    chunk_size = 400  # keep each prompt chunk small
    chunks = [inputs[i:i + chunk_size] for i in range(0, len(inputs), chunk_size)]
    overall_response = ""
    for chunk in chunks:
        with ThreadPoolExecutor() as executor:
            # Map each future back to its model name so responses stay correctly
            # labeled even though as_completed yields them out of submission order.
            futures = {
                executor.submit(generate_model_response, model, chunk): name
                for name, model in models.items()
            }
            for future in as_completed(futures):
                overall_response += f"**{futures[future]}:**\n{future.result()}\n\n"
    return {"response": overall_response}


def process_message(message, history):
    try:
        port = os.environ.get("PORT", 7860)
        response = requests.post(f"http://localhost:{port}/generate", json={"message": message}).json()
        formatted_response = response["response"]
        history.append((message, formatted_response))
        return history, history
    except requests.exceptions.RequestException as e:
        return history, f"Error: {e}"


iface = gr.Interface(
    fn=process_message,
    inputs=[gr.Textbox(lines=2, placeholder="Enter your message here..."), gr.State([])],
    outputs=[gr.Chatbot(), gr.State([])],
    title="Multi-Model LLM API",
    description="Enter a message and get responses from multiple LLMs.",
)

if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    # uvicorn.run blocks, so serve the API in a background thread and keep the
    # Gradio UI in the main thread; Gradio then picks its own free port instead
    # of colliding with the API on the same port.
    threading.Thread(
        target=lambda: uvicorn.run(app, host="0.0.0.0", port=port),
        daemon=True,
    ).start()
    iface.launch()
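
# Example usage (a minimal sketch, not part of the app): once the API is running
# locally, the /generate endpoint can be exercised directly. The host and port
# below are assumptions for local testing; the payload shape follows ChatRequest.
#
#   import requests
#   resp = requests.post("http://localhost:7860/generate", json={"message": "Hello"})
#   print(resp.json()["response"])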