from llama_cpp import Llama
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import uvicorn
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import os
from dotenv import load_dotenv
import gradio as gr
import requests
from pydantic import BaseModel

load_dotenv()
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

# Shared state: loaded models plus placeholder special-token names.
global_data = {
    'models': {},
    'tokens': {k: k + '_token' for k in ['eos', 'pad', 'padding', 'unk', 'bos', 'sep', 'cls', 'mask']},
}

model_configs = [
    {
        "repo_id": "Hjgugugjhuhjggg/mergekit-ties-tzamfyy-Q2_K-GGUF",
        "filename": "mergekit-ties-tzamfyy-q2_k.gguf",
        "name": "my_model",
    },
]

models = {}


def load_model(model_config):
    """Download and load a GGUF model, registering it in the models dict."""
    model_name = model_config['name']
    try:
        model = Llama.from_pretrained(
            repo_id=model_config['repo_id'],
            filename=model_config['filename'],
            use_auth_token=HUGGINGFACE_TOKEN,
        )
        models[model_name] = model
        global_data['models'] = models
        return model
    except Exception as e:
        print(f"Error loading model {model_name}: {e}")
        return None


for config in model_configs:
    if load_model(config) is None:
        exit(1)


class ChatRequest(BaseModel):
    message: str


def normalize_input(input_text):
    return input_text.strip()


def remove_duplicates(text):
    # Drop blank lines and repeated lines while preserving order.
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    return '\n'.join(dict.fromkeys(lines))


def generate_model_response(model, inputs):
    try:
        if model is None:
            return "Model loading failed."
        response = model(inputs, max_tokens=512)  # max_tokens kept modest for practicality
        return remove_duplicates(response['choices'][0]['text'])
    except Exception as e:
        print(f"Error generating response: {e}")
        return f"Error: {e}"


app = FastAPI()
origins = ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.post("/generate")
async def generate(request: ChatRequest):
    inputs = normalize_input(request.message)
    chunk_size = 400  # keep each prompt chunk small
    chunks = [inputs[i:i + chunk_size] for i in range(0, len(inputs), chunk_size)]
    overall_response = ""
    for chunk in chunks:
        with ThreadPoolExecutor() as executor:
            # Map each future back to its model name so responses stay correctly
            # labeled even though as_completed yields them out of submission order.
            futures = {
                executor.submit(generate_model_response, model, chunk): name
                for name, model in models.items()
            }
            for future in as_completed(futures):
                overall_response += f"**{futures[future]}:**\n{future.result()}\n\n"
    return {"response": overall_response}


def process_message(message, history):
    try:
        port = os.environ.get("PORT", 7860)
        response = requests.post(f"http://localhost:{port}/generate", json={"message": message}).json()
        formatted_response = response["response"]
        history.append((message, formatted_response))
        return history, history
    except requests.exceptions.RequestException as e:
        return history, f"Error: {e}"


iface = gr.Interface(
    fn=process_message,
    inputs=[gr.Textbox(lines=2, placeholder="Enter your message here..."), gr.State([])],
    outputs=[gr.Chatbot(), gr.State([])],
    title="Multi-Model LLM API",
    description="Enter a message and get responses from multiple LLMs.",
)

if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    # uvicorn.run blocks, so serve the API in a background thread and keep the
    # Gradio UI in the main thread; Gradio then picks its own free port instead
    # of colliding with the API on the same port.
    threading.Thread(
        target=lambda: uvicorn.run(app, host="0.0.0.0", port=port),
        daemon=True,
    ).start()
    iface.launch()
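
# Example usage (a minimal sketch, not part of the app): once the API is running
# locally, the /generate endpoint can be exercised directly. The host and port
# below are assumptions for local testing; the payload shape follows ChatRequest.
#
#   import requests
#   resp = requests.post("http://localhost:7860/generate", json={"message": "Hello"})
#   print(resp.json()["response"])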