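"""FastAPI + Gradio app that serves GGUF models through llama-cpp-python.

The /generate endpoint splits the input into chunks, queries every loaded model
in a thread pool, and concatenates the per-model answers; the Gradio UI posts to
that endpoint and displays the result as a chat history.
"""
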
from llama_cpp import Llama
from concurrent.futures import ThreadPoolExecutor
import threading
import uvicorn
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import os
from dotenv import load_dotenv
import gradio as gr
import requests
from pydantic import BaseModel

# Load environment variables; huggingface_hub reads HF_TOKEN for gated/private repo downloads.
load_dotenv()
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
if HUGGINGFACE_TOKEN:
    os.environ.setdefault("HF_TOKEN", HUGGINGFACE_TOKEN)

# Shared state: loaded models plus placeholder names for common special tokens.
global_data = {'models': {}, 'tokens': {k: k + '_token' for k in ['eos', 'pad', 'padding', 'unk', 'bos', 'sep', 'cls', 'mask']}}

model_configs = [{"repo_id": "Hjgugugjhuhjggg/mergekit-ties-tzamfyy-Q2_K-GGUF", "filename": "mergekit-ties-tzamfyy-q2_k.gguf", "name": "my_model"}]

models = {}

def load_model(model_config):
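    """Download (if needed) and load one GGUF model; logs and returns None on failure."""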
    model_name = model_config['name']
    try:
        # The Hugging Face token, if set, is picked up from the HF_TOKEN environment variable.
        model = Llama.from_pretrained(repo_id=model_config['repo_id'], filename=model_config['filename'])
        models[model_name] = model
        global_data['models'] = models
        return model
    except Exception as e:
        print(f"Error loading model {model_name}: {e}")
        return None

# Load every configured model at startup; abort if any of them fails.
for config in model_configs:
    model = load_model(config)
    if model is None:
        raise SystemExit(1)

class ChatRequest(BaseModel):
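    """Request body for the /generate endpoint."""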
    message: str

def normalize_input(input_text):
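    """Trim surrounding whitespace from the user message."""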
    return input_text.strip()

def remove_duplicates(text):
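    """Drop blank lines and repeated lines while preserving first-seen order."""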
    lines = [line.strip() for line in text.split('\n') if line.strip()]
    return '\n'.join(dict.fromkeys(lines))

def generate_model_response(model, inputs):
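    """Run one completion on a single model and deduplicate repeated lines in its output."""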
    try:
        if model is None:
            return "Model loading failed."
        response = model(inputs, max_tokens=512)  # cap the completion length for each chunk
        return remove_duplicates(response['choices'][0]['text'])
    except Exception as e:
        print(f"Error generating response: {e}")
        return f"Error: {e}"

app = FastAPI()
origins = ["*"]  # allow cross-origin browser requests from any origin
app.add_middleware(
    CORSMiddleware, allow_origins=origins, allow_credentials=True, allow_methods=["*"], allow_headers=["*"]
)

@app.post("/generate")
async def generate(request: ChatRequest):
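    """Chunk the normalized input, fan each chunk out to every model, and concatenate the answers."""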
    inputs = normalize_input(request.message)
    chunk_size = 400  # split long inputs into 400-character chunks
    chunks = [inputs[i:i + chunk_size] for i in range(0, len(inputs), chunk_size)]
    overall_response = ""
    for chunk in chunks:
        # Query every loaded model in parallel, keyed by name so each response
        # stays paired with the model that produced it.
        with ThreadPoolExecutor() as executor:
            futures = {name: executor.submit(generate_model_response, model, chunk) for name, model in models.items()}
            responses = [{'model': name, 'response': future.result()} for name, future in futures.items()]
        for response in responses:
            overall_response += f"**{response['model']}:**\n{response['response']}\n\n"
    return {"response": overall_response}

def process_message(message, history):
    """Gradio handler: forward the message to the local /generate endpoint and append the reply to the history."""
    # Plain (non-async) function: Gradio runs it in a worker thread, so the blocking requests call is fine.
    try:
        port = int(os.environ.get("PORT", 7860))
        response = requests.post(f"http://localhost:{port}/generate", json={"message": message}).json()
        history.append((message, response["response"]))
    except requests.exceptions.RequestException as e:
        # Show request errors in the chat instead of returning a bare string as the state.
        history.append((message, f"Error: {e}"))
    return history, history

iface = gr.Interface(
    fn=process_message,
    inputs=[gr.Textbox(lines=2, placeholder="Enter your message here..."), gr.State([])],
    outputs=[gr.Chatbot(), gr.State([])],
    title="Multi-Model LLM API", description="Enter a message and get responses from multiple LLMs."
)

if __name__ == "__main__":
    port = int(os.environ.get("PORT", 7860))
    # uvicorn.run() blocks, so run the API in a background thread and the Gradio UI in the main thread.
    threading.Thread(target=uvicorn.run, args=(app,), kwargs={"host": "0.0.0.0", "port": port}, daemon=True).start()
    iface.launch(server_port=port + 1)  # port + 1 assumed free; avoids colliding with the API port