Hjgugugjhuhjggg committed on
Commit 1f0a3a2 · verified · 1 Parent(s): 8806695

Update app.py

Files changed (1)
  1. app.py +21 -46
app.py CHANGED
@@ -14,28 +14,24 @@ from pydantic import BaseModel
 load_dotenv()
 HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
 
-global_data = {'models': {}, 'tokens': {'eos': 'eos_token', 'pad': 'pad_token', 'padding': 'padding_token', 'unk': 'unk_token', 'bos': 'bos_token', 'sep': 'sep_token', 'cls': 'cls_token', 'mask': 'mask_token'}}
-
+global_data = {'models': {}, 'tokens': {k: k + '_token' for k in ['eos', 'pad', 'padding', 'unk', 'bos', 'sep', 'cls', 'mask']}}
 model_configs = [{"repo_id": "Hjgugugjhuhjggg/mergekit-ties-tzamfyy-Q2_K-GGUF", "filename": "mergekit-ties-tzamfyy-q2_k.gguf", "name": "my_model"}]
-
 models = {}
 
 def load_model(model_config):
     model_name = model_config['name']
-    if model_name not in models:
-        try:
-            model = Llama.from_pretrained(repo_id=model_config['repo_id'], filename=model_config['filename'], use_auth_token=HUGGINGFACE_TOKEN)
-            models[model_name] = model
-            global_data['models'] = models
-            return model
-        except Exception as e:
-            print(f"Error loading model {model_name}: {e}")
-            return None
+    try:
+        model = Llama.from_pretrained(repo_id=model_config['repo_id'], filename=model_config['filename'], use_auth_token=HUGGINGFACE_TOKEN)
+        models[model_name] = model
+        global_data['models'] = models
+        return model
+    except Exception as e:
+        print(f"Error loading model {model_name}: {e}")
+        return None
 
 for config in model_configs:
     model = load_model(config)
     if model is None:
-        print(f"Failed to load model {config['name']}. Exiting.")
         exit(1)
 
 class ChatRequest(BaseModel):
@@ -45,21 +41,14 @@ def normalize_input(input_text):
     return input_text.strip()
 
 def remove_duplicates(text):
-    lines = text.split('\n')
-    unique_lines = []
-    seen_lines = set()
-    for line in lines:
-        line = line.strip()
-        if line and line not in seen_lines:
-            unique_lines.append(line)
-            seen_lines.add(line)
-    return '\n'.join(unique_lines)
+    lines = [line.strip() for line in text.split('\n') if line.strip()]
+    return '\n'.join(dict.fromkeys(lines))
 
 def generate_model_response(model, inputs):
     try:
         if model is None:
             return "Model loading failed."
-        response = model(inputs, max_tokens=-1)
+        response = model(inputs, max_tokens=1000) #Reduced chunk size for safety
         return remove_duplicates(response['choices'][0]['text'])
     except Exception as e:
         print(f"Error generating response: {e}")
@@ -68,53 +57,39 @@ def generate_model_response(model, inputs):
 app = FastAPI()
 origins = ["*"]
 app.add_middleware(
-    CORSMiddleware,
-    allow_origins=origins,
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
+    CORSMiddleware, allow_origins=origins, allow_credentials=True, allow_methods=["*"], allow_headers=["*"]
 )
 
 @app.post("/generate")
 async def generate(request: ChatRequest):
     inputs = normalize_input(request.message)
-    chunk_size = 500
+    chunk_size = 400 # Reduced chunk size further for this model
     chunks = [inputs[i:i + chunk_size] for i in range(0, len(inputs), chunk_size)]
     overall_response = ""
+
     for chunk in chunks:
         with ThreadPoolExecutor() as executor:
            futures = [executor.submit(generate_model_response, model, chunk) for model in models.values()]
-            responses = [{'model': model_name, 'response': future.result()} for model_name, future in zip(models.keys(), as_completed(futures))]
-
-            unique_responses = {}
-            for response in responses:
-                if response['model'] not in unique_responses and response['response']:
-                    unique_responses[response['model']] = response['response']
-
-            chunk_response = ""
-            for model, response in unique_responses.items():
-                chunk_response += f"**{model}:**\n{response}\n\n"
-            overall_response += chunk_response
-
+            responses = [{'model': name, 'response': future.result()} for name, future in zip(models, as_completed(futures))]
+            for response in responses: #Simplified response processing
+                overall_response += f"**{response['model']}:**\n{response['response']}\n\n"
     return {"response": overall_response}
 
 async def process_message(message, history):
     try:
-        port = int(os.environ.get("PORT", 7860))
+        port = os.environ.get("PORT", 7860)
         response = requests.post(f"http://localhost:{port}/generate", json={"message": message}).json()
         formatted_response = response["response"]
         history.append((message, formatted_response))
         return history, history
     except requests.exceptions.RequestException as e:
-        return history, f"Error communicating with the backend: {e}"
-
+        return history, f"Error: {e}"
 
 iface = gr.Interface(
     fn=process_message,
     inputs=[gr.Textbox(lines=2, placeholder="Enter your message here..."), gr.State([])],
     outputs=[gr.Chatbot(), gr.State([])],
-    title="Multi-Model LLM API",
-    description="Enter a message and get responses from multiple LLMs.",
+    title="Multi-Model LLM API", description="Enter a message and get responses from multiple LLMs."
 )
 
  if __name__ == "__main__":
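
For a quick check of the updated /generate route, a minimal client sketch follows. It assumes the app is running locally on the default port 7860 (the PORT fallback used in process_message), that ChatRequest exposes the message field used by the endpoint, and that the example prompt and timeout are hypothetical choices:

import requests

# Hypothetical smoke test for the /generate route above.
# Assumes the server is reachable at http://localhost:7860.
payload = {"message": "Summarize the benefits of quantized GGUF models."}
resp = requests.post("http://localhost:7860/generate", json=payload, timeout=300)
resp.raise_for_status()

# The endpoint returns {"response": "..."} with one "**model_name:**" block per loaded model.
print(resp.json()["response"])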