Uhhy committed
Commit 1608585
Parent(s): e3a7b6f

Update app.py

Files changed (1): app.py (+47, -20)
app.py CHANGED
@@ -1,11 +1,12 @@
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from llama_cpp import Llama
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from concurrent.futures import ProcessPoolExecutor, as_completed
 import uvicorn
 from dotenv import load_dotenv
 from difflib import SequenceMatcher
 from tqdm import tqdm
+import multiprocessing
 
 load_dotenv()
 
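The import swap above moves the fan-out from threads to worker processes. A minimal sketch, not from app.py, of what that changes in practice: ProcessPoolExecutor pickles the submitted callable and its arguments into separate processes, so only module-level functions and picklable data can be submitted, while ThreadPoolExecutor runs everything inside the server process and also accepts closures.

# Minimal sketch (not part of the commit): same submit/as_completed pattern,
# but a process pool needs a module-level, picklable callable.
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed

def square(x):  # module-level, so it can be pickled into the worker processes
    return x * x

if __name__ == "__main__":
    with ProcessPoolExecutor() as executor:
        futures = [executor.submit(square, n) for n in range(4)]
        print(sorted(f.result() for f in as_completed(futures)))

    with ThreadPoolExecutor() as executor:
        # Threads share the parent's memory, so lambdas/closures work too.
        futures = [executor.submit(lambda n=n: n * n) for n in range(4)]
        print(sorted(f.result() for f in as_completed(futures)))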
@@ -19,9 +20,26 @@ models = [
     {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf"},
 ]
 
-# Load the models into RAM only once
-llms = [Llama.from_pretrained(repo_id=model['repo_id'], filename=model['filename']) for model in models]
-print(f"Models loaded into RAM: {[model['repo_id'] for model in models]}")
+# Function to load a single model
+def load_model(model_config):
+    return Llama.from_pretrained(repo_id=model_config['repo_id'], filename=model_config['filename'])
+
+# Load the models in parallel
+def load_all_models():
+    with ProcessPoolExecutor() as executor:
+        future_to_model = {executor.submit(load_model, model): model for model in models}
+        loaded_models = {}
+        for future in as_completed(future_to_model):
+            model = future_to_model[future]
+            try:
+                loaded_models[model['repo_id']] = future.result()
+                print(f"Model loaded into RAM: {model['repo_id']}")
+            except Exception as exc:
+                print(f"Error loading model {model['repo_id']}: {exc}")
+    return loaded_models
+
+# Load the models into memory
+llms = load_all_models()
 
 class ChatRequest(BaseModel):
     message: str
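One hedged note on the loader above: ProcessPoolExecutor sends each future.result() back to the parent via pickling, and llama-cpp-python's Llama objects wrap native handles that typically cannot be pickled. If loading in worker processes fails for that reason, the same fan-out pattern with threads keeps every model inside the server process; a sketch reusing the commit's load_model and models definitions:

# Sketch under the assumption that Llama instances cannot cross process
# boundaries; load_model and models are the definitions from app.py above.
from concurrent.futures import ThreadPoolExecutor, as_completed

def load_all_models_threaded():
    loaded_models = {}
    with ThreadPoolExecutor() as executor:
        future_to_model = {executor.submit(load_model, model): model for model in models}
        for future in as_completed(future_to_model):
            model = future_to_model[future]
            try:
                loaded_models[model['repo_id']] = future.result()
            except Exception as exc:
                print(f"Error loading model {model['repo_id']}: {exc}")
    return loaded_models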
@@ -46,9 +64,24 @@ def generate_chat_response(request, llm)
 def normalize_input(input_text):
     return input_text.strip()
 
+def filter_duplicates(responses):
+    seen = set()
+    unique_responses = []
+    for response in responses:
+        lines = response.split('\n')
+        unique_lines = set()
+        for line in lines:
+            if line not in seen:
+                seen.add(line)
+                unique_lines.add(line)
+        unique_responses.append('\n'.join(unique_lines))
+    return unique_responses
+
 def select_best_response(responses):
+    # Remove duplicate responses
+    unique_responses = filter_duplicates(responses)
     # Deduplicate the responses
-    unique_responses = list(set(responses))
+    unique_responses = list(set(unique_responses))
     # Filter for coherent responses
     coherent_responses = filter_by_coherence(unique_responses)
     # Select the best response
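filter_by_coherence and the final selection step are referenced in this hunk but their bodies fall outside the diff. A hypothetical sketch of what they could look like, using the difflib.SequenceMatcher import already at the top of app.py; the names, threshold, and selection rule below are illustrative, not from the commit:

# Hypothetical helpers; only the filter_by_coherence name is taken from the diff.
from difflib import SequenceMatcher

def filter_by_coherence(responses, min_length=10):
    # Assumption: treat very short strings as incoherent, but never return an empty list.
    return [r for r in responses if len(r.strip()) >= min_length] or responses

def select_most_consistent(responses):
    # Pick the response with the highest average similarity to the others.
    if len(responses) < 2:
        return responses[0] if responses else ""
    def avg_ratio(candidate):
        others = [r for r in responses if r is not candidate]
        return sum(SequenceMatcher(None, candidate, o).ratio() for o in others) / len(others)
    return max(responses, key=avg_ratio)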
@@ -76,33 +109,27 @@ async def generate_chat(request: ChatRequest):
 
     print(f"Processing request: {request.message}")
 
-    # Use a ThreadPoolExecutor to process the models in parallel
-    with ThreadPoolExecutor() as executor:
-        futures = [executor.submit(generate_chat_response, request, llm) for llm in llms]
+    # Use a ProcessPoolExecutor to process the models in parallel
+    def worker_function(llm):
+        return generate_chat_response(request, llm)
+
+    with ProcessPoolExecutor() as executor:
+        futures = [executor.submit(worker_function, llm) for llm in llms.values()]
         responses = []
 
         for future in tqdm(as_completed(futures), total=len(futures), desc="Generating responses"):
             response = future.result()
-            responses.append(response)
+            responses.append(response['response'])
             print(f"Model processed: {response['literal'][:30]}...")
 
-    # Extract the responses from the dictionaries
-    response_texts = [resp['response'] for resp in responses]
-
-    # Check the responses for errors
-    error_responses = [resp for resp in responses if "Error" in resp['response']]
-    if error_responses:
-        error_response = error_responses[0]
-        raise HTTPException(status_code=500, detail=error_response['response'])
-
     # Select the best response
-    best_response = select_best_response(response_texts)
+    best_response = select_best_response(responses)
 
     print(f"Best response selected: {best_response}")
 
     return {
         "best_response": best_response,
-        "all_responses": response_texts
+        "all_responses": responses
     }
 
 if __name__ == "__main__":
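For reference, a client-side call against the updated handler. The route decorator and server port sit outside this diff, so the "/chat" path and port 7860 below are placeholders; only the "message" request field and the "best_response"/"all_responses" keys come from the code above.

# Hypothetical usage; path and port are assumptions, while the payload and
# response keys match the ChatRequest model and the return dict in the diff.
import requests

reply = requests.post("http://localhost:7860/chat", json={"message": "Hello"}).json()
print(reply["best_response"])
print(len(reply["all_responses"]), "model responses received")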