Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -1,11 +1,12 @@
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from llama_cpp import Llama
-from concurrent.futures import
+from concurrent.futures import ProcessPoolExecutor, as_completed
 import uvicorn
 from dotenv import load_dotenv
 from difflib import SequenceMatcher
 from tqdm import tqdm
+import multiprocessing
 
 load_dotenv()
 
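The hunk above replaces the old concurrent.futures import (truncated in this view) with an explicit import of ProcessPoolExecutor and as_completed, and adds an import of multiprocessing. A minimal, self-contained sketch of the submit/as_completed pattern those names provide; the square() task is illustrative only and not part of app.py:

from concurrent.futures import ProcessPoolExecutor, as_completed

def square(n):
    return n * n

if __name__ == "__main__":
    with ProcessPoolExecutor() as executor:
        # Submit one task per input; futures finish in whatever order the workers complete
        futures = [executor.submit(square, n) for n in range(4)]
        for future in as_completed(futures):
            print(future.result())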
@@ -19,9 +20,26 @@ models = [
     {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf"},
 ]
 
-#
-
-
+# Función para cargar un modelo
+def load_model(model_config):
+    return Llama.from_pretrained(repo_id=model_config['repo_id'], filename=model_config['filename'])
+
+# Cargar modelos en paralelo
+def load_all_models():
+    with ProcessPoolExecutor() as executor:
+        future_to_model = {executor.submit(load_model, model): model for model in models}
+        loaded_models = {}
+        for future in as_completed(future_to_model):
+            model = future_to_model[future]
+            try:
+                loaded_models[model['repo_id']] = future.result()
+                print(f"Modelo cargado en RAM: {model['repo_id']}")
+            except Exception as exc:
+                print(f"Error al cargar modelo {model['repo_id']}: {exc}")
+    return loaded_models
+
+# Cargar modelos en memoria
+llms = load_all_models()
 
 class ChatRequest(BaseModel):
     message: str
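The added load_model()/load_all_models() pair loads each GGUF file with Llama.from_pretrained and does so in parallel: every submit() call runs load_model in a separate worker process, and as_completed yields each model as soon as its load finishes. For reference, a self-contained sketch of loading and querying one of these models with llama-cpp-python's completion call; the prompt and max_tokens value are illustrative assumptions, not taken from app.py:

from llama_cpp import Llama

# repo_id/filename taken from the models list in app.py
llm = Llama.from_pretrained(
    repo_id="Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF",
    filename="gemma-2-27b-q2_k.gguf",
)
output = llm("Write a haiku about the sea.", max_tokens=64)  # illustrative prompt
print(output["choices"][0]["text"])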
@@ -46,9 +64,24 @@ def generate_chat_response(request, llm):
 def normalize_input(input_text):
     return input_text.strip()
 
+def filter_duplicates(responses):
+    seen = set()
+    unique_responses = []
+    for response in responses:
+        lines = response.split('\n')
+        unique_lines = set()
+        for line in lines:
+            if line not in seen:
+                seen.add(line)
+                unique_lines.add(line)
+        unique_responses.append('\n'.join(unique_lines))
+    return unique_responses
+
 def select_best_response(responses):
+    # Eliminar respuestas duplicadas
+    unique_responses = filter_duplicates(responses)
     # Deduplicar respuestas
-    unique_responses = list(set(
+    unique_responses = list(set(unique_responses))
     # Filtrar respuestas coherentes
     coherent_responses = filter_by_coherence(unique_responses)
     # Seleccionar la mejor respuesta
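The new filter_duplicates() drops any line that already appeared in an earlier response and rebuilds each response from its remaining lines; select_best_response() then deduplicates whole responses and passes them to filter_by_coherence(), which is not shown in this diff. A small runnable demonstration; the function body is copied from the hunk above and the two sample responses are made up:

def filter_duplicates(responses):
    seen = set()
    unique_responses = []
    for response in responses:
        lines = response.split('\n')
        unique_lines = set()
        for line in lines:
            if line not in seen:
                seen.add(line)
                unique_lines.add(line)
        unique_responses.append('\n'.join(unique_lines))
    return unique_responses

print(filter_duplicates(["hola\nmundo", "hola\nque tal"]))
# -> ['hola\nmundo', 'que tal']  (order within a response may vary, since unique_lines is a set)

Note that because unique_lines is a set, the original line order inside each response is not guaranteed after the join.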
@@ -76,33 +109,27 @@ async def generate_chat(request: ChatRequest):
 
     print(f"Procesando solicitud: {request.message}")
 
-    # Utilizar un
-
-
+    # Utilizar un ProcessPoolExecutor para procesar los modelos en paralelo
+    def worker_function(llm):
+        return generate_chat_response(request, llm)
+
+    with ProcessPoolExecutor() as executor:
+        futures = [executor.submit(worker_function, llm) for llm in llms.values()]
     responses = []
 
     for future in tqdm(as_completed(futures), total=len(futures), desc="Generando respuestas"):
         response = future.result()
-        responses.append(response)
+        responses.append(response['response'])
         print(f"Modelo procesado: {response['literal'][:30]}...")
 
-    # Extraer respuestas de los diccionarios
-    response_texts = [resp['response'] for resp in responses]
-
-    # Verificar si hay errores en las respuestas
-    error_responses = [resp for resp in responses if "Error" in resp['response']]
-    if error_responses:
-        error_response = error_responses[0]
-        raise HTTPException(status_code=500, detail=error_response['response'])
-
     # Seleccionar la mejor respuesta
-    best_response = select_best_response(
+    best_response = select_best_response(responses)
 
     print(f"Mejor respuesta seleccionada: {best_response}")
 
     return {
         "best_response": best_response,
-        "all_responses":
+        "all_responses": responses
     }
 
 if __name__ == "__main__":
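After this change the endpoint fans one request out to every loaded model, collects the 'response' field of each result, and returns both the selected best answer and the full list. A hedged client-side sketch of calling it; the route path "/chat" and port 8000 are assumptions, since the route decorator is outside this diff:

import requests

resp = requests.post(
    "http://localhost:8000/chat",  # path and port are assumptions, not from the diff
    json={"message": "Hello, which models are you running?"},  # matches ChatRequest.message
)
data = resp.json()
print(data["best_response"])
print(len(data["all_responses"]), "responses received")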
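The diff's final context line is the if __name__ == "__main__": guard; its body is not part of this change. For orientation only, a typical uvicorn launch for a FastAPI app defined in app.py; the host and port here are assumptions, not taken from the commit:

import uvicorn

if __name__ == "__main__":
    # "app:app" points at the FastAPI instance in app.py; host/port are assumed values
    uvicorn.run("app:app", host="0.0.0.0", port=7860)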