Update app.py
app.py
CHANGED
@@ -1,7 +1,7 @@
 from llama_cpp import Llama
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import uvicorn
-from fastapi import FastAPI, HTTPException
+from fastapi import FastAPI, HTTPException
 import os
 from dotenv import load_dotenv
 from pydantic import BaseModel
@@ -15,16 +15,20 @@ from faker import Faker
 import gradio as gr
 from threading import Thread
 
+# Download NLTK resources
 nltk.download('punkt')
 nltk.download('stopwords')
 
+# Load environment variables from .env file
 load_dotenv()
 HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
 
+# Set up logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s')
 
 fake = Faker()
 
+# Global data structure to hold models and configurations
 global_data = {
     'models': {},
     'tokens': {
@@ -45,31 +49,32 @@ global_data = {
     'model_params': {},
 }
 
+# Model configurations
 model_configs = [
     {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Q2_K-GGUF", "filename": "meta-llama-3.1-70b-q2_k.gguf", "name": "meta-llama-3.1-70b", "seed": 42, "n_ctx": 1024},
     {"repo_id": "Ffftdtd5dtft/gemma-2-27b-Q2_K-GGUF", "filename": "gemma-2-27b-q2_k.gguf", "name": "gemma-2-27b", "seed": 42, "n_ctx": 1024}
 ]
 
+# Function to load model
 def load_model(model_config):
     model_name = model_config['name']
     if model_name not in global_data['models']:
         try:
             device = "cuda" if torch.cuda.is_available() else "cpu"
 
-            # Manually define the context parameters
             context_params = {
                 "seed": model_config.get('seed', 42),
                 "n_ctx": model_config.get('n_ctx', 1024)
             }
 
-            # Initialize
+            # Initialize model
             model = Llama.from_pretrained(
                 repo_id=model_config['repo_id'],
                 filename=model_config['filename'],
                 use_auth_token=HUGGINGFACE_TOKEN,
                 verbose=True,
                 device=device,
-                context_params=context_params
+                context_params=context_params
             )
 
             global_data['models'][model_name] = model
@@ -79,11 +84,11 @@ def load_model(model_config):
             logging.critical(f"CRITICAL ERROR loading model '{model_name}': {e}", exc_info=True)
             return None
 
-# Load all models
+# Load all models at the start
 for config in model_configs:
     load_model(config)
 
-#
+# Pydantic model to validate incoming requests
 class ChatRequest(BaseModel):
     message: str
 
@@ -91,7 +96,7 @@ class ChatRequest(BaseModel):
 def normalize_input(input_text):
     return input_text.strip()
 
-# Function to remove duplicate sentences
+# Function to remove duplicate sentences
 def remove_duplicates(text, similarity_threshold=0.85):
     sentences = sent_tokenize(text)
     unique_sentences = []
@@ -106,12 +111,13 @@ def remove_duplicates(text, similarity_threshold=0.85):
         unique_sentences.append(sentence)
     return " ".join(unique_sentences)
 
-#
+# Function to handle model response generation with GPU fallback
 @spaces.GPU(duration=0)
 def generate_model_response(model, inputs, model_config):
     try:
         if model is None:
             return []
+
         responses = []
         model_metadata = global_data['model_metadata'].get(model_config['name'], {})
         stop_tokens = [global_data['tokens'].get('eos', '<|end_of_text|>')]
@@ -140,7 +146,7 @@ def generate_model_response(model, inputs, model_config):
 # FastAPI app
 app = FastAPI()
 
-# POST endpoint to handle chat requests
+# FastAPI POST endpoint to handle chat requests
 @app.post("/chat")
 async def chat(request: ChatRequest):
     input_text = normalize_input(request.message)
@@ -152,7 +158,7 @@ async def chat(request: ChatRequest):
     response = generate_model_response(model_instance, input_text, model_configs[0])
     return {"response": response[0] if response else "No response generated."}
 
-# Gradio
+# Gradio interface for model testing
 def gradio_interface(input_text):
     model_name = "meta-llama-3.1-70b"
     model_instance = global_data['models'].get(model_name, None)
@@ -161,14 +167,19 @@ def gradio_interface(input_text):
     response = generate_model_response(model_instance, input_text, model_configs[0])
     return response[0] if response else "No response generated."
 
-# Gradio
+# Gradio interface setup
 def start_gradio_interface():
     gr.Interface(fn=gradio_interface, inputs="text", outputs="text").launch(share=True)
 
-# Run Gradio in a separate thread
-
-gradio_thread
+# Run Gradio in a separate thread
+def start_gradio():
+    gradio_thread = Thread(target=start_gradio_interface)
+    gradio_thread.daemon = True  # Ensures the thread will exit when the main program exits
+    gradio_thread.start()
+
+# Start the Gradio interface
+start_gradio()
 
-# Run
+# Run FastAPI app using uvicorn
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)
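
For reference, a minimal sketch of how the /chat endpoint in app.py could be exercised once the Space is running. This is an illustration only: it assumes the server is reachable at http://localhost:7860 (the host and port passed to uvicorn.run above) and uses the third-party requests library, neither of which is part of this commit.

# Hypothetical client for the /chat endpoint defined in app.py.
# Assumes the FastAPI app is reachable at http://localhost:7860.
import requests

resp = requests.post(
    "http://localhost:7860/chat",
    json={"message": "Hello, what can you do?"},  # matches the ChatRequest model (a single 'message' field)
    timeout=120,  # generation from a quantized 70B model can be slow
)
resp.raise_for_status()
print(resp.json()["response"])  # the endpoint returns {"response": "..."}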