wifix199 committed on
Commit 76084c0 · verified · 1 Parent(s): b90bc2b

Update app.py

Files changed (1)
  1. app.py +39 -52
app.py CHANGED
@@ -1,57 +1,44 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-import os
-
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/en/guides/inference
-"""
-
-# Retrieve the Hugging Face token
-hf_token = os.environ.get("HF_TOKEN")
-if not hf_token:
-    raise ValueError("Please set the HF_TOKEN environment variable with your Hugging Face API token.")
-
-# Initialize the InferenceClient with a correct model
-client = InferenceClient("models/meta-llama/Llama-3.2-1B", token=hf_token)
-
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-
-    for user_input, assistant_response in history:
-        if user_input:
-            messages.append({"role": "user", "content": user_input})
-        if assistant_response:
-            messages.append({"role": "assistant", "content": assistant_response})
-
-    messages.append({"role": "user", "content": message})
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+import torch
+
+# Load the model and tokenizer
+model_name = "meta-llama/Llama-2-7b-chat-hf"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    device_map="auto",
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+)
 
-    response = ""
+# Initialize the pipeline
+generator = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    device_map="auto",
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+    max_new_tokens=512,
+)
 
-    # Start the chat completion
-    try:
-        for msg in client.chat_completion(
-            messages=messages,
-            max_new_tokens=max_tokens,
-            stream=True,
-            temperature=temperature,
-            top_p=top_p,
-        ):
-            token = msg.delta.get("content", "")
-            response += token
-            yield response
-    except Exception as e:
-        yield f"Error during inference: {e}"
+def respond(message, history, system_message, max_tokens, temperature, top_p):
+    prompt = f"{system_message}\n"
+    for user_msg, assistant_msg in history:
+        prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
+    prompt += f"User: {message}\nAssistant:"
+
+    response = generator(
+        prompt,
+        max_new_tokens=max_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        do_sample=True,
+    )[0]['generated_text']
+
+    assistant_response = response.replace(prompt, "").strip()
+    history.append((message, assistant_response))
+    return assistant_response, history
 
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
 demo = gr.ChatInterface(
     fn=respond,
     additional_inputs=[
@@ -66,8 +53,8 @@ demo = gr.ChatInterface(
             label="Top-p (nucleus sampling)",
         ),
     ],
-    title="Chat with Llama 2",
-    description="A chat interface using Llama 2 model via Hugging Face Inference API.",
+    title="Chat with LLaMA 2",
+    description="A chat interface using LLaMA 2 model locally via Transformers.",
 )
 
 if __name__ == "__main__":
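
Note on the new respond signature: gr.ChatInterface calls its function with (message, history, *additional_inputs) and expects only the assistant reply back, either a string or a generator of strings for streaming; it maintains the chat history itself. The committed version returns (assistant_response, history) and also mutates history in place, which ChatInterface does not expect. A minimal sketch of a compatible variant, assuming the `generator` pipeline defined in the new app.py above, could look like this:

def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Rebuild the prompt from the (user, assistant) pairs ChatInterface passes in.
    prompt = f"{system_message}\n"
    for user_msg, assistant_msg in history:
        prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
    prompt += f"User: {message}\nAssistant:"

    output = generator(
        prompt,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )[0]["generated_text"]

    # Return only the newly generated text; ChatInterface appends it to the
    # history on its own.
    return output[len(prompt):].strip()

Slicing with len(prompt) works because the text-generation pipeline returns the prompt followed by the continuation in generated_text by default, and it avoids the corner case where replace() would also strip a later occurrence of the prompt text inside the generated reply.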