wifix199 committed on
Commit 76084c0 · verified · 1 Parent(s): b90bc2b

Update app.py

Files changed (1)
  1. app.py +39 -52
app.py CHANGED
@@ -1,57 +1,44 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-import os
-
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/en/guides/inference
-"""
-
-# Retrieve the Hugging Face token
-hf_token = os.environ.get("HF_TOKEN")
-if not hf_token:
-    raise ValueError("Please set the HF_TOKEN environment variable with your Hugging Face API token.")
-
-# Initialize the InferenceClient with a correct model
-client = InferenceClient("models/meta-llama/Llama-3.2-1B", token=hf_token)
-
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-
-    for user_input, assistant_response in history:
-        if user_input:
-            messages.append({"role": "user", "content": user_input})
-        if assistant_response:
-            messages.append({"role": "assistant", "content": assistant_response})
-
-    messages.append({"role": "user", "content": message})
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+import torch
+
+# Load the model and tokenizer
+model_name = "meta-llama/Llama-2-7b-chat-hf"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    device_map="auto",
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+)
 
-    response = ""
+# Initialize the pipeline
+generator = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+    device_map="auto",
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+    max_new_tokens=512,
+)
 
-    # Start the chat completion
-    try:
-        for msg in client.chat_completion(
-            messages=messages,
-            max_new_tokens=max_tokens,
-            stream=True,
-            temperature=temperature,
-            top_p=top_p,
-        ):
-            token = msg.delta.get("content", "")
-            response += token
-            yield response
-    except Exception as e:
-        yield f"Error during inference: {e}"
+def respond(message, history, system_message, max_tokens, temperature, top_p):
+    prompt = f"{system_message}\n"
+    for user_msg, assistant_msg in history:
+        prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
+    prompt += f"User: {message}\nAssistant:"
+
+    response = generator(
+        prompt,
+        max_new_tokens=max_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        do_sample=True,
+    )[0]['generated_text']
+
+    assistant_response = response.replace(prompt, "").strip()
+    history.append((message, assistant_response))
+    return assistant_response, history
 
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
 demo = gr.ChatInterface(
     fn=respond,
     additional_inputs=[
@@ -66,8 +53,8 @@ demo = gr.ChatInterface(
             label="Top-p (nucleus sampling)",
         ),
     ],
-    title="Chat with Llama 2",
-    description="A chat interface using Llama 2 model via Hugging Face Inference API.",
+    title="Chat with LLaMA 2",
+    description="A chat interface using LLaMA 2 model locally via Transformers.",
 )
 
 if __name__ == "__main__":
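
Note on the new respond signature: gr.ChatInterface calls its function with (message, history, *additional_inputs) and expects only the assistant reply back, either a string or a generator of strings for streaming; it maintains the chat history itself. The committed version returns (assistant_response, history) and also mutates history in place, which ChatInterface does not expect. A minimal sketch of a compatible variant, assuming the `generator` pipeline defined in the new app.py above, could look like this:

def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Rebuild the prompt from the (user, assistant) pairs ChatInterface passes in.
    prompt = f"{system_message}\n"
    for user_msg, assistant_msg in history:
        prompt += f"User: {user_msg}\nAssistant: {assistant_msg}\n"
    prompt += f"User: {message}\nAssistant:"

    output = generator(
        prompt,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )[0]["generated_text"]

    # Return only the newly generated text; ChatInterface appends it to the
    # history on its own.
    return output[len(prompt):].strip()

Slicing with len(prompt) works because the text-generation pipeline returns the prompt followed by the continuation in generated_text by default, and it avoids the corner case where replace() would also strip a later occurrence of the prompt text inside the generated reply.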