Ravinandan committed on
Commit
dcfbbe4
1 Parent(s): 0823dd5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -21
app.py CHANGED
@@ -1,11 +1,8 @@
1
  import torch
2
  from transformers import AutoModelForCausalLM, AutoTokenizer
 
3
  import gradio as gr
4
 
5
- # Set environment variable for PyTorch CUDA memory management
6
- import os
7
- os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
8
-
9
  # Load the model and tokenizer
10
  model = AutoModelForCausalLM.from_pretrained(
11
  "qresearch/llama-3.1-8B-vision-378",
@@ -17,21 +14,9 @@ tokenizer = AutoTokenizer.from_pretrained("qresearch/llama-3.1-8B-vision-378", u
17
 
18
  # Define the function to process the image and instruction
19
  def describe_image(image, instruction):
20
- # Convert the image to text representation (assuming image processing is handled)
21
- inputs = tokenizer(instruction, return_tensors="pt").to("cuda")
22
-
23
- # Generate a description using the model
24
- with torch.no_grad(): # Avoid storing gradients to save memory
25
- outputs = model.generate(
26
- **inputs,
27
- max_new_tokens=128,
28
- do_sample=True,
29
- temperature=0.3
30
- )
31
-
32
- # Decode the generated tokens to a string
33
- description = tokenizer.decode(outputs[0], skip_special_tokens=True)
34
-
35
  return description
36
 
37
  # Create the Gradio interface
@@ -42,9 +27,9 @@ interface = gr.Interface(
42
  gr.Textbox(placeholder="Enter your instruction here...", label="Instruction") # Input for the instruction
43
  ],
44
  outputs="text", # Output is text (the description)
45
- title="LLaMA 3.1 with Vision",
46
  description="Upload an image and enter an instruction to generate a description based on the provided instruction."
47
  )
48
 
49
  # Launch the Gradio app
50
- interface.launch()
 
1
  import torch
2
  from transformers import AutoModelForCausalLM, AutoTokenizer
3
+ from PIL import Image
4
  import gradio as gr
5
 
 
 
 
 
6
  # Load the model and tokenizer
7
  model = AutoModelForCausalLM.from_pretrained(
8
  "qresearch/llama-3.1-8B-vision-378",
 
14
 
15
  # Define the function to process the image and instruction
16
  def describe_image(image, instruction):
17
+ description = model.answer_question(
18
+ image, instruction, tokenizer, max_new_tokens=128, do_sample=True, temperature=0.3
19
+ )
 
 
 
 
 
 
 
 
 
 
 
 
20
  return description
21
 
22
  # Create the Gradio interface
 
27
  gr.Textbox(placeholder="Enter your instruction here...", label="Instruction") # Input for the instruction
28
  ],
29
  outputs="text", # Output is text (the description)
30
+ title="LLaMA 3.1 with vision",
31
  description="Upload an image and enter an instruction to generate a description based on the provided instruction."
32
  )
33
 
34
  # Launch the Gradio app
35
+ interface.launch()