import gradio as gr
from transformers import AutoModelForCausalLM, AutoProcessor
import torch
from PIL import Image

# Load the model and processor
models = {
    "microsoft/Phi-3.5-vision-instruct": AutoModelForCausalLM.from_pretrained(
        "microsoft/Phi-3.5-vision-instruct", 
        trust_remote_code=True, 
        torch_dtype=torch.float32,  # use float32 since the model runs on CPU
        device_map="auto",  # let accelerate place the model weights automatically
        _attn_implementation="eager",  # use eager attention; FlashAttention2 is not available on CPU
        low_cpu_mem_usage=True  # reduce peak memory usage while loading
    ).eval()
}

processors = {
    "microsoft/Phi-3.5-vision-instruct": AutoProcessor.from_pretrained(
        "microsoft/Phi-3.5-vision-instruct", trust_remote_code=True
    )
}

DESCRIPTION = "[Phi-3.5-vision Demo](https://huggingface.co/microsoft/Phi-3.5-vision-instruct)"

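# Special tokens of the Phi-3 chat template: user turn, assistant turn, and end-of-turn marker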
user_prompt = '<|user|>\n'
assistant_prompt = '<|assistant|>\n'
prompt_suffix = "<|end|>\n"

def run_example(image, text_input=None, model_id="microsoft/Phi-3.5-vision-instruct"):
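    """Answer a text question about a single image with Phi-3.5-vision and return the decoded response."""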
    model = models[model_id]
    processor = processors[model_id]

    prompt = f"{user_prompt}<|image_1|>\n{text_input}{prompt_suffix}{assistant_prompt}"
    if image is None:
        return "Please upload an image first."
    image = Image.fromarray(image).convert("RGB")

    inputs = processor(prompt, image, return_tensors="pt")  # tensors stay on the CPU by default
    generate_ids = model.generate(
        **inputs,
        max_new_tokens=2048,
        eos_token_id=processor.tokenizer.eos_token_id,
    )
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]
    return response

css = """
  #output {
    height: 500px; 
    overflow: auto; 
    border: 1px solid #ccc; 
  }
"""

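# Build the Gradio UI: image and question on the left, generated text on the right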
with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Tab(label="Phi-3.5 Input"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
                model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="microsoft/Phi-3.5-vision-instruct")
                text_input = gr.Textbox(label="Question")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                output_text = gr.Textbox(label="Output Text")

        submit_btn.click(run_example, [input_img, text_input, model_selector], [output_text])

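# Queue requests and launch; keep the programmatic API open but hide the API docs in the footer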
demo.queue(api_open=True)
demo.launch(debug=True, show_api=False)