ryanzhangfan
committed on
Update README.md
Browse files
README.md
CHANGED
@@ -51,11 +51,11 @@ from transformers.generation import LogitsProcessorList, PrefixConstrainedLogits
|
|
51 |
import torch
|
52 |
|
53 |
import sys
|
54 |
-
sys.path.append(PATH_TO_BAAI_Emu3-
|
55 |
from processing_emu3 import Emu3Processor
|
56 |
|
57 |
# model path
|
58 |
-
EMU_HUB = "BAAI/Emu3-
|
59 |
VQ_HUB = "BAAI/Emu3-VisionTokenizer"
|
60 |
|
61 |
# prepare model and processor
|
@@ -67,11 +67,12 @@ model = AutoModelForCausalLM.from_pretrained(
|
|
67 |
trust_remote_code=True,
|
68 |
)
|
69 |
|
70 |
-
tokenizer = AutoTokenizer.from_pretrained(EMU_HUB, trust_remote_code=True)
|
71 |
image_processor = AutoImageProcessor.from_pretrained(VQ_HUB, trust_remote_code=True)
|
72 |
image_tokenizer = AutoModel.from_pretrained(VQ_HUB, device_map="cuda:0", trust_remote_code=True).eval()
|
73 |
-
processor = Emu3Processor(image_processor, image_tokenizer, tokenizer)
|
74 |
|
|
|
75 |
# prepare input
|
76 |
POSITIVE_PROMPT = " masterpiece, film grained, best quality."
|
77 |
NEGATIVE_PROMPT = "lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry."
|
@@ -126,4 +127,29 @@ for idx, im in enumerate(mm_list):
|
|
126 |
continue
|
127 |
im.save(f"result_{idx}.png")
|
128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
```
|
|
|
51 |
import torch
|
52 |
|
53 |
import sys
|
54 |
+
sys.path.append(PATH_TO_BAAI_Emu3-Stage1_MODEL)
|
55 |
from processing_emu3 import Emu3Processor
|
56 |
|
57 |
# model path
|
58 |
+
EMU_HUB = "BAAI/Emu3-Stage1"
|
59 |
VQ_HUB = "BAAI/Emu3-VisionTokenizer"
|
60 |
|
61 |
# prepare model and processor
|
|
|
67 |
trust_remote_code=True,
|
68 |
)
|
69 |
|
70 |
+
tokenizer = AutoTokenizer.from_pretrained(EMU_HUB, trust_remote_code=True, padding_side="left")
|
71 |
image_processor = AutoImageProcessor.from_pretrained(VQ_HUB, trust_remote_code=True)
|
72 |
image_tokenizer = AutoModel.from_pretrained(VQ_HUB, device_map="cuda:0", trust_remote_code=True).eval()
|
73 |
+
processor = Emu3Processor(image_processor, image_tokenizer, tokenizer, chat_template="{image_prompt}{text_prompt}")
|
74 |
|
75 |
+
# Image Generation
|
76 |
# prepare input
|
77 |
POSITIVE_PROMPT = " masterpiece, film grained, best quality."
|
78 |
NEGATIVE_PROMPT = "lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry."
|
|
|
127 |
continue
|
128 |
im.save(f"result_{idx}.png")
|
129 |
|
130 |
+
|
131 |
+
# Multimodal Understanding
|
132 |
+
text = "The image depicts "
|
133 |
+
image = Image.open("assets/demo.png")
|
134 |
+
|
135 |
+
inputs = processor(
|
136 |
+
text=text,
|
137 |
+
image=image,
|
138 |
+
mode='U',
|
139 |
+
padding="longest",
|
140 |
+
return_tensors="pt",
|
141 |
+
)
|
142 |
+
GENERATION_CONFIG = GenerationConfig(pad_token_id=tokenizer.pad_token_id, bos_token_id=tokenizer.bos_token_id, eos_token_id=tokenizer.eos_token_id)
|
143 |
+
|
144 |
+
outputs = model.generate(
|
145 |
+
inputs.input_ids.to("cuda:0"),
|
146 |
+
GENERATION_CONFIG,
|
147 |
+
max_new_tokens=1024,
|
148 |
+
attention_mask=inputs.attention_mask.to("cuda:0"),
|
149 |
+
)
|
150 |
+
outputs = outputs[:, inputs.input_ids.shape[-1]:]
|
151 |
+
answers = processor.batch_decode(outputs, skip_special_tokens=True)
|
152 |
+
for ans in answers:
|
153 |
+
print(ans)
|
154 |
+
|
155 |
```
|