ryanzhangfan committed · Commit 3578624 · verified · 1 Parent(s): 36cf688

Update README.md

Files changed (1): README.md (+30 −4)
README.md CHANGED
@@ -51,11 +51,11 @@ from transformers.generation import LogitsProcessorList, PrefixConstrainedLogits
 import torch
 
 import sys
-sys.path.append(PATH_TO_BAAI_Emu3-Gen_MODEL)
+sys.path.append(PATH_TO_BAAI_Emu3-Stage1_MODEL)
 from processing_emu3 import Emu3Processor
 
 # model path
-EMU_HUB = "BAAI/Emu3-Gen"
+EMU_HUB = "BAAI/Emu3-Stage1"
 VQ_HUB = "BAAI/Emu3-VisionTokenizer"
 
 # prepare model and processor
@@ -67,11 +67,12 @@ model = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True,
 )
 
-tokenizer = AutoTokenizer.from_pretrained(EMU_HUB, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(EMU_HUB, trust_remote_code=True, padding_side="left")
 image_processor = AutoImageProcessor.from_pretrained(VQ_HUB, trust_remote_code=True)
 image_tokenizer = AutoModel.from_pretrained(VQ_HUB, device_map="cuda:0", trust_remote_code=True).eval()
-processor = Emu3Processor(image_processor, image_tokenizer, tokenizer)
+processor = Emu3Processor(image_processor, image_tokenizer, tokenizer, chat_template="{image_prompt}{text_prompt}")
 
+# Image Generation
 # prepare input
 POSITIVE_PROMPT = " masterpiece, film grained, best quality."
 NEGATIVE_PROMPT = "lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry."
@@ -126,4 +127,29 @@ for idx, im in enumerate(mm_list):
         continue
     im.save(f"result_{idx}.png")
 
+
+# Multimodal Understanding
+text = "The image depicts "
+image = Image.open("assets/demo.png")
+
+inputs = processor(
+    text=text,
+    image=image,
+    mode='U',
+    padding="longest",
+    return_tensors="pt",
+)
+GENERATION_CONFIG = GenerationConfig(pad_token_id=tokenizer.pad_token_id, bos_token_id=tokenizer.bos_token_id, eos_token_id=tokenizer.eos_token_id)
+
+outputs = model.generate(
+    inputs.input_ids.to("cuda:0"),
+    GENERATION_CONFIG,
+    max_new_tokens=1024,
+    attention_mask=inputs.attention_mask.to("cuda:0"),
+)
+outputs = outputs[:, inputs.input_ids.shape[-1]:]
+answers = processor.batch_decode(outputs, skip_special_tokens=True)
+for ans in answers:
+    print(ans)
+
 ```
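
For readers who want to try the newly added multimodal-understanding path without reassembling it from the hunks above, here is a rough, self-contained sketch. It only restates what the updated README shows; the model-loading arguments (`device_map`, `torch_dtype`), the placeholder checkpoint path, and the `assets/demo.png` example image are not visible in this diff and are assumptions.

```python
# Sketch of the multimodal-understanding example added in this commit.
# Loading arguments and paths marked below are assumptions, not part of the diff.
import sys
import torch
from PIL import Image
from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoImageProcessor,
    AutoTokenizer,
    GenerationConfig,
)

# Assumption: local snapshot directory that contains processing_emu3.py
sys.path.append("<path to the downloaded BAAI/Emu3-Stage1 snapshot>")
from processing_emu3 import Emu3Processor

EMU_HUB = "BAAI/Emu3-Stage1"
VQ_HUB = "BAAI/Emu3-VisionTokenizer"

# Load the language model; device_map and dtype are typical choices, not shown in this diff.
model = AutoModelForCausalLM.from_pretrained(
    EMU_HUB,
    device_map="cuda:0",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
).eval()
tokenizer = AutoTokenizer.from_pretrained(EMU_HUB, trust_remote_code=True, padding_side="left")
image_processor = AutoImageProcessor.from_pretrained(VQ_HUB, trust_remote_code=True)
image_tokenizer = AutoModel.from_pretrained(VQ_HUB, device_map="cuda:0", trust_remote_code=True).eval()
processor = Emu3Processor(image_processor, image_tokenizer, tokenizer, chat_template="{image_prompt}{text_prompt}")

# Multimodal understanding: caption a local image.
text = "The image depicts "
image = Image.open("assets/demo.png")  # assumption: any local RGB image works here
inputs = processor(text=text, image=image, mode='U', padding="longest", return_tensors="pt")

GENERATION_CONFIG = GenerationConfig(
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
outputs = model.generate(
    inputs.input_ids.to("cuda:0"),
    GENERATION_CONFIG,
    max_new_tokens=1024,
    attention_mask=inputs.attention_mask.to("cuda:0"),
)
# Keep only the newly generated tokens and decode them.
outputs = outputs[:, inputs.input_ids.shape[-1]:]
for ans in processor.batch_decode(outputs, skip_special_tokens=True):
    print(ans)
```

The `padding_side="left"` setting and the explicit attention mask matter for this path: prompts of different lengths are left-padded so that generation starts right after the real tokens in every row of the batch.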