Test-Caption-Captain

Sleeping

App Files Files Community

Severian committed on Sep 25, 2024

Commit

9bc81e0

verified ·

1 Parent(s): 5bbe627

Update app.py

Browse files

Files changed (1) hide show

app.py +105 -80

app.py CHANGED Viewed

@@ -261,90 +261,115 @@ image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", m
 image_adapter.eval()
 image_adapter.to("cuda")
 @spaces.GPU()
 @torch.no_grad()
 def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str | int, art_style: str) -> str:
-	"""Pre-commit body: build a caption prompt, embed the image, and sample a caption.
-
-	Args:
-		input_image: Image to caption; it is resized to 384x384 before embedding.
-		caption_type: First component of the CAPTION_TYPE_MAP lookup key.
-		caption_tone: Second component of the lookup key; forced to "formal"
-			when caption_type is "rng-tags" or "training_prompt".
-		caption_length: "any" for no length constraint; otherwise an int, or a
-			string that is converted to int when it parses as one.
-		art_style: Passed to the prompt template as ``style`` and used to look
-			up STYLE_CHARACTERISTICS / STYLE_FOCUS (with generic fallbacks).
-
-	Returns:
-		The decoded caption with surrounding whitespace stripped.
-
-	Raises:
-		ValueError: If the computed prompt key is not in CAPTION_TYPE_MAP.
-	"""
-	torch.cuda.empty_cache()
-	# 'any' means no length specified
-	length = None if caption_length == "any" else caption_length
-	if isinstance(length, str):
-		try:
-			length = int(length)
-		except ValueError:
-			# A non-numeric string is kept as-is.
-			pass
-	# 'rng-tags' and 'training_prompt' don't have formal/informal tones
-	if caption_type == "rng-tags" or caption_type == "training_prompt":
-		caption_tone = "formal"
-	# Build prompt
-	# Key layout: (type, tone, length is a str, length is an int); both flags are False when length is None.
-	prompt_key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
-	if prompt_key not in CAPTION_TYPE_MAP:
-		raise ValueError(f"Invalid caption type: {prompt_key}")
-	prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(
-		length=length,
-		word_count=length,
-		style=art_style,
-		style_characteristics=STYLE_CHARACTERISTICS.get(art_style, "its unique elements"),
-		style_focus=STYLE_FOCUS.get(art_style, "its distinctive features")
-	)
-	print(f"Prompt: {prompt_str}")
-	# Preprocess image
-	#image = clip_processor(images=input_image, return_tensors='pt').pixel_values
-	# Scale to [0, 1], then normalize with mean 0.5 / std 0.5.
-	image = input_image.resize((384, 384), Image.LANCZOS)
-	pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
-	pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
-	pixel_values = pixel_values.to('cuda')
-	# Tokenize the prompt
-	prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
-	# Embed image
-	# The adapter is fed the vision model's hidden_states output.
-	with torch.amp.autocast_mode.autocast('cuda', enabled=True):
-		vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
-		image_features = vision_outputs.hidden_states
-		embedded_images = image_adapter(image_features)
-		embedded_images = embedded_images.to('cuda')
-	# Embed prompt
-	prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
-	assert prompt_embeds.shape == (1, prompt.shape[1], text_model.config.hidden_size), f"Prompt shape is {prompt_embeds.shape}, expected {(1, prompt.shape[1], text_model.config.hidden_size)}"
-	embedded_bos = text_model.model.embed_tokens(torch.tensor([[tokenizer.bos_token_id]], device=text_model.device, dtype=torch.int64))
-	eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
-	# Construct prompts
-	# Embedding sequence order: BOS, image embeddings, prompt, end-of-turn embedding.
-	inputs_embeds = torch.cat([
-		embedded_bos.expand(embedded_images.shape[0], -1, -1),
-		embedded_images.to(dtype=embedded_bos.dtype),
-		prompt_embeds.expand(embedded_images.shape[0], -1, -1),
-		eot_embed.expand(embedded_images.shape[0], -1, -1),
-	], dim=1)
-	# Token-id counterpart of inputs_embeds: zeros act as placeholders at the image-embedding positions.
-	input_ids = torch.cat([
-		torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long),
-		torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
-		prompt,
-		torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
-	], dim=1).to('cuda')
-	attention_mask = torch.ones_like(input_ids)
-	#generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=False, suppress_tokens=None)
-	#generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=True, top_k=10, temperature=0.5, suppress_tokens=None)
-	generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=True, suppress_tokens=None)   # Uses the default which is temp=0.6, top_p=0.9
-	# Trim off the prompt
-	# Assumes generate() returns the input_ids positions followed by the new tokens — TODO confirm.
-	generate_ids = generate_ids[:, input_ids.shape[1]:]
-	# Drop a trailing EOS or <|eot_id|> token so it is not decoded into the caption.
-	if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
-		generate_ids = generate_ids[:, :-1]
-	caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]
-	return caption.strip()
 css = """
 h1, h2, h3, h4, h5, h6, p, li, ul, ol, a, .centered-image {

 image_adapter.eval()
 image_adapter.to("cuda")
+# After loading the tokenizer and model
+# Startup diagnostics: print the tokenizer class, its BOS/EOS tokens and ids,
+# and the device the text model is on.
+print(f"Tokenizer class: {type(tokenizer)}")
+print(f"BOS token: {tokenizer.bos_token}")
+print(f"BOS token ID: {tokenizer.bos_token_id}")
+print(f"EOS token: {tokenizer.eos_token}")
+print(f"EOS token ID: {tokenizer.eos_token_id}")
+print(f"Text model device: {text_model.device}")
+# Ensure the tokenizer has the necessary special tokens
+# Only the missing role(s) are registered, each as '<|endoftext|>'.
+if tokenizer.bos_token_id is None or tokenizer.eos_token_id is None:
+    print("Warning: BOS or EOS token is missing. Adding default tokens.")
+    special_tokens_dict = {}
+    if tokenizer.bos_token_id is None:
+        special_tokens_dict['bos_token'] = '<|endoftext|>'
+    if tokenizer.eos_token_id is None:
+        special_tokens_dict['eos_token'] = '<|endoftext|>'
+    num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)
+    print(f"Added {num_added_tokens} special tokens to the tokenizer.")
+    # Resize token embeddings of the model if new tokens are added
+    # NOTE(review): this runs whenever the branch is taken, even if num_added_tokens is 0.
+    text_model.resize_token_embeddings(len(tokenizer))
 @spaces.GPU()
 @torch.no_grad()
 def stream_chat(input_image: Image.Image, caption_type: str, caption_tone: str, caption_length: str | int, art_style: str) -> str:
+    """Build a caption prompt, embed the image, and sample a caption from the text model.
+
+    Args:
+        input_image: Image to caption; it is resized to 384x384 before embedding.
+        caption_type: First component of the CAPTION_TYPE_MAP lookup key.
+        caption_tone: Second component of the lookup key; forced to "formal"
+            when caption_type is "rng-tags" or "training_prompt".
+        caption_length: "any" for no length constraint; otherwise an int, or a
+            string that is converted to int when it parses as one.
+        art_style: Passed to the prompt template as ``style`` and used to look
+            up STYLE_CHARACTERISTICS / STYLE_FOCUS (with generic fallbacks).
+
+    Returns:
+        The decoded caption with surrounding whitespace stripped.
+
+    Raises:
+        ValueError: If the computed prompt key is not in CAPTION_TYPE_MAP.
+    """
+    torch.cuda.empty_cache()
+    # 'any' means no length specified
+    length = None if caption_length == "any" else caption_length
+    if isinstance(length, str):
+        try:
+            length = int(length)
+        except ValueError:
+            # A non-numeric string is kept as-is.
+            pass
+    # 'rng-tags' and 'training_prompt' don't have formal/informal tones
+    if caption_type == "rng-tags" or caption_type == "training_prompt":
+        caption_tone = "formal"
+    # Build prompt
+    # Key layout: (type, tone, length is a str, length is an int); both flags are False when length is None.
+    prompt_key = (caption_type, caption_tone, isinstance(length, str), isinstance(length, int))
+    if prompt_key not in CAPTION_TYPE_MAP:
+        raise ValueError(f"Invalid caption type: {prompt_key}")
+    prompt_str = CAPTION_TYPE_MAP[prompt_key][0].format(
+        length=length,
+        word_count=length,
+        style=art_style,
+        style_characteristics=STYLE_CHARACTERISTICS.get(art_style, "its unique elements"),
+        style_focus=STYLE_FOCUS.get(art_style, "its distinctive features")
+    )
+    print(f"Prompt: {prompt_str}")
+    # Preprocess image
+    # Scale to [0, 1], then normalize with mean 0.5 / std 0.5.
+    image = input_image.resize((384, 384), Image.LANCZOS)
+    pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
+    pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
+    pixel_values = pixel_values.to('cuda')
+    # Tokenize the prompt
+    prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
+    # Embed image
+    # The adapter is fed the vision model's hidden_states output.
+    with torch.amp.autocast_mode.autocast('cuda', enabled=True):
+        vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
+        image_features = vision_outputs.hidden_states
+        embedded_images = image_adapter(image_features)
+        embedded_images = embedded_images.to('cuda')
+    # Embed prompt
+    prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
+    assert prompt_embeds.shape == (1, prompt.shape[1], text_model.config.hidden_size), f"Prompt shape is {prompt_embeds.shape}, expected {(1, prompt.shape[1], text_model.config.hidden_size)}"
+    # Check for bos_token_id and provide a fallback
+    bos_token_id = tokenizer.bos_token_id
+    if bos_token_id is None:
+        print("Warning: bos_token_id is None. Using default value of 1.")
+        bos_token_id = 1  # Common default, but may need adjustment
+    embedded_bos = text_model.model.embed_tokens(torch.tensor([[bos_token_id]], device=text_model.device, dtype=torch.int64))
+    eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
+    # Construct prompts
+    # Embedding sequence order: BOS, image embeddings, prompt, end-of-turn embedding.
+    inputs_embeds = torch.cat([
+        embedded_bos.expand(embedded_images.shape[0], -1, -1),
+        embedded_images.to(dtype=embedded_bos.dtype),
+        prompt_embeds.expand(embedded_images.shape[0], -1, -1),
+        eot_embed.expand(embedded_images.shape[0], -1, -1),
+    ], dim=1)
+    # Token-id counterpart of inputs_embeds: zeros act as placeholders at the image-embedding positions.
+    input_ids = torch.cat([
+        torch.tensor([[bos_token_id]], dtype=torch.long),
+        torch.zeros((1, embedded_images.shape[1]), dtype=torch.long),
+        prompt,
+        torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
+    ], dim=1).to('cuda')
+    attention_mask = torch.ones_like(input_ids)
+    # Sampling is enabled (do_sample=True); no other sampling parameters are passed here.
+    generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=300, do_sample=True, suppress_tokens=None)
+    # Trim off the prompt
+    # Assumes generate() returns the input_ids positions followed by the new tokens — TODO confirm.
+    generate_ids = generate_ids[:, input_ids.shape[1]:]
+    # Drop a trailing EOS or <|eot_id|> token so it is not decoded into the caption.
+    if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
+        generate_ids = generate_ids[:, :-1]
+    caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]
+    return caption.strip()
 css = """
 h1, h2, h3, h4, h5, h6, p, li, ul, ol, a, .centered-image {