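# Gradio Space: multimodal chat demo.
# Pipeline: CLIP ViT-B/32 patch embeddings -> MLP projector -> LoRA-adapted
# Llama-3.2-1B-Instruct for text generation, plus Whisper-small for speech-to-text.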
import base64
import os
import random

import gradio as gr
import torch
import torch.nn as nn
from PIL import Image
from peft import PeftModel, PeftConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    CLIPModel,
    CLIPProcessor,
    GenerationConfig,
    pipeline,
)

# Retrieve the Hugging Face token from the Space's secrets (needed for the gated Llama checkpoint)
hf_token = os.environ.get("HUGGINGFACE_HUB_TOKEN")

model_name = "meta-llama/Llama-3.2-1B-Instruct"
print("loading_model")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    trust_remote_code=True,
    token=hf_token,
)
print("loaded_model")

tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
tokenizer.pad_token = tokenizer.eos_token

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
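# Llama-3.2-1B-Instruct has a hidden size of 2048; that is the target dimension
# the image-patch embeddings are projected into further below.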
select_feature = 'patch'

def feature_select(image_forward_outs):
    image_features = image_forward_outs.hidden_states[-1]
    if select_feature == 'patch':
        image_features = image_features[:, 1:]  # Skip CLS token if selecting patch
    elif select_feature == 'cls_patch':
        image_features = image_features  # Keep CLS + patch tokens
    else:
        raise ValueError(f'Unexpected select feature: {select_feature}')
    return image_features
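# For CLIP ViT-B/32 at 224x224 input this yields 49 patch tokens of dimension 768
# (the CLS token at position 0 is dropped when select_feature == 'patch').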
class MLPProjection(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim=768, depth=2):
        super(MLPProjection, self).__init__()
        modules = []
        modules.append(nn.Linear(input_dim, hidden_dim, bias=False))
        for _ in range(1, depth):
            modules.append(nn.GELU())
            modules.append(nn.Linear(hidden_dim, output_dim, bias=False))
        self.mlp = nn.Sequential(*modules)

    def forward(self, x):
        return self.mlp(x)
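# PHI2WithMLP wraps the language model and, LLaVA-style, prepends the projected
# image-patch embeddings to the text token embeddings before the forward pass.
# Despite the name, the model wrapped here is Llama-3.2-1B-Instruct, not Phi-2.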
class PHI2WithMLP(nn.Module):
    def __init__(self, phi2_model, mlp_projection):
        super(PHI2WithMLP, self).__init__()
        self.phi2_model = phi2_model
        self.mlp_projection = mlp_projection
        self.config = phi2_model.config

    def forward(self, image_embeddings=None,
                inputs_embeds=None,
                input_ids=None,
                attention_mask=None,
                labels=None,
                output_attentions=False,
                output_hidden_states=False,
                **kwargs):  # Catch any additional arguments
        if input_ids is not None:
            token_embeddings = self.phi2_model.get_input_embeddings()(input_ids)
        elif inputs_embeds is not None:
            token_embeddings = inputs_embeds
        else:
            raise ValueError("You must provide either input_ids or inputs_embeds.")

        if image_embeddings is not None:
            # Apply the MLP to map image embeddings into the text embedding space
            projected_image_embeddings = self.mlp_projection(image_embeddings).to(device=token_embeddings.device)
            # Sequence length contributed by the image embeddings
            image_embedding_length = projected_image_embeddings.size(1)
            batch_size, text_sequence_length = attention_mask.shape
            # Extend the attention mask with ones for the image embedding positions
            new_attention_mask = torch.cat(
                [torch.ones((batch_size, image_embedding_length), device=attention_mask.device), attention_mask], dim=1
            )
            # Concatenate image and token embeddings along the sequence dimension
            combined_embeddings = torch.cat([projected_image_embeddings, token_embeddings], dim=1)
        else:
            # No image embeddings: use only token embeddings and the original attention mask
            combined_embeddings = token_embeddings
            new_attention_mask = attention_mask

        if labels is not None and image_embeddings is not None:
            # Labels must match the combined sequence length; pad the image positions
            # with -100 so they are ignored by the loss
            label_padding = torch.full(
                (batch_size, image_embedding_length), -100, device=labels.device
            )
            new_labels = torch.cat([label_padding, labels], dim=1)
        else:
            new_labels = labels

        # Pass the combined embeddings through the wrapped model with the (updated or original) attention mask
        outputs = self.phi2_model(
            inputs_embeds=combined_embeddings,
            attention_mask=new_attention_mask,
            labels=new_labels,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            **kwargs,
        )
        return outputs
    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, image_embeddings=None, **kwargs):
        # Build the inputs_embeds / attention_mask pair consumed by generate()
        if image_embeddings is not None:
            projected_image_embeddings = self.mlp_projection(image_embeddings)
            projected_image_embeddings = projected_image_embeddings.unsqueeze(0)  # Add a batch dimension
            token_embeddings = self.phi2_model.get_input_embeddings()(input_ids)
            combined_embeddings = torch.cat([projected_image_embeddings, token_embeddings], dim=1)
            image_embedding_length = projected_image_embeddings.size(1)
            batch_size, text_sequence_length = attention_mask.shape
            # Extend the attention mask with ones for the image embedding positions
            new_attention_mask = torch.cat(
                [torch.ones((batch_size, image_embedding_length), device=attention_mask.device), attention_mask], dim=1
            )
        else:
            combined_embeddings = self.phi2_model.get_input_embeddings()(input_ids)
            new_attention_mask = attention_mask
        return {
            "inputs_embeds": combined_embeddings,
            "attention_mask": new_attention_mask,
            **kwargs,
        }

    def generate(self, input_ids, attention_mask=None, image_embeddings=None, **kwargs):
        self.eval()  # Set to evaluation mode
        # Prepare inputs for generation
        inputs = self.prepare_inputs_for_generation(input_ids, attention_mask, image_embeddings, **kwargs)
        # Use the wrapped model's built-in generate method
        return self.phi2_model.generate(**inputs)
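# Note: prepare_inputs_for_generation unsqueezes the image embeddings, so image
# inputs are expected unbatched (shape [num_patches, clip_dim]) and generation
# with an image effectively runs with batch size 1.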
def create_phi2_model_with_lora(mlp_projection, lan_model):
    # Ensure the projector weights are trainable before wrapping the language model
    for param in mlp_projection.parameters():
        param.requires_grad = True
    # Return the language model wrapped with the MLP projection
    return PHI2WithMLP(lan_model, mlp_projection)

model_embedding_dim = model.config.hidden_size  # Depends on the language model architecture

# Build the projector: CLIP image embeddings -> language-model embedding space
input_dim = 768                   # Dimension of the CLIP ViT-B/32 patch embeddings
output_dim = model_embedding_dim  # Target dimension of the text embeddings
hidden_dim = 1024                 # Hidden layer dimension of the MLP
mlp_projection = MLPProjection(input_dim, output_dim, hidden_dim, depth=2).to(device)

combined_model = create_phi2_model_with_lora(mlp_projection, model)

# Load the LoRA adapter and the trained projector weights
peft_model_id = "Kartheekb7/results1"
loaded_model = PeftModel.from_pretrained(combined_model, peft_model_id)
loaded_mlp_weights = torch.load("mlp_projection_weights.pth", map_location=torch.device('cpu'))
loaded_model.base_model.model.mlp_projection.load_state_dict(loaded_mlp_weights)

# Create a new GenerationConfig with the desired settings
generation_config = GenerationConfig(max_new_tokens=128, temperature=0.01, top_p=1)
loaded_model.generation_config = generation_config

# Load the CLIP vision encoder/processor and the Whisper ASR pipeline
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    chunk_length_s=30,
    device=device,
)
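# Whisper-small transcribes recorded or uploaded audio; chunk_length_s=30 splits
# longer clips into 30-second chunks, and transcription is forced to English in
# audio_process() via generate_kwargs={"language": "english"}.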
def image_to_base64(image_path):
    with open(image_path, 'rb') as img:
        encoded_string = base64.b64encode(img.read())
    return encoded_string.decode('utf-8')

def audio_to_base64(audio_path):
    with open(audio_path, 'rb') as audio_file:
        encoded_string = base64.b64encode(audio_file.read())
    return encoded_string.decode('utf-8')

def get_clip_embedding(image_path):
    image = Image.open(image_path)
    inputs = processor(images=image, return_tensors="pt", padding=True)
    with torch.no_grad():
        image_features = clip_model.get_image_features(**inputs)
        image_forward_outs = clip_model.vision_model(**inputs, output_hidden_states=True)
        image_features = feature_select(image_forward_outs)
    image_embedding = image_features.squeeze(0)
    return image_embedding

def process_text(text_input):
    # Tokenize the text input
    input_encoding = tokenizer(
        text_input,
        return_tensors='pt',
        # padding='max_length',
        truncation=True,
        # max_length=256-49  # Set this to match your model's input size
    )
    return input_encoding

def audio_process(path_file):
    result = pipe(path_file, generate_kwargs={"language": "english"})
    return result['text']
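# The base64 helpers turn uploaded files into data: URLs so the audio (and image)
# can be rendered inline in the chat history instead of being referenced by path.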
def chat(message, history, audio=None, image=None):
    image_embedding = None
    response = "" if message is None else message
    input_message = "" if message is None else message
    if audio is not None:
        print("audio")
        # Embed the recorded audio as an inline data URL so it shows up in the chat history
        base64_audio = audio_to_base64(audio)
        data_url = f"data:audio/wav;base64,{base64_audio}"
        input_message += f"<audio controls><source src='{data_url}' type='audio/wav'>Your browser does not support the audio element.</audio>"
        # Transcribe the audio and append the text to the prompt
        response += audio_process(audio)
        print("audio_processed")
    if image is not None:
        # Embed the image as an inline data URL, mirroring the audio branch
        base64_image = image_to_base64(image)
        data_url = f"data:image/jpeg;base64,{base64_image}"
        input_message += f"<img src='{data_url}' alt='uploaded image'/>"
        image_embedding = get_clip_embedding(image)
        print("image_processed")
    input_encoding = process_text(response)
    print("inference start")
    outputs = loaded_model.generate(**input_encoding, image_embeddings=image_embedding, max_new_tokens=64, temperature=0.01, top_p=1)
    # Decode the generated output to text
    print("inference end")
    response_new = tokenizer.decode(outputs[0], skip_special_tokens=True)
    history.append((input_message, response_new))
    return history
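# End-to-end flow: spoken input is transcribed and appended to the typed message,
# an uploaded image is turned into CLIP patch embeddings, and both are passed to
# the projector + LoRA-adapted Llama model to produce the reply shown in the chat.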
with gr.Blocks() as iface:
    chatbot = gr.Chatbot()
    state = gr.State([])

    with gr.Row():
        with gr.Column(scale=20):
            msg = gr.Textbox(show_label=False, placeholder="Type a message...", container=False)
        with gr.Column(min_width=70, scale=1):
            submit = gr.Button("➤", variant="primary")
        with gr.Column(min_width=50, scale=1):
            audio_btn = gr.Button("🎤")
        with gr.Column(min_width=50, scale=1):
            file_btn = gr.Button("📎")

    # Hidden inputs revealed by the 🎤 and 📎 buttons
    audio = gr.Audio(sources=["microphone", "upload"], type="filepath", visible=False)
    image = gr.Image(type="filepath", visible=False)

    def process_input(message, history, audio_file, image_file):
        history = chat(message, history, audio_file, image_file)
        # Clear the textbox and update both the visible chatbot and the stored state
        return "", history, history

    submit.click(process_input, inputs=[msg, state, audio, image], outputs=[msg, chatbot, state])
    msg.submit(process_input, inputs=[msg, state, audio, image], outputs=[msg, chatbot, state])

    def toggle_audio(audio_value):
        # Receives the component's current value: show the input while it is empty,
        # hide it once a recording or upload is present
        return gr.update(visible=not audio_value)

    def toggle_image(image_value):
        return gr.update(visible=not image_value)

    audio_btn.click(toggle_audio, inputs=[audio], outputs=[audio])
    file_btn.click(toggle_image, inputs=[image], outputs=[image])

iface.launch()
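# In a Hugging Face Space this file is typically saved as app.py; the Space needs
# HUGGINGFACE_HUB_TOKEN set as a secret and mlp_projection_weights.pth present in
# the repository for the projector weights to load.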