"""Batch-caption images with an Azure OpenAI GPT-4o deployment.

For every image file in ``images_path``, the image is encoded as a base64
``data:`` URL, sent to the deployed chat model together with a FLUX-style
captioning prompt, and the returned caption is written to a sibling
``<image-stem>.txt`` file in the same directory.
"""

import base64
import os
from mimetypes import guess_type

from openai import AzureOpenAI


def local_image_to_data_url(image_path):
    """Encode a local image file into a ``data:`` URL.

    Parameters
    ----------
    image_path : str
        Path to the image file on disk.

    Returns
    -------
    str
        A ``data:<mime>;base64,<payload>`` string accepted by the
        chat-completions image_url content part.
    """
    # Guess the MIME type of the image based on the file extension.
    mime_type, _ = guess_type(image_path)

    # If the MIME type is not found, set it explicitly for known cases.
    if mime_type is None or mime_type == 'application/octet-stream':
        if image_path.lower().endswith('.webp'):
            mime_type = 'image/webp'  # some platforms lack a .webp mapping
        else:
            mime_type = 'application/octet-stream'  # default when unknown

    # Read and base64-encode the raw image bytes.
    with open(image_path, "rb") as image_file:
        base64_encoded_data = base64.b64encode(image_file.read()).decode('utf-8')

    # Construct the data URL.
    return f"data:{mime_type};base64,{base64_encoded_data}"


# Directory holding the images to caption (captions are written here too).
images_path = "/eph/nvme0/azureml/cr/j/8569d5e3aa08485780b67a53d671e109/exe/wd/1_2M_Dataset"

# Only caption actual image files.  The original unfiltered os.listdir()
# would also pick up the .txt caption files this script writes into the
# same directory, so any re-run would try to caption text files.
_IMAGE_EXTS = ('.jpg', '.jpeg', '.png', '.webp', '.bmp', '.gif', '.tif', '.tiff')
imgs_list = [f for f in os.listdir(images_path) if f.lower().endswith(_IMAGE_EXTS)]

# Azure OpenAI credentials.
# SECURITY: never hard-code API keys in source control.  The key that used
# to live here as a literal must be rotated; supply the current key via the
# AZURE_OPENAI_API_KEY environment variable (the literal fallback below is
# kept only for backward compatibility until the rotation is done).
api_base = "https://allam-swn-gpt-01.openai.azure.com/"  # your endpoint should look like https://YOUR_RESOURCE_NAME.openai.azure.com/
api_key = os.environ.get("AZURE_OPENAI_API_KEY", "8af2cca79fb34601ab829b44b7fa6dcf")
deployment_name = "gpt-4o-900ptu"
api_version = "2024-02-15-preview"  # this might change in the future

# Define a client pointed at the specific deployment.
client = AzureOpenAI(
    api_key=api_key,
    api_version=api_version,
    base_url=f"{api_base}openai/deployments/{deployment_name}",
)

# Captioning instructions sent with every image ("punch" typo fixed to "bunch").
_CAPTION_PROMPT = """You are my captioning model, I will give you a bunch of images with their main subject, and I want you to write a detailed caption based on what you see in the images alone. Take these consideration when writing the caption: Order the terms in the caption and use commas. The order of the words in the caption directly corresponds to their weight when generating the final image, so a main subject should always be at the start of the prompt. If we want to add more details, do it in a "narrative style" and using commas to help separate the terms for the FLUX model to read. The tag of this image is: 1/2M cup"""


def main():
    """Caption every image in ``imgs_list`` and write each result to disk."""
    for img_name in imgs_list:
        # Full path of the image being captioned.
        img_path = os.path.join(images_path, img_name)

        # Derive the caption filename from the image stem.  splitext() is
        # used instead of split('.') so names containing extra dots (e.g.
        # "cup.v2.jpg") keep their full stem.
        txt_path = os.path.join(images_path, os.path.splitext(img_name)[0] + ".txt")

        # Convert the local image into a data URL accepted by the model.
        data_url = local_image_to_data_url(img_path)

        response = client.chat.completions.create(
            model=deployment_name,
            messages=[
                {"role": "system", "content": "You are an image captioning assistant."},
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": _CAPTION_PROMPT},
                        {"type": "image_url", "image_url": {"url": data_url}},
                    ],
                },
            ],
            max_tokens=2000,
        )

        # Persist the generated caption next to its image.
        with open(txt_path, "w") as f:
            f.write(response.choices[0].message.content)


if __name__ == "__main__":
    main()