File size: 6,311 Bytes
6a24aec
 
 
 
 
 
48860c6
 
6a24aec
 
48860c6
6a24aec
 
 
 
 
 
 
 
 
 
 
 
 
48860c6
 
 
 
 
 
 
 
6a24aec
061e8b0
 
 
48860c6
 
 
 
061e8b0
 
705e089
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48860c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6a24aec
48860c6
061e8b0
 
 
 
6a24aec
 
 
 
 
 
061e8b0
6a24aec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import base64
import json
import mimetypes
import os

import anthropic
import gradio as gr
from openai import OpenAI


# Shared Anthropic client used by the functions in this module. The API key
# is read from the ANTHROPIC_API_KEY environment variable (os.getenv returns
# None when the variable is unset), so export it before starting the app.
client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

# Leftover sample-image snippet, kept for reference only (never executed):
# image1_url = "https://i.abcnewsfe.com/a/7d849ccc-e0fe-4416-959d-85889e338add/dune-1-ht-bb-231212_1702405287482_hpMain_16x9.jpeg"
# Media type referenced by generate_caption_claude3 when it builds the image
# payload for the Anthropic API.
image1_media_type = "image/jpeg"
# image1_data = base64.b64encode(httpx.get(image1_url).content).decode("utf-8")
#

# System prompt for the music use case: asks the model to analyze an image and
# return a dictionary with `description` and `prompt` keys, where `prompt` is a
# short text prompt for the musicgen model. The caption functions select it for
# every model_file other than "facebook/audiogen-medium".
# NOTE(review): this text is sent to the model verbatim, so its wording
# (including typos such as "essense" and "genera") is deliberately untouched.
SYSTEM_PROMPT = """You are an expert llm prompt engineer, you understand the structure of llms and facebook musicgen text to audio model. You will be provided with an image, and require to output a prompt for the musicgen model to capture the essense of the image. Try to do it step by step, evaluate and analyze the image thoroughly. After that, develop a prompt that contains music genera, style, instrument, and all the other details needed. This prompt will be provided to musicgen model to generate a 15s audio clip.

Here are some descriptions from musicgen model:
The model was trained with descriptions from a stock music catalog, descriptions that will work best should include some level of detail on the instruments present, along with some intended use case (e.g. adding “perfect for a commercial” can somehow help).

Try to make the prompt simple and concise with only 1-2 sentences

only return dictionary, with two items `description` and `prompt`

for example
{
  "description": "A serene beach at sunset with gentle waves and a distant ship.",
  "prompt": "A calming instrumental with gentle guitar, soft piano, and ocean waves sound effects, perfect for a relaxing moment by the sea."
}
"""

# System prompt for the background-sound use case: asks the model to analyze an
# image and return a dictionary with `description` and `prompt` keys, where
# `prompt` describes the ambient sounds the image should have. The caption
# functions select it only when model_file == "facebook/audiogen-medium", so
# the text names the audiogen model throughout (it previously told the LLM to
# write for musicgen, contradicting its own last sentence).
# NOTE(review): the remaining wording is sent to the model verbatim and is
# intentionally left as-is.
SYSTEM_PROMPT_AUDIO = """You are an expert llm prompt engineer, you understand the structure of llms and facebook audiogen text to audio model. You will be provided with an image, and require to output a prompt for the audiogen model to capture the essense of the image. Try to do it step by step, evaluate and analyze the image thoroughly. After that, develop a prompt that contains the detail of what background sounds this image should have. This prompt will be provided to audiogen model to generate a 15s audio clip.
Try to make the prompt simple and concise with only 1-2 sentences

only return dictionary, with two items `description` and `prompt`
for example
{"description": "A serene beach scene at sunset with gentle waves lapping on the shore and a distant ship sailing on the water.",
 "prompt": "Gentle waves flowing on the beach at sunset, with a distant ship in the background."}
"""

# System prompt used by improve_prompt: shows the model a few example prompts
# and asks it to rewrite the user's prompt in 1-2 sentences, replying as JSON
# with a single `prompt` item.
# NOTE(review): this text is sent to the model verbatim, so its wording
# (including typos such as "export", "enginner" and "patten") is deliberately
# untouched.
PROMPT_IMPROVEMENT_GENERATE_PROMPT = """
You are an export llm prompt enginner, you will be helping the user to improve their prompts. here are some examples of good prompts
- "90s rock song with electric guitar and heavy drums"
- "An 80s driving pop song with heavy drums and synth pads in the background"
- "An energetic hip-hop music piece, with synth sounds and strong bass. There is a rhythmic hi-hat patten in the drums."
- "A grand orchestral arrangement with thunderous percussion, epic brass fanfares, and soaring strings, creating a cinematic atmosphere fit for a heroic battle."
- "Classic reggae track with an electronic guitar solo"

You will be provided with a prompt and you need to improve it. Make sure the prompt is simple and concise with only 1-2 sentences. The output should be in JSON format, with one item `prompt`
"""


def improve_prompt(prompt):
    """Ask Claude to rewrite *prompt* into a better generation prompt.

    The text is sent as a single user turn with
    PROMPT_IMPROVEMENT_GENERATE_PROMPT as the system prompt, and the first
    content block of the reply is parsed as JSON.

    Args:
        prompt: The user's original prompt text.

    Returns:
        A ``(message_object, prompt)`` tuple: the parsed JSON reply and the
        value stored under its ``"prompt"`` key.

    Raises:
        json.JSONDecodeError: If the reply text is not valid JSON.
        KeyError: If the parsed reply has no ``"prompt"`` key.
    """
    user_turn = {
        "role": "user",
        "content": [{"type": "text", "text": prompt}],
    }
    reply = client.messages.create(
        model="claude-3-opus-20240229",
        max_tokens=1024,
        system=PROMPT_IMPROVEMENT_GENERATE_PROMPT,
        messages=[user_turn],
    )
    parsed_reply = json.loads(reply.content[0].text)
    return parsed_reply, parsed_reply["prompt"]

def generate_caption_gpt4(image_file, model_file):
    """Describe an image with GPT-4o and derive an audio-generation prompt.

    The image is read from disk, base64-encoded and sent as a data URL in a
    single user message, together with the instruction text chosen for the
    target audio model.

    Args:
        image_file: Path of the image file to caption.
        model_file: Name of the target audio model. The value
            "facebook/audiogen-medium" selects SYSTEM_PROMPT_AUDIO; any other
            value selects SYSTEM_PROMPT.

    Returns:
        A ``(description, prompt)`` tuple taken from the JSON reply.

    Raises:
        json.JSONDecodeError: If the reply text is not valid JSON.
        KeyError: If the reply lacks ``"description"`` or ``"prompt"``.
    """
    # Use a distinct local name so the module-level Anthropic ``client`` is
    # not shadowed inside this function.
    openai_client = OpenAI()
    if model_file == "facebook/audiogen-medium":
        system_prompt = SYSTEM_PROMPT_AUDIO
    else:
        system_prompt = SYSTEM_PROMPT

    # Label the data URL with the file's real image type instead of always
    # claiming JPEG; fall back to JPEG when the extension is unknown or is
    # not an image type.
    media_type, _ = mimetypes.guess_type(image_file)
    if media_type is None or not media_type.startswith("image/"):
        media_type = "image/jpeg"

    with open(image_file, "rb") as f:
        image_encoded = base64.b64encode(f.read()).decode("utf-8")

    response = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    # The instructions travel as user text next to the image.
                    {"type": "text", "text": system_prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{media_type};base64,{image_encoded}",
                        },
                    },
                ],
            }
        ],
        max_tokens=300,
    )
    message = json.loads(response.choices[0].message.content)
    return message["description"], message["prompt"]
    


def generate_caption_claude3(image_file, model_file, progress=gr.Progress()):
    """Describe an image with Claude 3 Opus and derive an audio prompt.

    The image is read from disk, base64-encoded and sent to the Anthropic
    Messages API together with the system prompt chosen for the target audio
    model. The first content block of the reply is parsed as JSON.

    Args:
        image_file: Path of the image file to caption.
        model_file: Name of the target audio model. The value
            "facebook/audiogen-medium" selects SYSTEM_PROMPT_AUDIO; any other
            value selects SYSTEM_PROMPT.
        progress: Gradio progress tracker, called before and after the API
            request.

    Returns:
        A ``(message_object, description, prompt)`` tuple: the parsed JSON
        reply plus its ``"description"`` and ``"prompt"`` values.

    Raises:
        json.JSONDecodeError: If the reply text is not valid JSON.
        KeyError: If the reply lacks ``"description"`` or ``"prompt"``.
    """
    if model_file == "facebook/audiogen-medium":
        system_prompt = SYSTEM_PROMPT_AUDIO
    else:
        system_prompt = SYSTEM_PROMPT

    # Declare the file's real image type rather than always claiming JPEG:
    # a mismatching media_type can make the request fail. Only the image
    # types the Anthropic vision API documents are passed through; anything
    # else keeps the module default.
    supported_media_types = ("image/jpeg", "image/png", "image/gif", "image/webp")
    guessed_type, _ = mimetypes.guess_type(image_file)
    if guessed_type in supported_media_types:
        media_type = guessed_type
    else:
        media_type = image1_media_type

    with open(image_file, "rb") as f:
        image_encoded = base64.b64encode(f.read()).decode("utf-8")

    progress(0, desc="Starting image captioning...")
    message = client.messages.create(
        model="claude-3-opus-20240229",
        max_tokens=1024,
        system=system_prompt,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": image_encoded,
                        },
                    },
                    {"type": "text", "text": "develop the prompt based on this image"},
                ],
            }
        ],
    )
    # Gradio expects a float completion fraction between 0 and 1, so "done"
    # is 1.0 rather than 100.
    progress(1.0, desc="image captioning...Done!")

    # Parse the reply text into a Python object and pull out both fields.
    message_object = json.loads(message.content[0].text)
    description = message_object["description"]
    prompt = message_object["prompt"]
    print(description)
    print(prompt)
    return message_object, description, prompt