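# constraint.py — last change "Update constraint.py" (commit 9ea9a8c, verified) by lalalalalalalalalala; 6.39 kB.
# The file-viewer link text captured with this file ("raw", "history blame") is not part of the module.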
# System prompt text. It is empty here, so every captioning instruction
# visible in this module is carried by USER_PROMPT.
SYS_PROMPT = ""

# User prompt for the video-captioning task. It is organised into
# "# CONTEXT #", "# OBJECTIVE #", "# STYLE #" and "# Output Structure #"
# sections, followed by one fully worked example of the expected answer.
#
# The "{...}" placeholders in the Output Structure section are literal text
# meant for the model to read.
# NOTE(review): if this string is ever passed through str.format(), those
# braces would have to be doubled — confirm against the callers.
USER_PROMPT = """# CONTEXT #
You are a powerful video captioner. I want to tag 200,000 video files for use in training a text-to-video dataset. The purpose of the video tags is to train a text-to-video model. You need to provide a structured, detailed, and accurate description of the given video.
# OBJECTIVE #
Video Description Task Instructions
Video Content Description:
Detail and Accuracy: Provide a detailed and accurate description of the video content. Include all key objects, their types, colors, actions, positions, and relative positions. Describe the overall atmosphere.
Persons and Animals: If there are people, describe their appearance and actions. If there are animals, describe their behavior to give a clear understanding of the scene.
Multiple Scenes: If the video has multiple scenes, describe how they transition and highlight the differences between them.
Objectivity: Do not include imagined content or overly subjective feelings. Ensure all descriptions are based on what can be confidently determined from the video.
Grammar and Length: Use correct English grammar. Each description should be at least three sentences long.
Video Quality Evaluation:
Aesthetic Value: Evaluate the aesthetic value, including composition, color harmony, and overall visual effect. Score this aspect from 1 to 5 and explain your reasoning.
Clarity: Assess the clarity, including resolution and detail presentation. Score this aspect from 1 to 5 and explain your reasoning.
Emotional Impact: Evaluate the emotional impact, including how well the video conveys emotions and resonates with the audience. Score this aspect from 1 to 5 and explain your reasoning.
Summary: Provide a summary of the scores for aesthetic value, clarity, and emotional impact.
Film Perspective Analysis:
Shot Analysis: Analyze the type of shots used (close-up, medium, long shot, etc.).
Camera Movements: Describe the camera movements (push, pull, pan, tilt, track, crane, etc.).
Composition: Analyze the composition of the shots.
Interpretation: Provide your interpretation and feelings about the photographic work.
# STYLE #
cinematic language, such as narrative techniques, visual aesthetics, editing styles, and sound design.
# Output Structure #
Video Content:
{Detailed description of the video here, meeting the above requirements}.
Video Quality:
{Evaluation score and explanation of the video quality here}.
Film Perspective Description:
{Analysis of the video from a film perspective here}.
Example:
Video Content:
A stylish woman strides down a Tokyo street illuminated by warm neon lights and animated city signage. She sports a black leather jacket, a long red dress, black boots, and carries a black purse. Her look is completed with sunglasses and red lipstick. Her demeanor is confident and casual. The damp street reflects the vibrant lights, creating a mirror effect. The scene is bustling with numerous pedestrians.
Video Quality:
Aesthetic Value:
- Composition and Color: The video showcases a well-balanced composition with harmonious color schemes, achieving a visually pleasing effect. Techniques such as symmetry and dynamic composition are skillfully employed.
- Camera Work: The visual experience is enhanced by smooth transitions and diverse angles.
- Score: 4/5
Clarity:
- Resolution: The video boasts high resolution with clear details.
- Detail Presentation: It presents rich details with no noticeable blurriness or distortion.
- Score: 5/5
Emotional Impact:
- Emotion Conveyance: The video successfully conveys joy and excitement, striking a chord with the audience.
- Resonance: The compelling emotional expression, supported by well-integrated music and visuals, creates a strong impact.
- Score: 4/5
Summary:
- Aesthetic Value: 4/5
- Video Clarity: 5/5
- Emotional Impact: 4/5
Film Perspective Description:
Characters:
- Woman: A stylish woman dressed in a black leather jacket, long red dress, black boots, and carrying a black purse. She wears sunglasses and red lipstick.
Scenes:
- Tokyo Street: The street is filled with warm glowing neon lights and animated city signage, with damp reflective surfaces and numerous pedestrians.
Shot 1:
- The woman walks confidently and casually down the Tokyo street.
- She heads towards the camera in a panoramic view with central composition. The camera is at eye level and follows her with a handheld shot.
- Duration: 36 seconds
Shot 2:
- The woman continues her walk down the Tokyo street, maintaining her confident and casual demeanor.
- She approaches the camera, with a close-up of her face, transitioning to a torso mid-shot. The camera remains at eye level, following her with a handheld shot.
- Duration: 24 seconds
"""
# Module-level scalar defaults.

# These two equal the 'temp' and 'top_p' defaults listed in GENERAL_CONFIG.
TEMP = 0.3
TOP = 0.75

# NOTE(review): GENERAL_CONFIG['max_tokens'] uses 512 as its minimum and
# 4096 as its default — confirm which limit is intended here.
MAX_TOKEN = 512

# Equals the 'frame_skip' default listed in GENERAL_CONFIG.
SKIP = 2
# Provider name -> a string naming that provider's API class.
# NOTE(review): presumably these strings are resolved to real classes
# elsewhere in the project — confirm against the code that reads this table.
API_CLASSES = dict(
    Azure='AzureAPI',
    Google='GoogleAPI',
    Anthropic='AnthropicAPI',
    OpenAI='OpenAIAPI',
)
# Per-provider settings, keyed by the same provider names as API_CLASSES.
# Every entry carries three fields:
#   model          - list of model names listed for the provider
#   key_label      - label text for the provider's API key
#   endpoint_label - label text for the provider's endpoint
# NOTE(review): the *_label values look like UI field captions — confirm
# against the code that renders them.
PROVIDERS_CONFIG = {
    'Azure': dict(
        model=['GPT-4o', 'GPT-4v'],
        key_label='Azure API Key',
        endpoint_label='Azure Endpoint',
    ),
    'Google': dict(
        model=['Gemini-1.5-Flash', 'Gemini-1.5-Pro'],
        key_label='Google API Key',
        endpoint_label='Google API Endpoint',
    ),
    'Anthropic': dict(
        model=['Claude-3-Opus', 'Claude-3-Sonnet'],
        key_label='Anthropic API Key',
        endpoint_label='Anthropic Endpoint',
    ),
    'OpenAI': dict(
        model=['GPT-4o', 'GPT-4v'],
        key_label='OpenAI API Key',
        endpoint_label='OpenAI Endpoint',
    ),
}
# General (provider-independent) settings, keyed by setting name.
# Numeric settings carry 'label', 'default', 'min', 'max' and 'step';
# 'frame_format' carries 'label', 'default' and 'choices' instead.
# NOTE(review): the field names suggest these feed slider/dropdown widgets —
# confirm against the UI code that consumes this table.
GENERAL_CONFIG = {
    'temp': {
        'label': 'Temperature',
        'default': 0.3,
        'min': 0,
        'max': 1,
        'step': 0.1
    },
    'top_p': {
        'label': 'Top-P',
        'default': 0.75,
        'min': 0,
        'max': 1,
        # 0.05 keeps the 0.75 default on the min + k * step grid; with a
        # 0.1 step the default could not be selected again once changed.
        'step': 0.05
    },
    'max_tokens': {
        'label': 'Max Tokens',
        'default': 4096,
        'min': 512,
        'max': 4096,
        'step': 1
    },
    'frame_format': {
        'label': 'Frame Format',
        'default': 'JPEG',
        'choices': ['JPEG', 'PNG']
    },
    'frame_skip': {
        'label': 'Frame Skip',
        'default': 2,
        'min': 2,
        'max': 100,
        'step': 1
    },
    'group_size': {
        'label': 'Group Size',
        'default': 10,
        'min': 1,
        'max': 100,
        'step': 1
    }
}