Spaces: Running on T4
beweinreich committed • Commit 1727a7e
Parent(s): first
Files changed:
- .gitignore +5 -0
- Dockerfile +16 -0
- Procfile +1 -0
- app.py +38 -0
- audio_analyzer.py +78 -0
- playground.py +110 -0
- requirements.txt +8 -0
- runtime.txt +1 -0
- video_analyzer.py +116 -0
.gitignore
ADDED
@@ -0,0 +1,5 @@
+.DS_Store
+.env
+raw/*
+tmp/*
+__pycache__/*
Dockerfile
ADDED
@@ -0,0 +1,16 @@
+# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+# you will also find guides on how best to write your Dockerfile
+
+FROM python:3.9
+
+RUN useradd -m -u 1000 user
+USER user
+ENV PATH="/home/user/.local/bin:$PATH"
+
+WORKDIR /app
+
+COPY --chown=user ./requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+COPY --chown=user . /app
+CMD ["python", "app.py"]
Procfile
ADDED
@@ -0,0 +1 @@
+web: python app.py
app.py
ADDED
@@ -0,0 +1,38 @@
+from flask import Flask, request, jsonify
+from audio_analyzer import AudioAnalyzer
+from video_analyzer import VideoAnalyzer
+
+app = Flask(__name__)
+
+@app.route('/', methods=['GET'])
+def hello_world():
+    return jsonify({"message": "Hello, World!"})
+
+@app.route('/v1/analyze_audio', methods=['POST'])
+def analyze_audio():
+    data = request.get_json()
+    audio_url = data.get('audio_url')
+
+    if not audio_url:
+        return jsonify({"error": "audio_url is required"}), 400
+
+    analyzer = AudioAnalyzer(media_url=audio_url, media_type="audio")
+    traits = analyzer.retrieve_traits()
+
+    return jsonify(traits)
+
+@app.route('/v1/analyze_video', methods=['POST'])
+def analyze_video():
+    data = request.get_json()
+    video_url = data.get('video_url')
+
+    if not video_url:
+        return jsonify({"error": "video_url is required"}), 400
+
+    analyzer = VideoAnalyzer(video_url=video_url)
+    traits = analyzer.retrieve_traits()
+
+    return jsonify(traits)
+
+if __name__ == '__main__':
+    app.run(host="0.0.0.0", port=7860)
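For reference, a minimal sketch of how a client could exercise these endpoints once the Space is up; the base URL and the audio_url value below are placeholders, not part of this commit:

import requests

BASE_URL = "http://localhost:7860"  # placeholder; substitute the deployed Space URL

# health check against the root route
print(requests.get(f"{BASE_URL}/").json())

# request trait analysis for a recording (hypothetical audio_url)
resp = requests.post(
    f"{BASE_URL}/v1/analyze_audio",
    json={"audio_url": "https://example.com/recording.mp3"},
)
print(resp.status_code, resp.json())

The /v1/analyze_video endpoint takes the same shape with a video_url key.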
audio_analyzer.py
ADDED
@@ -0,0 +1,78 @@
+import os
+import urllib.request
+import moviepy.editor as mp
+from openai import OpenAI
+from transformers import pipeline
+from dotenv import load_dotenv
+
+load_dotenv()
+
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+TWILIO_ACCOUNT_SID = os.getenv("TWILIO_ACCOUNT_SID")
+TWILIO_AUTH_TOKEN = os.getenv("TWILIO_AUTH_TOKEN")
+client = OpenAI(api_key=OPENAI_API_KEY)
+
+class AudioAnalyzer:
+    def __init__(self, media_path=None, media_url=None, media_type=None):
+        self.personality_labels = [
+            "Empathetic", "Resilient", "Optimistic", "Pessimistic", "Introverted", "Extroverted", "Curious", "Creative", "Analytical", "Dependable", "Impulsive", "Adaptable", "Meticulous", "Assertive", "Agreeable", "Courageous", "Cautious", "Patient", "Ambitious", "Generous"
+        ]
+        self.personality_pipeline = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
+        self.media_path = media_path
+        self.media_url = media_url
+        self.media_type = media_type
+
+        if media_url is not None:
+            self.download_content()
+
+        if media_type == "video":
+            self.extract_audio_from_video(self.media_path, "./tmp/audio.mp3")
+            self.media_path = "./tmp/audio.mp3"
+
+    def download_content(self):
+        download_url = self.media_url
+
+        # Set default paths if not provided
+        if self.media_type == "video" and not self.media_path:
+            self.media_path = "./tmp/video.mp4"
+        if self.media_type == "audio" and not self.media_path:
+            self.media_path = "./tmp/audio.mp3"
+        os.makedirs("./tmp", exist_ok=True)
+
+        password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
+        password_mgr.add_password(None, download_url, TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN)
+        handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
+        opener = urllib.request.build_opener(handler)
+        urllib.request.install_opener(opener)
+        urllib.request.urlretrieve(download_url, self.media_path)
+
+    def extract_audio_from_video(self, video_file, audio_file):
+        clip = mp.VideoFileClip(video_file)
+        clip.audio.write_audiofile(audio_file)
+
+    def transcribe_audio_to_text(self, audio_file):
+        with open(audio_file, "rb") as audio:
+            transcription = client.audio.transcriptions.create(
+                model="whisper-1",
+                file=audio
+            )
+        video_text = transcription.text.strip()
+        return video_text
+
+    def retrieve_traits(self):
+        # Ensure we have an audio path
+        if not self.media_path:
+            raise ValueError("Media path is not specified.")
+
+        print("Transcribing audio to text...")
+        transcript = self.transcribe_audio_to_text(self.media_path)
+        print("Transcription complete")
+
+        print("Running through personality pipeline...")
+        result = self.personality_pipeline(transcript, candidate_labels=self.personality_labels)
+
+        top_traits = sorted(zip(result['labels'], result['scores']), key=lambda x: x[1], reverse=True)[:5]
+        traits = {label: score for label, score in top_traits}
+
+        print(traits)
+        return traits
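The trait extraction in retrieve_traits is plain zero-shot classification: the transcript is scored against every candidate label and the five highest-scoring labels are kept. A standalone sketch of that step, using a made-up sample text and a trimmed label set:

from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

text = "I love meeting new people and I always look on the bright side."  # made-up sample
labels = ["Optimistic", "Pessimistic", "Introverted", "Extroverted", "Analytical"]

result = classifier(text, candidate_labels=labels)
# the pipeline already returns labels sorted by descending score, so the
# sorted(...) call in retrieve_traits amounts to taking the first five pairs
top_traits = dict(zip(result["labels"][:5], result["scores"][:5]))
print(top_traits)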
playground.py
ADDED
@@ -0,0 +1,110 @@
+import os
+import json
+import base64
+import requests
+from openai import OpenAI
+from dotenv import load_dotenv
+from moviepy.editor import VideoFileClip
+
+load_dotenv()
+
+audio_filename = "extracted_audio.wav"
+image_filename = "extracted_image.jpg"
+
+api_key = os.getenv("OPENAI_API_KEY")
+client = OpenAI(api_key=api_key)
+
+video = VideoFileClip("zach.mov")
+audio = video.audio
+audio.write_audiofile(audio_filename)
+
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+
+audio_file = open(audio_filename, "rb")
+
+# Extract an image halfway through the video
+halfway_time = video.duration / 2
+video.save_frame(image_filename, t=halfway_time)
+
+transcription = client.audio.transcriptions.create(
+    model="whisper-1",
+    file=audio_file
+)
+video_text = transcription.text.strip()
+
+
+# Analyze sentiment using the chat completions API
+prompt = f"""Analyze the sentiment of the following text:\n\n{video_text}
+
+You should respond in json format, as an object with key `response` and value as a string.
+"""
+completion = client.chat.completions.create(
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": prompt}
+    ],
+    model="gpt-3.5-turbo-1106",
+    response_format={"type": "json_object"},
+)
+response = completion.choices[0].message.content
+result = json.loads(response)
+parsed = result['response']
+print(parsed)
+
+
+# Analyze personality traits using the chat completions API
+prompt = f"""Analyze the personality traits of the speaker in the following text:\n\n{video_text}
+
+You should respond in json format, as an object with key `response` and value as an array of personality traits, like "funny", "happy", "sarcastic".
+"""
+completion = client.chat.completions.create(
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": prompt}
+    ],
+    model="gpt-3.5-turbo-1106",
+    response_format={"type": "json_object"},
+)
+response = completion.choices[0].message.content
+result = json.loads(response)
+parsed = result['response']
+print(parsed)
+
+
+# Getting the base64 string
+base64_image = encode_image(image_filename)
+
+headers = {
+    "Content-Type": "application/json",
+    "Authorization": f"Bearer {api_key}"
+}
+
+payload = {
+    "model": "gpt-4o",
+    "messages": [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "Describe the person in this image. Be detailed."
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/jpeg;base64,{base64_image}"
+                    }
+                }
+            ]
+        }
+    ],
+    "max_tokens": 300
+}
+
+response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
+
+json_data = response.json()
+parsed = json_data['choices'][0]['message']['content']
+print(parsed)
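The two chat-completion calls above differ only in their prompt text, so the boilerplate could be folded into one helper. A possible refactor sketch (ask_json is a hypothetical name; it assumes the client, json, and video_text bindings defined earlier in this file):

def ask_json(prompt):
    # send a JSON-mode chat completion and return the parsed `response` field
    completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        model="gpt-3.5-turbo-1106",
        response_format={"type": "json_object"},
    )
    return json.loads(completion.choices[0].message.content)["response"]

sentiment = ask_json(
    f"Analyze the sentiment of the following text:\n\n{video_text}\n\n"
    "You should respond in json format, as an object with key `response` and value as a string."
)
print(sentiment)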
requirements.txt
ADDED
@@ -0,0 +1,8 @@
+flask==3.0.3
+moviepy==1.0.3
+transformers==4.42.4
+openai==1.34.0
+python-dotenv==1.0.1
+torch==2.3.1
+requests==2.32.3
+tqdm==4.66.4
runtime.txt
ADDED
@@ -0,0 +1 @@
+python-3.12.4
video_analyzer.py
ADDED
@@ -0,0 +1,116 @@
+import os
+import json
+import base64
+import requests
+from tqdm import tqdm
+from dotenv import load_dotenv
+from moviepy.editor import VideoFileClip
+import re
+import urllib.request
+
+load_dotenv()
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+TWILIO_ACCOUNT_SID = os.getenv("TWILIO_ACCOUNT_SID")
+TWILIO_AUTH_TOKEN = os.getenv("TWILIO_AUTH_TOKEN")
+
+class VideoAnalyzer:
+    def __init__(self, video_path=None, video_url=None, num_images=3):
+        self.video_path = video_path
+        self.video_url = video_url
+        self.api_key = OPENAI_API_KEY
+        self.num_images = num_images
+        self.headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}"
+        }
+
+        if self.video_url and not self.video_path:
+            self.download_video()
+
+    def download_video(self):
+        self.video_path = "./tmp/video.mp4"
+        os.makedirs("./tmp", exist_ok=True)
+        password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
+        password_mgr.add_password(None, self.video_url, TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN)
+        handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
+        opener = urllib.request.build_opener(handler)
+        urllib.request.install_opener(opener)
+        urllib.request.urlretrieve(self.video_url, self.video_path)
+
+    def extract_images(self):
+        os.makedirs("./tmp", exist_ok=True)
+        video = VideoFileClip(self.video_path)
+        duration = video.duration
+        time_intervals = [i * (duration / (self.num_images + 1)) for i in range(1, self.num_images + 1)]
+        self.image_filenames = [f"./tmp/image_{i}.jpg" for i in range(self.num_images)]
+
+        for i, t in enumerate(time_intervals):
+            video.save_frame(self.image_filenames[i], t=t)
+
+    def encode_image(self, image_path):
+        with open(image_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode('utf-8')
+
+    def analyze_images(self):
+        traits_list = []
+
+        for image_filename in tqdm(self.image_filenames, desc="Processing images"):
+            base64_image = self.encode_image(image_filename)
+            payload = {
+                "model": "gpt-4o",
+                "messages": [
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": "Describe the person in this image. Be detailed as it pertains to their physical and emotional state. Please return the response in a list of traits or characteristics in a comma-separated list."
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:image/jpeg;base64,{base64_image}"
+                                }
+                            }
+                        ]
+                    }
+                ],
+                "max_tokens": 300
+            }
+
+            response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, json=payload)
+            response_data = response.json()
+            description = response_data['choices'][0]['message']['content']
+            traits = description.split(",")
+            traits_list.extend(traits)
+
+        return traits_list
+
+    def clean_trait(self, trait):
+        # Remove line breaks, leading/trailing whitespace, and unnecessary dashes
+        cleaned_trait = trait.replace('\n', '').strip()
+        cleaned_trait = re.sub(r'^-+', '', cleaned_trait).strip()
+        cleaned_trait = cleaned_trait.rstrip('.')
+
+        # If the cleaned trait has more than 4 words, it's likely a sentence, so drop it
+        if len(cleaned_trait.split()) > 4:
+            return None
+
+        return cleaned_trait
+
+    def retrieve_traits(self):
+        self.extract_images()
+        traits = self.analyze_images()
+
+        cleaned_traits = [self.clean_trait(trait) for trait in traits]
+        cleaned_traits = [trait for trait in cleaned_traits if trait]
+        common_traits = list(set(cleaned_traits))
+        return common_traits
+
+if __name__ == "__main__":
+    # Use either video_path or video_url
+    # video_path = "./raw/zach.mov"
+    video_url = "https://video.twilio.com/v1/Recordings/RT2c1baf50b6343802964c98e5a6f979e3/Media"
+    analyzer = VideoAnalyzer(video_url=video_url)
+    traits = analyzer.retrieve_traits()
+    print(traits)
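extract_images samples frames at evenly spaced interior timestamps, i * duration / (num_images + 1) for i in 1..num_images, so the very start and end of the clip are never sampled. A quick check of that arithmetic with hypothetical numbers:

duration = 60.0   # hypothetical 60-second clip
num_images = 3
time_intervals = [i * (duration / (num_images + 1)) for i in range(1, num_images + 1)]
print(time_intervals)  # [15.0, 30.0, 45.0]: frames at 25%, 50%, and 75% of the clip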