Spaces: Running on T4
beweinreich committed • Commit 1727a7e
Parent(s): first
Files changed:
- .gitignore +5 -0
- Dockerfile +16 -0
- Procfile +1 -0
- app.py +38 -0
- audio_analyzer.py +78 -0
- playground.py +110 -0
- requirements.txt +8 -0
- runtime.txt +1 -0
- video_analyzer.py +116 -0
.gitignore
ADDED
@@ -0,0 +1,5 @@
+.DS_Store
+.env
+raw/*
+tmp/*
+__pycache__/*
Dockerfile
ADDED
@@ -0,0 +1,16 @@
+# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+# you will also find guides on how best to write your Dockerfile
+
+FROM python:3.9
+
+RUN useradd -m -u 1000 user
+USER user
+ENV PATH="/home/user/.local/bin:$PATH"
+
+WORKDIR /app
+
+COPY --chown=user ./requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+COPY --chown=user . /app
+CMD ["python", "app.py"]
Procfile
ADDED
@@ -0,0 +1 @@
+web: python app.py
app.py
ADDED
@@ -0,0 +1,38 @@
+from flask import Flask, request, jsonify
+from audio_analyzer import AudioAnalyzer
+from video_analyzer import VideoAnalyzer
+
+app = Flask(__name__)
+
+@app.route('/', methods=['GET'])
+def hello_world():
+    return jsonify({"message": "Hello, World!"})
+
+@app.route('/v1/analyze_audio', methods=['POST'])
+def analyze_audio():
+    data = request.get_json()
+    audio_url = data.get('audio_url')
+
+    if not audio_url:
+        return jsonify({"error": "audio_url is required"}), 400
+
+    analyzer = AudioAnalyzer(media_url=audio_url, media_type="audio")
+    traits = analyzer.retrieve_traits()
+
+    return jsonify(traits)
+
+@app.route('/v1/analyze_video', methods=['POST'])
+def analyze_video():
+    data = request.get_json()
+    video_url = data.get('video_url')
+
+    if not video_url:
+        return jsonify({"error": "video_url is required"}), 400
+
+    analyzer = VideoAnalyzer(video_url=video_url)
+    traits = analyzer.retrieve_traits()
+
+    return jsonify(traits)
+
+if __name__ == '__main__':
+    app.run(host="0.0.0.0", port=7860)
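For reference, a minimal sketch of how a client could exercise these endpoints once the Space is up; the base URL and the audio_url value below are placeholders, not part of this commit:

import requests

BASE_URL = "http://localhost:7860"  # placeholder; substitute the deployed Space URL

# health check against the root route
print(requests.get(f"{BASE_URL}/").json())

# request trait analysis for a recording (hypothetical audio_url)
resp = requests.post(
    f"{BASE_URL}/v1/analyze_audio",
    json={"audio_url": "https://example.com/recording.mp3"},
)
print(resp.status_code, resp.json())

The /v1/analyze_video endpoint takes the same shape with a video_url key.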
audio_analyzer.py
ADDED
@@ -0,0 +1,78 @@
+import os
+import urllib.request
+import moviepy.editor as mp
+from openai import OpenAI
+from transformers import pipeline
+from dotenv import load_dotenv
+
+load_dotenv()
+
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+TWILIO_ACCOUNT_SID = os.getenv("TWILIO_ACCOUNT_SID")
+TWILIO_AUTH_TOKEN = os.getenv("TWILIO_AUTH_TOKEN")
+client = OpenAI(api_key=OPENAI_API_KEY)
+
+class AudioAnalyzer:
+    def __init__(self, media_path=None, media_url=None, media_type=None):
+        self.personality_labels = [
+            "Empathetic", "Resilient", "Optimistic", "Pessimistic", "Introverted", "Extroverted", "Curious", "Creative", "Analytical", "Dependable", "Impulsive", "Adaptable", "Meticulous", "Assertive", "Agreeable", "Courageous", "Cautious", "Patient", "Ambitious", "Generous"
+        ]
+        self.personality_pipeline = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
+        self.media_path = media_path
+        self.media_url = media_url
+        self.media_type = media_type
+
+        if media_url is not None:
+            self.download_content()
+
+        if media_type == "video":
+            self.extract_audio_from_video(self.media_path, "./tmp/audio.mp3")
+            self.media_path = "./tmp/audio.mp3"
+
+    def download_content(self):
+        download_url = self.media_url
+
+        # Set default paths if not provided
+        if self.media_type == "video" and not self.media_path:
+            self.media_path = "./tmp/video.mp4"
+        if self.media_type == "audio" and not self.media_path:
+            self.media_path = "./tmp/audio.mp3"
+        os.makedirs("./tmp", exist_ok=True)
+
+        password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
+        password_mgr.add_password(None, download_url, TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN)
+        handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
+        opener = urllib.request.build_opener(handler)
+        urllib.request.install_opener(opener)
+        urllib.request.urlretrieve(download_url, self.media_path)
+
+    def extract_audio_from_video(self, video_file, audio_file):
+        clip = mp.VideoFileClip(video_file)
+        clip.audio.write_audiofile(audio_file)
+
+    def transcribe_audio_to_text(self, audio_file):
+        with open(audio_file, "rb") as audio:
+            transcription = client.audio.transcriptions.create(
+                model="whisper-1",
+                file=audio
+            )
+        video_text = transcription.text.strip()
+        return video_text
+
+    def retrieve_traits(self):
+        # Ensure we have an audio path
+        if not self.media_path:
+            raise ValueError("Media path is not specified.")
+
+        print("Transcribing audio to text...")
+        transcript = self.transcribe_audio_to_text(self.media_path)
+        print("Transcription complete")
+
+        print("Running through personality pipeline...")
+        result = self.personality_pipeline(transcript, candidate_labels=self.personality_labels)
+
+        top_traits = sorted(zip(result['labels'], result['scores']), key=lambda x: x[1], reverse=True)[:5]
+        traits = {label: score for label, score in top_traits}
+
+        print(traits)
+        return traits
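The trait extraction in retrieve_traits is plain zero-shot classification: the transcript is scored against every candidate label and the five highest-scoring labels are kept. A standalone sketch of that step, using a made-up sample text and a trimmed label set:

from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

text = "I love meeting new people and I always look on the bright side."  # made-up sample
labels = ["Optimistic", "Pessimistic", "Introverted", "Extroverted", "Analytical"]

result = classifier(text, candidate_labels=labels)
# the pipeline already returns labels sorted by descending score, so the
# sorted(...) call in retrieve_traits amounts to taking the first five pairs
top_traits = dict(zip(result["labels"][:5], result["scores"][:5]))
print(top_traits)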
playground.py
ADDED
@@ -0,0 +1,110 @@
+import os
+import json
+import base64
+import requests
+from openai import OpenAI
+from dotenv import load_dotenv
+from moviepy.editor import VideoFileClip
+
+load_dotenv()
+
+audio_filename = "extracted_audio.wav"
+image_filename = "extracted_image.jpg"
+
+api_key = os.getenv("OPENAI_API_KEY")
+client = OpenAI(api_key=api_key)
+
+video = VideoFileClip("zach.mov")
+audio = video.audio
+audio.write_audiofile(audio_filename)
+
+def encode_image(image_path):
+    with open(image_path, "rb") as image_file:
+        return base64.b64encode(image_file.read()).decode('utf-8')
+
+audio_file = open(audio_filename, "rb")
+
+# Extract an image halfway through the video
+halfway_time = video.duration / 2
+video.save_frame(image_filename, t=halfway_time)
+
+transcription = client.audio.transcriptions.create(
+    model="whisper-1",
+    file=audio_file
+)
+video_text = transcription.text.strip()
+
+
+# Analyze sentiment using the chat completions API
+prompt = f"""Analyze the sentiment of the following text:\n\n{video_text}
+
+You should respond in json format, as an object with key `response` and value as a string.
+"""
+completion = client.chat.completions.create(
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": prompt}
+    ],
+    model="gpt-3.5-turbo-1106",
+    response_format={"type": "json_object"},
+)
+response = completion.choices[0].message.content
+result = json.loads(response)
+parsed = result['response']
+print(parsed)
+
+
+# Analyze personality traits using the chat completions API
+prompt = f"""Analyze the personality traits of the speaker in the following text:\n\n{video_text}
+
+You should respond in json format, as an object with key `response` and value as an array of personality traits, like "funny", "happy", "sarcastic".
+"""
+completion = client.chat.completions.create(
+    messages=[
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": prompt}
+    ],
+    model="gpt-3.5-turbo-1106",
+    response_format={"type": "json_object"},
+)
+response = completion.choices[0].message.content
+result = json.loads(response)
+parsed = result['response']
+print(parsed)
+
+
+# Getting the base64 string
+base64_image = encode_image(image_filename)
+
+headers = {
+    "Content-Type": "application/json",
+    "Authorization": f"Bearer {api_key}"
+}
+
+payload = {
+    "model": "gpt-4o",
+    "messages": [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "Describe the person in this image. Be detailed."
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/jpeg;base64,{base64_image}"
+                    }
+                }
+            ]
+        }
+    ],
+    "max_tokens": 300
+}
+
+response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
+
+json_data = response.json()
+parsed = json_data['choices'][0]['message']['content']
+print(parsed)
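The two chat-completion calls above differ only in their prompt text, so the boilerplate could be folded into one helper. A possible refactor sketch (ask_json is a hypothetical name; it assumes the client, json, and video_text bindings defined earlier in this file):

def ask_json(prompt):
    # send a JSON-mode chat completion and return the parsed `response` field
    completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        model="gpt-3.5-turbo-1106",
        response_format={"type": "json_object"},
    )
    return json.loads(completion.choices[0].message.content)["response"]

sentiment = ask_json(
    f"Analyze the sentiment of the following text:\n\n{video_text}\n\n"
    "You should respond in json format, as an object with key `response` and value as a string."
)
print(sentiment)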
requirements.txt
ADDED
@@ -0,0 +1,8 @@
+flask==3.0.3
+moviepy==1.0.3
+transformers==4.42.4
+openai==1.34.0
+python-dotenv==1.0.1
+torch==2.3.1
+requests==2.32.3
+tqdm==4.66.4
runtime.txt
ADDED
@@ -0,0 +1 @@
+python-3.12.4
video_analyzer.py
ADDED
@@ -0,0 +1,116 @@
+import os
+import json
+import base64
+import requests
+from tqdm import tqdm
+from dotenv import load_dotenv
+from moviepy.editor import VideoFileClip
+import re
+import urllib.request
+
+load_dotenv()
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+TWILIO_ACCOUNT_SID = os.getenv("TWILIO_ACCOUNT_SID")
+TWILIO_AUTH_TOKEN = os.getenv("TWILIO_AUTH_TOKEN")
+
+class VideoAnalyzer:
+    def __init__(self, video_path=None, video_url=None, num_images=3):
+        self.video_path = video_path
+        self.video_url = video_url
+        self.api_key = OPENAI_API_KEY
+        self.num_images = num_images
+        self.headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}"
+        }
+
+        if self.video_url and not self.video_path:
+            self.download_video()
+
+    def download_video(self):
+        self.video_path = "./tmp/video.mp4"
+        os.makedirs("./tmp", exist_ok=True)
+        password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
+        password_mgr.add_password(None, self.video_url, TWILIO_ACCOUNT_SID, TWILIO_AUTH_TOKEN)
+        handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
+        opener = urllib.request.build_opener(handler)
+        urllib.request.install_opener(opener)
+        urllib.request.urlretrieve(self.video_url, self.video_path)
+
+    def extract_images(self):
+        os.makedirs("./tmp", exist_ok=True)
+        video = VideoFileClip(self.video_path)
+        duration = video.duration
+        time_intervals = [i * (duration / (self.num_images + 1)) for i in range(1, self.num_images + 1)]
+        self.image_filenames = [f"./tmp/image_{i}.jpg" for i in range(self.num_images)]
+
+        for i, t in enumerate(time_intervals):
+            video.save_frame(self.image_filenames[i], t=t)
+
+    def encode_image(self, image_path):
+        with open(image_path, "rb") as image_file:
+            return base64.b64encode(image_file.read()).decode('utf-8')
+
+    def analyze_images(self):
+        traits_list = []
+
+        for image_filename in tqdm(self.image_filenames, desc="Processing images"):
+            base64_image = self.encode_image(image_filename)
+            payload = {
+                "model": "gpt-4o",
+                "messages": [
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": "Describe the person in this image. Be detailed as it pertains to their physical and emotional state. Please return the response in a list of traits or characteristics in a comma-separated list."
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:image/jpeg;base64,{base64_image}"
+                                }
+                            }
+                        ]
+                    }
+                ],
+                "max_tokens": 300
+            }
+
+            response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, json=payload)
+            response_data = response.json()
+            description = response_data['choices'][0]['message']['content']
+            traits = description.split(",")
+            traits_list.extend(traits)
+
+        return traits_list
+
+    def clean_trait(self, trait):
+        # Remove line breaks, leading/trailing whitespace, and unnecessary dashes
+        cleaned_trait = trait.replace('\n', '').strip()
+        cleaned_trait = re.sub(r'^-+', '', cleaned_trait).strip()
+        cleaned_trait = cleaned_trait.rstrip('.')
+
+        # If the cleaned trait has more than 4 words, it's likely a sentence, so drop it
+        if len(cleaned_trait.split()) > 4:
+            return None
+
+        return cleaned_trait
+
+    def retrieve_traits(self):
+        self.extract_images()
+        traits = self.analyze_images()
+
+        cleaned_traits = [self.clean_trait(trait) for trait in traits]
+        cleaned_traits = [trait for trait in cleaned_traits if trait]
+        common_traits = list(set(cleaned_traits))
+        return common_traits
+
+if __name__ == "__main__":
+    # Use either video_path or video_url
+    # video_path = "./raw/zach.mov"
+    video_url = "https://video.twilio.com/v1/Recordings/RT2c1baf50b6343802964c98e5a6f979e3/Media"
+    analyzer = VideoAnalyzer(video_url=video_url)
+    traits = analyzer.retrieve_traits()
+    print(traits)
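extract_images samples frames at evenly spaced interior timestamps, i * duration / (num_images + 1) for i in 1..num_images, so the very start and end of the clip are never sampled. A quick check of that arithmetic with hypothetical numbers:

duration = 60.0   # hypothetical 60-second clip
num_images = 3
time_intervals = [i * (duration / (num_images + 1)) for i in range(1, num_images + 1)]
print(time_intervals)  # [15.0, 30.0, 45.0]: frames at 25%, 50%, and 75% of the clip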