mustafoyev202 committed
Commit ee9d6a2 · verified · 1 Parent(s): 384313a

Upload 2 files

Files changed (2)
  1. main.py +51 -0
  2. utils.py +190 -0
main.py ADDED
@@ -0,0 +1,51 @@
import gradio as gr
import os
import zipfile
import tempfile
from multiprocessing import Pool, cpu_count
from utils import getting_usage_info_from_results, process_multiple_videos_from_results, wrapper_with_delay, set_torch_threads

set_torch_threads(safe_ratio=0.5)

def gradio_interface(zip_file):
    """Handles Gradio input: unzip and process videos."""
    with tempfile.TemporaryDirectory() as temp_dir:
        with zipfile.ZipFile(zip_file.name, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)

        video_paths = [
            os.path.join(temp_dir, f) for f in os.listdir(temp_dir) if f.endswith(".mp4")
        ]

        if not video_paths:
            raise ValueError("No .mp4 video files found in the zip archive.")

        # First processing: returns results from analyze_single_video
        results = []
        with Pool(min(cpu_count(), len(video_paths))) as pool:
            results = pool.map(wrapper_with_delay, video_paths)

        df_result = process_multiple_videos_from_results(results)
        df_info = getting_usage_info_from_results(video_paths, results)

        csv_result = "emotion_results.csv"
        csv_info = "usage_info.csv"

        df_result.to_csv(csv_result, index=False)
        df_info.to_csv(csv_info, index=False)

        return df_result, df_info, csv_result, csv_info

# Gradio interface
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.File(file_types=[".zip"], label="Upload a ZIP of videos"),
    outputs=[gr.DataFrame(label="Emotion Analysis"),
             gr.DataFrame(label="Token Usage Estimation"),
             gr.File(label="Download Result CSV"),
             gr.File(label="Download Usage Info CSV")],
    title="Batch Video Emotion Analyzer (ZIP Upload)",
    description="Upload a .zip file containing .mp4 videos. The app will extract and analyze the emotions in parallel."
)

iface.launch(share=True)
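
A minimal end-to-end check of one file, bypassing the Gradio UI (a hypothetical sketch: it assumes a sample.mp4 in the working directory and a valid GOOGLE_API_KEY in .env; running main.py itself starts the web app instead):

# Hypothetical single-video check; sample.mp4 and the API key are assumptions.
from utils import analyze_single_video

path, result, in_tok, out_tok, total_tok = analyze_single_video("sample.mp4")
print(result)                      # parsed Vocal/Verbal/Vision scores, or None on failure
print(in_tok, out_tok, total_tok)  # token usage reported by the API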
utils.py ADDED
@@ -0,0 +1,190 @@
from moviepy.editor import VideoFileClip, concatenate_videoclips
from pydub import AudioSegment
import numpy as np
import torch
from silero_vad import load_silero_vad, get_speech_timestamps
import os
import json
from google import genai
import pandas as pd
import re
import time
from dotenv import load_dotenv

torch.set_num_threads(1)

load_dotenv()
client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))

def set_torch_threads(safe_ratio=0.5):
    try:
        total_cores = os.cpu_count()
        optimal_threads = max(1, int(total_cores * safe_ratio))
        torch.set_num_threads(optimal_threads)
        print(f"Set torch threads to: {optimal_threads} (out of {total_cores} cores)")
    except Exception as e:
        print(f"Failed to set torch threads dynamically: {e}")
        torch.set_num_threads(1)

def analyze_single_video(video_path):
    """Analyzes a single video for emotions using the GenAI model."""
    prompt = """
    Detect emotion from this video and classify into 3 categories: happy, sad, normal. Return only JSON format without any extra text.

    Return this JSON schema:

    {
        "Vocal": {
            "sad_score": (%),
            "happy_score": (%),
            "normal_score": (%),
            "sad_reason": (list of timestamps),
            "happy_reason": (list of timestamps),
            "normal_reason": (list of timestamps)
        },
        "Verbal": {
            "sad_score": (%),
            "happy_score": (%),
            "normal_score": (%),
            "sad_reason": (list of timestamps),
            "happy_reason": (list of timestamps),
            "normal_reason": (list of timestamps)
        },
        "Vision": {
            "sad_score": (%),
            "happy_score": (%),
            "normal_score": (%),
            "sad_reason": (list of timestamps),
            "happy_reason": (list of timestamps),
            "normal_reason": (list of timestamps)
        }
    }

    Reasons (sad_reason, happy_reason, normal_reason) should be a list of beginning-ending timestamps. For example: ['0:11-0:14', '0:23-0:25', '0:27-0:29']
    """

    try:
        with open(video_path, 'rb') as video_file:
            video_bytes = video_file.read()

        print(f"Processing: {video_path}")

        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=[{"text": prompt}, {"inline_data": {"data": video_bytes, "mime_type": "video/mp4"}}],
            config={"http_options": {"timeout": 60000}}
        )

        # Extract token usage information
        input_token = response.usage_metadata.prompt_token_count
        output_token = response.usage_metadata.candidates_token_count
        total_token = response.usage_metadata.total_token_count

        # Strip an optional Markdown json fence from the model output before parsing
        response_text = response.text.strip()
        json_match = re.search(r'```json\s*([\s\S]*?)\s*```', response_text)
        json_string = json_match.group(1).strip() if json_match else response_text
        result = json.loads(json_string)

        return (video_path, result, input_token, output_token, total_token)

    except Exception as e:
        print(f"Error processing {video_path}: {e}")
        return (video_path, None, 0, 0, 0)

def wrapper_with_delay(video_path):
    time.sleep(2)  # Add delay to avoid throttling
    return analyze_single_video(video_path)

def process_multiple_videos_from_results(results):
    """Processes results directly without re-analyzing."""
    records = []

    for video_path, result, _, _, _ in results:
        if result is None:
            continue

        video_title = os.path.basename(video_path)

        for category in ['Verbal', 'Vocal', 'Vision']:
            for emotion in ['normal', 'happy', 'sad']:
                score = result[category].get(f"{emotion}_score", 0)
                reasons = result[category].get(f"{emotion}_reason", [])
                records.append({
                    'title': video_title,
                    'category': category,
                    'emotion': emotion,
                    'score': score,
                    'reasons': json.dumps(reasons)
                })

    df = pd.DataFrame(records)
    return df

def getting_video_length(vid):
    clip = VideoFileClip(vid)
    duration = clip.duration
    return np.round(duration, decimals=2)

def get_speech_only_video_duration(video_path: str, sampling_rate: int = 16000, use_onnx: bool = False) -> float:
    # Load VAD model
    model = load_silero_vad(onnx=use_onnx)

    # Extract audio from video using pydub
    audio = AudioSegment.from_file(video_path).set_frame_rate(sampling_rate).set_channels(1)
    samples = np.array(audio.get_array_of_samples()).astype("float32") / (2**15)
    audio_tensor = torch.from_numpy(samples)

    # Get speech timestamps
    speech_timestamps = get_speech_timestamps(audio_tensor, model, sampling_rate=sampling_rate)

    # Convert sample indices to seconds
    for ts in speech_timestamps:
        ts['start'] /= sampling_rate
        ts['end'] /= sampling_rate

    if not speech_timestamps:
        return 0.0  # No speech detected

    # Load video
    video = VideoFileClip(video_path)

    # Extract speech-only clips
    clips = [video.subclip(ts['start'], ts['end']) for ts in speech_timestamps]

    # Concatenate and return duration
    final_video = concatenate_videoclips(clips)
    return final_video.duration

def getting_usage_info_from_results(video_paths, results):
    """Use pre-fetched results to avoid double processing."""
    filenames = np.vectorize(os.path.basename)(video_paths).reshape(-1, 1)
    durations = np.vectorize(getting_video_length)(video_paths).reshape(-1, 1)
    speech_durations = np.vectorize(get_speech_only_video_duration)(video_paths).reshape(-1, 1)

    # Failed videos already carry zero token counts, so keep one row per input
    # to stay aligned with video_paths when concatenating below.
    token_data = np.array([[r[2], r[3], r[4]] for r in results])
    if token_data.size == 0:
        token_data = np.zeros((len(video_paths), 3))

    token_data = token_data.astype(float)

    # Prices per 1M tokens: $0.10 for input, $0.40 for output.
    X = 1_000_000
    input_token_price = np.round(token_data[:, 0] * 0.10 / X, decimals=4).reshape(-1, 1)
    output_token_price = np.round(token_data[:, 1] * 0.40 / X, decimals=4).reshape(-1, 1)
    total_token_price = input_token_price + output_token_price

    final_arr = np.concatenate(
        (filenames, durations, speech_durations, token_data, input_token_price, output_token_price, total_token_price),
        axis=1
    )

    df = pd.DataFrame(
        final_arr,
        columns=[
            'title', 'total_duration(s)', 'speech_duration(s)', 'input_token', 'output_token', 'total_token',
            'input_price($)', 'output_price($)', 'total_price($)'
        ]
    )
    return df
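
To see how one parsed response is flattened into the long-format emotion table, process_multiple_videos_from_results can be exercised on a mocked result tuple (a minimal sketch with made-up data; no API call is made, although importing utils still expects GOOGLE_API_KEY because the genai client is created at import time). The pricing columns follow the per-million-token constants above: for example, 120,000 input tokens and 2,000 output tokens come to 120000 * 0.10 / 1e6 + 2000 * 0.40 / 1e6 = $0.012 + $0.0008 = $0.0128.

# Minimal sketch with a mocked analyze_single_video result (hypothetical data).
from utils import process_multiple_videos_from_results

mock_block = {
    "happy_score": 60, "sad_score": 10, "normal_score": 30,
    "happy_reason": ["0:11-0:14"], "sad_reason": [], "normal_reason": ["0:23-0:25"]
}
mock_result = {cat: dict(mock_block) for cat in ("Vocal", "Verbal", "Vision")}

# (video_path, parsed JSON, input_tokens, output_tokens, total_tokens)
results = [("clip1.mp4", mock_result, 120000, 2000, 122000)]
print(process_multiple_videos_from_results(results))
# Expect 9 rows: one per (category, emotion) with the score and a JSON-encoded reason list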