Update utils.py
utils.py (CHANGED)
@@ -5,43 +5,69 @@ import subprocess
 from transformers import pipeline
 from concurrent.futures import ThreadPoolExecutor
 import re
+import json
+from hashlib import md5
+import browser_cookie3

 class VideoProcessor:
     def __init__(self):
-        self.summarizer = pipeline("summarization",
-        self.
+        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+        self.models = {}
+        self.cookie_file = "cookies.txt"  # Path to your cookies file
+
     def load_model(self, model_size="base"):
         if model_size not in self.models:
             self.models[model_size] = whisper.load_model(model_size)
         return self.models[model_size]

-    def
-        """
+    def _download_with_cookies(self, url):
+        """Method 1: Download using browser cookies"""
+        cmd = [
+            "yt-dlp",
+            "--cookies", self.cookie_file,
+            "--extract-audio",
+            "--audio-format", "mp3",
+            "--audio-quality", "0",
+            "--quiet",
+            "-o", os.path.join(tempfile.mkdtemp(), "audio.%(ext)s"),
+            url
+        ]
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        if result.returncode != 0:
+            raise Exception(f"Cookie download failed: {result.stderr}")
+        return self._find_downloaded_file()
+
+    def _download_with_yt_dlp(self, url):
+        """Method 2: Regular download"""
+        cmd = [
+            "yt-dlp",
+            "--extract-audio",
+            "--audio-format", "mp3",
+            "--quiet",
+            "-o", os.path.join(tempfile.mkdtemp(), "audio.%(ext)s"),
+            url
+        ]
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        if result.returncode != 0:
+            raise Exception(f"Download failed: {result.stderr}")
+        return self._find_downloaded_file()
+
+    def _find_downloaded_file(self):
+        """Helper to find downloaded audio file"""
+        for root, _, files in os.walk(tempfile.gettempdir()):
+            for file in files:
+                if file.endswith('.mp3'):
+                    return os.path.join(root, file)
+        raise Exception("Downloaded audio file not found")
+
+    def download_audio(self, url, use_cookies=False):
+        """Robust download with fallback methods"""
         try:
-            "--quiet",
-            "-o", audio_path,
-            youtube_url
-        ], check=True, capture_output=True)
-            # Find the actual downloaded file
-            for f in os.listdir(temp_dir):
-                if f.endswith('.mp3'):
-                    return os.path.join(temp_dir, f)
-            raise Exception("Audio file not found after download")
-        except subprocess.CalledProcessError as e:
-            raise Exception(f"Failed to download video: {e.stderr.decode()}")
+            if use_cookies and os.path.exists(self.cookie_file):
+                return self._download_with_cookies(url)
+            return self._download_with_yt_dlp(url)
+        except Exception as e:
+            raise Exception(f"All download methods failed: {str(e)}")

     def transcribe_audio(self, audio_path, model_size="base"):
         model = self.load_model(model_size)
@@ -49,54 +75,60 @@ class VideoProcessor:
         return result["text"]

     def clean_transcript(self, text):
-        """Remove filler words and repetitive phrases"""
         text = re.sub(r'\b(um|uh|like|you know)\b', '', text, flags=re.IGNORECASE)
         return re.sub(r'\s+', ' ', text).strip()

     def summarize_chunk(self, chunk):
-        return self.summarizer(chunk,
-            max_length=150,
-            min_length=30,
-            do_sample=False)[0]['summary_text']
+        return self.summarizer(chunk, max_length=150, min_length=30)[0]['summary_text']

     def summarize_text(self, text, chunk_size=1000):
-        """Parallelized summarization"""
         text = self.clean_transcript(text)
         chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

-        with ThreadPoolExecutor() as executor:
+        with ThreadPoolExecutor(max_workers=4) as executor:
             summaries = list(executor.map(self.summarize_chunk, chunks))

         return "\n".join(summaries)

     def extract_key_points(self, text):
-        """
-        Be specific and include numbers/dates when mentioned.
+        prompt = f"""Extract 5-7 key points from this transcript. Each point should:
+        - Start with a bullet (-)
+        - Be concise but specific
+        - Include numbers/dates when mentioned

         Transcript:
-        {text[:
+        {text[:8000]}

-        Key Points:
-        """
+        Key Points:"""

-        result = self.summarizer(prompt,
-        # Post-process to ensure bullet points
-        return re.sub(r'(^|\n)(?=\w)', '\n- ', result[0]['summary_text'])
+        result = self.summarizer(prompt, max_length=300, min_length=100)[0]['summary_text']
+        return re.sub(r'(^|\n)(?=\w)', '\n- ', result)
+
+    def get_video_id(self, url):
+        return md5(url.encode()).hexdigest()

-    def process(self, youtube_url, chunk_size=1000, model_size="base"):
-        transcript = self.transcribe_audio(audio_path, model_size)
+    def process(self, youtube_url, chunk_size=1000, model_size="base", use_cookies=False):
+        video_id = self.get_video_id(youtube_url)
+        cache_file = f"cache_{video_id}.json"

+        if os.path.exists(cache_file):
+            with open(cache_file) as f:
+                return json.load(f)
+
+        try:
+            audio_path = self.download_audio(youtube_url, use_cookies)
+            transcript = self.transcribe_audio(audio_path, model_size)
+
+            result = {
+                'summary': self.summarize_text(transcript, chunk_size),
+                'key_points': self.extract_key_points(transcript),
+                'transcript': transcript[:2000] + ("..." if len(transcript) > 2000 else "")
+            }
+
+            with open(cache_file, 'w') as f:
+                json.dump(result, f)
+
+            return result
+
+        except Exception as e:
+            return {'error': str(e)}
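For reference, here is a minimal usage sketch of the class as it stands after this commit. The "from utils import VideoProcessor" import path and the placeholder URL are assumptions; the method name, parameters, and returned keys ('summary', 'key_points', 'transcript', 'error') are taken from the diff above.

    # Usage sketch only; not part of the commit.
    from utils import VideoProcessor  # assumes this file is importable as utils

    processor = VideoProcessor()

    # use_cookies=True only takes effect if a cookies.txt file exists next to the app.
    result = processor.process(
        "https://www.youtube.com/watch?v=VIDEO_ID",  # placeholder URL
        chunk_size=1000,
        model_size="base",
        use_cookies=False,
    )

    if "error" in result:
        print("Processing failed:", result["error"])
    else:
        print(result["summary"])
        print(result["key_points"])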
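The commit imports browser_cookie3 but never calls it, while _download_with_cookies expects a Netscape-format cookies.txt at self.cookie_file. A plausible way to produce that file, sketched here as an assumption rather than anything the commit actually does, is to copy the browser's YouTube cookies into a MozillaCookieJar (the format yt-dlp's --cookies option reads). The export_youtube_cookies helper name and the choice of Chrome are illustrative only.

    # Sketch only; not part of the commit. Assumes cookies are read from a local
    # Chrome profile and written in Netscape format for yt-dlp's --cookies option.
    import browser_cookie3
    from http.cookiejar import MozillaCookieJar

    def export_youtube_cookies(path="cookies.txt"):
        # Pull YouTube cookies from the local browser profile.
        source = browser_cookie3.chrome(domain_name="youtube.com")
        jar = MozillaCookieJar(path)
        for cookie in source:
            jar.set_cookie(cookie)
        # Keep session cookies as well, since login state often lives in them.
        jar.save(ignore_discard=True, ignore_expires=True)

    if __name__ == "__main__":
        export_youtube_cookies()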