Pranav0111 committed on
Commit
3ed2d94
·
verified ·
1 Parent(s): 8cce560

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +91 -59
utils.py CHANGED
@@ -5,43 +5,69 @@ import subprocess
5
  from transformers import pipeline
6
  from concurrent.futures import ThreadPoolExecutor
7
  import re
 
 
 
8
 
9
  class VideoProcessor:
10
  def __init__(self):
11
- self.summarizer = pipeline("summarization",
12
- model="facebook/bart-large-cnn")
13
- self.models = {} # Cache loaded whisper models
14
-
15
  def load_model(self, model_size="base"):
16
  if model_size not in self.models:
17
  self.models[model_size] = whisper.load_model(model_size)
18
  return self.models[model_size]
19
 
20
- def download_audio(self, youtube_url):
21
- """Robust audio download with multiple fallbacks"""
22
- temp_dir = tempfile.mkdtemp()
23
- audio_path = os.path.join(temp_dir, "audio.mp3")
24
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  try:
26
- subprocess.run([
27
- "yt-dlp",
28
- "-x", "--audio-format", "mp3",
29
- "--audio-quality", "0",
30
- "--no-playlist",
31
- "--quiet",
32
- "-o", audio_path,
33
- youtube_url
34
- ], check=True, capture_output=True)
35
-
36
- # Find the actual downloaded file
37
- for f in os.listdir(temp_dir):
38
- if f.endswith('.mp3'):
39
- return os.path.join(temp_dir, f)
40
-
41
- raise Exception("Audio file not found after download")
42
-
43
- except subprocess.CalledProcessError as e:
44
- raise Exception(f"Failed to download video: {e.stderr.decode()}")
45
 
46
  def transcribe_audio(self, audio_path, model_size="base"):
47
  model = self.load_model(model_size)
@@ -49,54 +75,60 @@ class VideoProcessor:
49
  return result["text"]
50
 
51
  def clean_transcript(self, text):
52
- """Remove filler words and repetitive phrases"""
53
  text = re.sub(r'\b(um|uh|like|you know)\b', '', text, flags=re.IGNORECASE)
54
  return re.sub(r'\s+', ' ', text).strip()
55
 
56
  def summarize_chunk(self, chunk):
57
- return self.summarizer(chunk,
58
- max_length=150,
59
- min_length=30,
60
- do_sample=False)[0]['summary_text']
61
 
62
  def summarize_text(self, text, chunk_size=1000):
63
- """Parallelized summarization"""
64
  text = self.clean_transcript(text)
65
  chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
66
 
67
- with ThreadPoolExecutor() as executor:
68
  summaries = list(executor.map(self.summarize_chunk, chunks))
69
 
70
  return "\n".join(summaries)
71
 
72
  def extract_key_points(self, text):
73
- """Improved key point extraction"""
74
- prompt = f"""
75
- Extract 5-7 most important key points from this transcript.
76
- Format each point as a bullet starting with -.
77
- Be specific and include numbers/dates when mentioned.
78
 
79
  Transcript:
80
- {text[:10000]} # Limit context window
81
 
82
- Key Points:
83
- """
84
 
85
- result = self.summarizer(prompt,
86
- max_length=300,
87
- min_length=100,
88
- do_sample=False)
89
-
90
- # Post-process to ensure bullet points
91
- return re.sub(r'(^|\n)(?=\w)', '\n- ', result[0]['summary_text'])
92
 
93
- def process(self, youtube_url, chunk_size=1000, model_size="base"):
94
- """Main processing pipeline"""
95
- audio_path = self.download_audio(youtube_url)
96
- transcript = self.transcribe_audio(audio_path, model_size)
97
 
98
- return {
99
- 'summary': self.summarize_text(transcript, chunk_size),
100
- 'key_points': self.extract_key_points(transcript),
101
- 'transcript': transcript[:2000] + ("..." if len(transcript) > 2000 else "")
102
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  from transformers import pipeline
6
  from concurrent.futures import ThreadPoolExecutor
7
  import re
8
+ import json
9
+ from hashlib import md5
10
+ import browser_cookie3
11
 
12
  class VideoProcessor:
13
  def __init__(self):
14
+ self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
15
+ self.models = {}
16
+ self.cookie_file = "cookies.txt" # Path to your cookies file
17
+
18
  def load_model(self, model_size="base"):
19
  if model_size not in self.models:
20
  self.models[model_size] = whisper.load_model(model_size)
21
  return self.models[model_size]
22
 
23
+ def _download_with_cookies(self, url):
24
+ """Method 1: Download using browser cookies"""
25
+ cmd = [
26
+ "yt-dlp",
27
+ "--cookies", self.cookie_file,
28
+ "--extract-audio",
29
+ "--audio-format", "mp3",
30
+ "--audio-quality", "0",
31
+ "--quiet",
32
+ "-o", os.path.join(tempfile.mkdtemp(), "audio.%(ext)s"),
33
+ url
34
+ ]
35
+ result = subprocess.run(cmd, capture_output=True, text=True)
36
+ if result.returncode != 0:
37
+ raise Exception(f"Cookie download failed: {result.stderr}")
38
+ return self._find_downloaded_file()
39
+
40
+ def _download_with_yt_dlp(self, url):
41
+ """Method 2: Regular download"""
42
+ cmd = [
43
+ "yt-dlp",
44
+ "--extract-audio",
45
+ "--audio-format", "mp3",
46
+ "--quiet",
47
+ "-o", os.path.join(tempfile.mkdtemp(), "audio.%(ext)s"),
48
+ url
49
+ ]
50
+ result = subprocess.run(cmd, capture_output=True, text=True)
51
+ if result.returncode != 0:
52
+ raise Exception(f"Download failed: {result.stderr}")
53
+ return self._find_downloaded_file()
54
+
55
+ def _find_downloaded_file(self):
56
+ """Helper to find downloaded audio file"""
57
+ for root, _, files in os.walk(tempfile.gettempdir()):
58
+ for file in files:
59
+ if file.endswith('.mp3'):
60
+ return os.path.join(root, file)
61
+ raise Exception("Downloaded audio file not found")
62
+
63
+ def download_audio(self, url, use_cookies=False):
64
+ """Robust download with fallback methods"""
65
  try:
66
+ if use_cookies and os.path.exists(self.cookie_file):
67
+ return self._download_with_cookies(url)
68
+ return self._download_with_yt_dlp(url)
69
+ except Exception as e:
70
+ raise Exception(f"All download methods failed: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
  def transcribe_audio(self, audio_path, model_size="base"):
73
  model = self.load_model(model_size)
 
75
  return result["text"]
76
 
77
  def clean_transcript(self, text):
 
78
  text = re.sub(r'\b(um|uh|like|you know)\b', '', text, flags=re.IGNORECASE)
79
  return re.sub(r'\s+', ' ', text).strip()
80
 
81
  def summarize_chunk(self, chunk):
82
+ return self.summarizer(chunk, max_length=150, min_length=30)[0]['summary_text']
 
 
 
83
 
84
  def summarize_text(self, text, chunk_size=1000):
 
85
  text = self.clean_transcript(text)
86
  chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
87
 
88
+ with ThreadPoolExecutor(max_workers=4) as executor:
89
  summaries = list(executor.map(self.summarize_chunk, chunks))
90
 
91
  return "\n".join(summaries)
92
 
93
  def extract_key_points(self, text):
94
+ prompt = f"""Extract 5-7 key points from this transcript. Each point should:
95
+ - Start with a bullet (-)
96
+ - Be concise but specific
97
+ - Include numbers/dates when mentioned
 
98
 
99
  Transcript:
100
+ {text[:8000]}
101
 
102
+ Key Points:"""
 
103
 
104
+ result = self.summarizer(prompt, max_length=300, min_length=100)[0]['summary_text']
105
+ return re.sub(r'(^|\n)(?=\w)', '\n- ', result)
106
+
107
+ def get_video_id(self, url):
108
+ return md5(url.encode()).hexdigest()
 
 
109
 
110
+ def process(self, youtube_url, chunk_size=1000, model_size="base", use_cookies=False):
111
+ video_id = self.get_video_id(youtube_url)
112
+ cache_file = f"cache_{video_id}.json"
 
113
 
114
+ if os.path.exists(cache_file):
115
+ with open(cache_file) as f:
116
+ return json.load(f)
117
+
118
+ try:
119
+ audio_path = self.download_audio(youtube_url, use_cookies)
120
+ transcript = self.transcribe_audio(audio_path, model_size)
121
+
122
+ result = {
123
+ 'summary': self.summarize_text(transcript, chunk_size),
124
+ 'key_points': self.extract_key_points(transcript),
125
+ 'transcript': transcript[:2000] + ("..." if len(transcript) > 2000 else "")
126
+ }
127
+
128
+ with open(cache_file, 'w') as f:
129
+ json.dump(result, f)
130
+
131
+ return result
132
+
133
+ except Exception as e:
134
+ return {'error': str(e)}