Update utils.py
utils.py (CHANGED)
@@ -5,43 +5,69 @@ import subprocess
 from transformers import pipeline
 from concurrent.futures import ThreadPoolExecutor
 import re
+import json
+from hashlib import md5
+import browser_cookie3

 class VideoProcessor:
     def __init__(self):
-        self.summarizer = pipeline("summarization",
-        self.
+        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+        self.models = {}
+        self.cookie_file = "cookies.txt"  # Path to your cookies file
+
     def load_model(self, model_size="base"):
         if model_size not in self.models:
             self.models[model_size] = whisper.load_model(model_size)
         return self.models[model_size]

-    def
-        """
+    def _download_with_cookies(self, url):
+        """Method 1: Download using browser cookies"""
+        cmd = [
+            "yt-dlp",
+            "--cookies", self.cookie_file,
+            "--extract-audio",
+            "--audio-format", "mp3",
+            "--audio-quality", "0",
+            "--quiet",
+            "-o", os.path.join(tempfile.mkdtemp(), "audio.%(ext)s"),
+            url
+        ]
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        if result.returncode != 0:
+            raise Exception(f"Cookie download failed: {result.stderr}")
+        return self._find_downloaded_file()
+
+    def _download_with_yt_dlp(self, url):
+        """Method 2: Regular download"""
+        cmd = [
+            "yt-dlp",
+            "--extract-audio",
+            "--audio-format", "mp3",
+            "--quiet",
+            "-o", os.path.join(tempfile.mkdtemp(), "audio.%(ext)s"),
+            url
+        ]
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        if result.returncode != 0:
+            raise Exception(f"Download failed: {result.stderr}")
+        return self._find_downloaded_file()
+
+    def _find_downloaded_file(self):
+        """Helper to find downloaded audio file"""
+        for root, _, files in os.walk(tempfile.gettempdir()):
+            for file in files:
+                if file.endswith('.mp3'):
+                    return os.path.join(root, file)
+        raise Exception("Downloaded audio file not found")
+
+    def download_audio(self, url, use_cookies=False):
+        """Robust download with fallback methods"""
         try:
-            "--quiet",
-            "-o", audio_path,
-            youtube_url
-        ], check=True, capture_output=True)
-            # Find the actual downloaded file
-            for f in os.listdir(temp_dir):
-                if f.endswith('.mp3'):
-                    return os.path.join(temp_dir, f)
-            raise Exception("Audio file not found after download")
-        except subprocess.CalledProcessError as e:
-            raise Exception(f"Failed to download video: {e.stderr.decode()}")
+            if use_cookies and os.path.exists(self.cookie_file):
+                return self._download_with_cookies(url)
+            return self._download_with_yt_dlp(url)
+        except Exception as e:
+            raise Exception(f"All download methods failed: {str(e)}")

     def transcribe_audio(self, audio_path, model_size="base"):
         model = self.load_model(model_size)
@@ -49,54 +75,60 @@ class VideoProcessor:
         return result["text"]

     def clean_transcript(self, text):
-        """Remove filler words and repetitive phrases"""
         text = re.sub(r'\b(um|uh|like|you know)\b', '', text, flags=re.IGNORECASE)
         return re.sub(r'\s+', ' ', text).strip()

     def summarize_chunk(self, chunk):
-        return self.summarizer(chunk,
-            max_length=150,
-            min_length=30,
-            do_sample=False)[0]['summary_text']
+        return self.summarizer(chunk, max_length=150, min_length=30)[0]['summary_text']

     def summarize_text(self, text, chunk_size=1000):
-        """Parallelized summarization"""
         text = self.clean_transcript(text)
         chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

-        with ThreadPoolExecutor() as executor:
+        with ThreadPoolExecutor(max_workers=4) as executor:
             summaries = list(executor.map(self.summarize_chunk, chunks))

         return "\n".join(summaries)

     def extract_key_points(self, text):
-        """
-        Be specific and include numbers/dates when mentioned.
+        prompt = f"""Extract 5-7 key points from this transcript. Each point should:
+        - Start with a bullet (-)
+        - Be concise but specific
+        - Include numbers/dates when mentioned

         Transcript:
-        {text[:
+        {text[:8000]}

-        Key Points:
-        """
+        Key Points:"""

-        result = self.summarizer(prompt,
-        # Post-process to ensure bullet points
-        return re.sub(r'(^|\n)(?=\w)', '\n- ', result[0]['summary_text'])
+        result = self.summarizer(prompt, max_length=300, min_length=100)[0]['summary_text']
+        return re.sub(r'(^|\n)(?=\w)', '\n- ', result)
+
+    def get_video_id(self, url):
+        return md5(url.encode()).hexdigest()

-    def process(self, youtube_url, chunk_size=1000, model_size="base"):
-        transcript = self.transcribe_audio(audio_path, model_size)
+    def process(self, youtube_url, chunk_size=1000, model_size="base", use_cookies=False):
+        video_id = self.get_video_id(youtube_url)
+        cache_file = f"cache_{video_id}.json"

+        if os.path.exists(cache_file):
+            with open(cache_file) as f:
+                return json.load(f)
+
+        try:
+            audio_path = self.download_audio(youtube_url, use_cookies)
+            transcript = self.transcribe_audio(audio_path, model_size)
+
+            result = {
+                'summary': self.summarize_text(transcript, chunk_size),
+                'key_points': self.extract_key_points(transcript),
+                'transcript': transcript[:2000] + ("..." if len(transcript) > 2000 else "")
+            }
+
+            with open(cache_file, 'w') as f:
+                json.dump(result, f)
+
+            return result
+
+        except Exception as e:
+            return {'error': str(e)}
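For reference, here is a minimal usage sketch of the class as it stands after this commit. The "from utils import VideoProcessor" import path and the placeholder URL are assumptions; the method name, parameters, and returned keys ('summary', 'key_points', 'transcript', 'error') are taken from the diff above.

    # Usage sketch only; not part of the commit.
    from utils import VideoProcessor  # assumes this file is importable as utils

    processor = VideoProcessor()

    # use_cookies=True only takes effect if a cookies.txt file exists next to the app.
    result = processor.process(
        "https://www.youtube.com/watch?v=VIDEO_ID",  # placeholder URL
        chunk_size=1000,
        model_size="base",
        use_cookies=False,
    )

    if "error" in result:
        print("Processing failed:", result["error"])
    else:
        print(result["summary"])
        print(result["key_points"])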
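The commit imports browser_cookie3 but never calls it, while _download_with_cookies expects a Netscape-format cookies.txt at self.cookie_file. A plausible way to produce that file, sketched here as an assumption rather than anything the commit actually does, is to copy the browser's YouTube cookies into a MozillaCookieJar (the format yt-dlp's --cookies option reads). The export_youtube_cookies helper name and the choice of Chrome are illustrative only.

    # Sketch only; not part of the commit. Assumes cookies are read from a local
    # Chrome profile and written in Netscape format for yt-dlp's --cookies option.
    import browser_cookie3
    from http.cookiejar import MozillaCookieJar

    def export_youtube_cookies(path="cookies.txt"):
        # Pull YouTube cookies from the local browser profile.
        source = browser_cookie3.chrome(domain_name="youtube.com")
        jar = MozillaCookieJar(path)
        for cookie in source:
            jar.set_cookie(cookie)
        # Keep session cookies as well, since login state often lives in them.
        jar.save(ignore_discard=True, ignore_expires=True)

    if __name__ == "__main__":
        export_youtube_cookies()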