Kathir0011 committed on
Commit
68bbcdf
·
verified ·
1 Parent(s): b2ef7b4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -20
app.py CHANGED
@@ -1,6 +1,5 @@
1
  import gradio as gr
2
  import os
3
- import yt_dlp
4
 
5
  from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
6
  from youtube_transcript_api import YouTubeTranscriptApi
@@ -14,24 +13,22 @@ from langchain.prompts.chat import (
14
  )
15
 
16
 
17
- def get_transcript_yt_dlp(video_url):
18
- """Fetches transcript using yt_dlp."""
19
- ydl_opts = {
20
- "writesubtitles": True,
21
- "writeautomaticsub": True,
22
- "skip_download": True,
23
- "subtitleslangs": ["en"], # Fetch English subtitles
24
- }
25
-
26
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
27
- info_dict = ydl.extract_info(video_url, download=False)
28
- subtitles = info_dict.get("subtitles") or info_dict.get("automatic_captions")
29
-
30
- if subtitles and "en" in subtitles:
31
- sub_url = subtitles["en"][0]["url"]
32
- return f"Transcript URL: {sub_url}"
33
- else:
34
- return "No subtitles available!"
35
 
36
 
37
  def create_db_from_video_url(video_url, api_key):
@@ -40,7 +37,7 @@ def create_db_from_video_url(video_url, api_key):
40
  """
41
  embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004", google_api_key=api_key)
42
 
43
- transcripts = get_transcript_yt_dlp(video_url)
44
  print(transcripts)
45
  # cannot provide this directly to the model so we are splitting the transcripts into small chunks
46
 
 
1
  import gradio as gr
2
  import os
 
3
 
4
  from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
5
  from youtube_transcript_api import YouTubeTranscriptApi
 
13
  )
14
 
15
 
16
def _extract_video_id(video_url):
    """Return the YouTube video ID contained in *video_url*.

    Recognised forms:
      * watch URLs carrying a ``v`` query parameter
      * ``youtu.be/<id>`` short links
      * ``/shorts/<id>``, ``/embed/<id>``, ``/live/<id>`` and ``/v/<id>`` paths
      * anything else falls back to the legacy ``split("v=")`` behaviour, so
        bare IDs keep working

    Raises:
        ValueError: if *video_url* is empty, whitespace-only or ``None``.
    """
    # Function-scope import keeps this block self-contained without touching
    # the file's top-level import section.
    from urllib.parse import parse_qs, urlparse

    text = (video_url or "").strip()
    if not text:
        raise ValueError("no video URL provided")

    # urlparse only fills in the host when a scheme or a leading "//" is
    # present; add "//" so inputs like "youtu.be/<id>" parse properly.
    parsed = urlparse(text if "://" in text else f"//{text}")

    # Standard watch URL: take the "v" parameter wherever it sits in the
    # query string. A trailing "#fragment" is already split off by urlparse.
    query_ids = parse_qs(parsed.query).get("v")
    if query_ids:
        return query_ids[0]

    segments = [part for part in parsed.path.split("/") if part]
    host = (parsed.hostname or "").lower()

    # Short links carry the ID as the first path segment.
    if host in ("youtu.be", "www.youtu.be") and segments:
        return segments[0]

    # Shorts / embed / live / legacy "v" URLs carry it as the second segment.
    if len(segments) >= 2 and segments[0] in ("shorts", "embed", "live", "v"):
        return segments[1]

    # Legacy fallback: identical to the previous implementation, so inputs
    # that used to work (for example a bare video ID) are passed through.
    return text.split("v=")[-1].split("&")[0]


def get_transcript(video_url):
    """Fetch the transcript of a YouTube video as SRT-formatted text.

    Args:
        video_url: A YouTube URL (watch, youtu.be, shorts, embed, live) or a
            bare video ID.

    Returns:
        The SRT-formatted transcript on success. On any failure the function
        does not raise; it returns a string of the form
        ``"Error fetching transcript: <reason>"``.
    """
    try:
        video_id = _extract_video_id(video_url)

        # Imported here, inside the try, so the name is guaranteed to be in
        # scope and a missing or incompatible package is reported through the
        # same error string as any other failure.
        from youtube_transcript_api.formatters import SRTFormatter

        # Fetch the raw transcript entries for the video.
        transcript = YouTubeTranscriptApi.get_transcript(video_id)

        # Render the entries as SRT text.
        return SRTFormatter().format_transcript(transcript)

    # Deliberately broad: callers rely on always getting a string back,
    # whatever the transcript library or the network raises.
    except Exception as e:
        return f"Error fetching transcript: {e}"
 
 
32
 
33
 
34
  def create_db_from_video_url(video_url, api_key):
 
37
  """
38
  embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004", google_api_key=api_key)
39
 
40
+ transcripts = get_transcript(video_url)
41
  print(transcripts)
42
  # cannot provide this directly to the model so we are splitting the transcripts into small chunks
43