Kathir0011 committed
Commit 7fcdf2d · verified · 1 Parent(s): 8ce3ee8

Update app.py

Files changed (1)
  1. app.py +16 -7
app.py CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-import os
+import os, re
 
 from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
 from youtube_transcript_api import YouTubeTranscriptApi
@@ -16,16 +16,25 @@ from langchain.prompts.chat import (
 
 def get_transcript(video_url):
     try:
-        video_id = video_url.split("v=")[-1].split("&")[0]
+        # Use a regular expression to extract video ID from the YouTube URL
+        video_id_match = re.search(r"(?:https?://)?(?:www\.)?(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?)\/|\S*?[?&]v=)|youtu\.be\/)([a-zA-Z0-9_-]{11})", video_url)
 
+        if not video_id_match:
+            return "Invalid YouTube URL"
+
+        video_id = video_id_match.group(1)
+
+        # Fetch the transcript
         transcript = YouTubeTranscriptApi.get_transcript(video_id)
+
+        # Join the transcript text into a single string
         text = "\n".join([t["text"] for t in transcript])
-        print("text1: ->>>>" + text)
-        return text # Return transcript as string
+        return text # Return the transcript as a string
 
     except Exception as e:
         return f"Error fetching transcript: {str(e)}"
 
+
 
 def create_db_from_video_url(video_url, api_key):
     """
@@ -36,12 +45,12 @@ def create_db_from_video_url(video_url, api_key):
     transcripts = get_transcript(video_url)
 
     # Convert transcript string into a Document
-    docs = Document(page_content=transcripts)
+    doc_convert = Document(page_content=transcripts)
 
-    print(transcripts)
+    print(doc_convert)
     # cannot provide this directly to the model so we are splitting the transcripts into small chunks
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
-    docs = text_splitter.split_documents(docs)
+    docs = text_splitter.split_documents([doc_convert])
     print(docs)
 
     db = FAISS.from_documents(docs, embedding=embeddings)
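
For quick verification, here is a minimal standalone sketch of the ID-extraction pattern introduced in this commit. The helper name extract_video_id, the sample URLs, and the sample video ID are illustrative and not part of app.py:

import re

# Same pattern as the updated get_transcript(); it covers watch?v=, youtu.be, /embed/ and /v/ URL forms.
YOUTUBE_ID_PATTERN = re.compile(
    r"(?:https?://)?(?:www\.)?"
    r"(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?)\/|\S*?[?&]v=)|youtu\.be\/)"
    r"([a-zA-Z0-9_-]{11})"
)

def extract_video_id(url):
    """Return the 11-character video ID, or None if the URL does not match."""
    match = YOUTUBE_ID_PATTERN.search(url)
    return match.group(1) if match else None

# Illustrative checks (example URLs and ID, not taken from the commit)
assert extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ") == "dQw4w9WgXcQ"
assert extract_video_id("https://youtu.be/dQw4w9WgXcQ") == "dQw4w9WgXcQ"
assert extract_video_id("not a youtube link") is None

The other fix wraps the single Document in a list before splitting, since RecursiveCharacterTextSplitter.split_documents iterates over a sequence of Document objects. A rough sketch of that change, assuming the classic langchain import paths (adjust for newer langchain_core / langchain_text_splitters layouts):

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

doc_convert = Document(page_content="example transcript line\n" * 200)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
# split_documents() expects an iterable of Documents, so the single Document is wrapped in a list;
# passing the Document object itself (as the previous code did) raises an error.
docs = text_splitter.split_documents([doc_convert])
print(len(docs), "chunks")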