Kathir0011 committed on
Commit
14ab511
·
verified ·
1 Parent(s): 68bbcdf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -9
app.py CHANGED
@@ -15,17 +15,12 @@ from langchain.prompts.chat import (
15
 
16
  def get_transcript(video_url):
17
  try:
18
- # Extract video ID from the URL
19
  video_id = video_url.split("v=")[-1].split("&")[0]
20
 
21
- # Fetch transcript
22
  transcript = YouTubeTranscriptApi.get_transcript(video_id)
23
-
24
- # Optional: Format transcript to SRT format
25
- formatter = SRTFormatter()
26
- formatted_transcript = formatter.format_transcript(transcript)
27
-
28
- return formatted_transcript # Or return as plain text
29
 
30
  except Exception as e:
31
  return f"Error fetching transcript: {str(e)}"
@@ -38,9 +33,12 @@ def create_db_from_video_url(video_url, api_key):
38
  embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004", google_api_key=api_key)
39
 
40
  transcripts = get_transcript(video_url)
 
 
 
 
41
  print(transcripts)
42
  # cannot provide this directly to the model so we are splitting the transcripts into small chunks
43
-
44
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
45
  docs = text_splitter.split_documents(transcripts)
46
  print(docs)
 
15
 
16
  def get_transcript(video_url):
17
  try:
 
18
  video_id = video_url.split("v=")[-1].split("&")[0]
19
 
 
20
  transcript = YouTubeTranscriptApi.get_transcript(video_id)
21
+ text = "\n".join([t["text"] for t in transcript])
22
+ print("text1: ->>>>" + text)
23
+ return text # Return transcript as string
 
 
 
24
 
25
  except Exception as e:
26
  return f"Error fetching transcript: {str(e)}"
 
33
  embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004", google_api_key=api_key)
34
 
35
  transcripts = get_transcript(video_url)
36
+
37
+ # Convert transcript string into a Document
38
+ doc = Document(page_content=transcripts)
39
+
40
  print(transcripts)
41
  # cannot provide this directly to the model so we are splitting the transcripts into small chunks
 
42
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
43
  docs = text_splitter.split_documents(transcripts)
44
  print(docs)