Spaces:

Kathir0011
/

YouTube_Video_Assistant

Running

Kathir0011 committed on Jan 29

Commit

14ab511

verified ·

1 Parent(s): 68bbcdf

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -15,17 +15,12 @@ from langchain.prompts.chat import (
 def get_transcript(video_url):
     try:
-        # Extract video ID from the URL
         video_id = video_url.split("v=")[-1].split("&")[0]
-        # Fetch transcript
         transcript = YouTubeTranscriptApi.get_transcript(video_id)
-        # Optional: Format transcript to SRT format
-        formatter = SRTFormatter()
-        formatted_transcript = formatter.format_transcript(transcript)
-        return formatted_transcript  # Or return as plain text
     except Exception as e:
         return f"Error fetching transcript: {str(e)}"
@@ -38,9 +33,12 @@ def create_db_from_video_url(video_url, api_key):
     embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004", google_api_key=api_key)
     transcripts = get_transcript(video_url)
     print(transcripts)
     # cannot provide this directly to the model so we are splitting the transcripts into small chunks
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
     docs = text_splitter.split_documents(transcripts)
     print(docs)

 def get_transcript(video_url):
     try:
         video_id = video_url.split("v=")[-1].split("&")[0]
         transcript = YouTubeTranscriptApi.get_transcript(video_id)
+        text = "\n".join([t["text"] for t in transcript])
+        print("text1: ->>>>" + text)
+        return text  # Return transcript as string
     except Exception as e:
         return f"Error fetching transcript: {str(e)}"
     embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004", google_api_key=api_key)
     transcripts = get_transcript(video_url)
+    # Convert transcript string into a Document
+    doc = Document(page_content=transcripts)
     print(transcripts)
     # cannot provide this directly to the model so we are splitting the transcripts into small chunks
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
     docs = text_splitter.split_documents(transcripts)
     print(docs)