Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -6,24 +6,16 @@ import streamlit as st
|
|
6 |
import tempfile
|
7 |
import time
|
8 |
import re
|
|
|
9 |
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
|
10 |
from langchain_groq import ChatGroq
|
11 |
from langchain.schema import HumanMessage
|
12 |
-
from google.oauth2.credentials import Credentials
|
13 |
-
from google_auth_oauthlib.flow import InstalledAppFlow
|
14 |
-
from google.auth.transport.requests import Request
|
15 |
-
from googleapiclient.discovery import build
|
16 |
-
from googleapiclient.http import MediaFileUpload
|
17 |
-
import pickle
|
18 |
|
19 |
# Constants
|
20 |
MAX_CHUNK_SIZE = 3000 # Characters per chunk
|
21 |
SEGMENT_DURATION = 120 # Seconds for each segment
|
22 |
MODEL_NAME = "deepseek-r1-distill-llama-70b"
|
23 |
|
24 |
-
# Google Drive API setup
|
25 |
-
SCOPES = ['https://www.googleapis.com/auth/drive.file']
|
26 |
-
|
27 |
st.set_page_config(page_title="YouTube Video Processor", layout="wide")
|
28 |
st.title("YouTube Video Processor")
|
29 |
|
@@ -36,75 +28,36 @@ with st.sidebar:
|
|
36 |
["ar", "en", "fr", "es", "de", "it", "ja", "ko", "pt", "ru", "zh"],
|
37 |
index=0)
|
38 |
|
39 |
-
st.
|
40 |
-
st.info("You'll need to authorize this app to upload files to your Google Drive")
|
41 |
-
|
42 |
-
upload_to_drive = st.checkbox("Upload to Google Drive", value=True)
|
43 |
-
|
44 |
-
if upload_to_drive:
|
45 |
-
uploaded_credentials = st.file_uploader("Upload your client_secret.json", type="json",
|
46 |
-
help="Download from Google Cloud Console")
|
47 |
|
48 |
-
# Function to
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
with open(token_path, 'rb') as token:
|
57 |
-
creds = pickle.load(token)
|
58 |
-
|
59 |
-
# If there are no (valid) credentials, let the user log in
|
60 |
-
if not creds or not creds.valid:
|
61 |
-
if creds and creds.expired and creds.refresh_token:
|
62 |
-
creds.refresh(Request())
|
63 |
-
else:
|
64 |
-
if credentials_file:
|
65 |
-
# Save credentials file to temporary location
|
66 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix='.json') as tmp_file:
|
67 |
-
tmp_file.write(credentials_file.getvalue())
|
68 |
-
credentials_path = tmp_file.name
|
69 |
-
|
70 |
-
flow = InstalledAppFlow.from_client_secrets_file(credentials_path, SCOPES)
|
71 |
-
creds = flow.run_local_server(port=0)
|
72 |
-
os.unlink(credentials_path) # Delete the temp file
|
73 |
-
else:
|
74 |
-
return None
|
75 |
-
|
76 |
-
# Save the credentials for the next run
|
77 |
-
with open(token_path, 'wb') as token:
|
78 |
-
pickle.dump(creds, token)
|
79 |
-
|
80 |
-
return creds
|
81 |
-
|
82 |
-
# Create Drive service
|
83 |
-
def create_drive_service(creds):
|
84 |
-
return build('drive', 'v3', credentials=creds)
|
85 |
-
|
86 |
-
# Function to upload file to Google Drive
|
87 |
-
def upload_to_google_drive(drive_service, file_path, folder_id=None):
|
88 |
-
file_name = os.path.basename(file_path)
|
89 |
-
file_metadata = {'name': file_name}
|
90 |
-
|
91 |
-
if folder_id:
|
92 |
-
file_metadata['parents'] = [folder_id]
|
93 |
-
|
94 |
-
media = MediaFileUpload(file_path, resumable=True)
|
95 |
-
file = drive_service.files().create(body=file_metadata, media_body=media, fields='id').execute()
|
96 |
-
|
97 |
-
return file.get('id')
|
98 |
|
99 |
-
# Function to create
|
100 |
-
def
|
101 |
-
|
102 |
-
|
103 |
-
'mimeType': 'application/vnd.google-apps.folder'
|
104 |
-
}
|
105 |
|
106 |
-
|
107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
|
109 |
# Video processing functions from your original script
|
110 |
def get_video_transcript(video_id, language):
|
@@ -193,12 +146,12 @@ def extract_questions_from_transcript(transcript, api_key, progress_bar=None):
|
|
193 |
except Exception as e:
|
194 |
st.warning(f"Error extracting questions for chunk {i+1}: {e}")
|
195 |
|
196 |
-
# Remove duplicates and limit to
|
197 |
unique_questions = []
|
198 |
for q in all_questions:
|
199 |
if q not in unique_questions and q:
|
200 |
unique_questions.append(q)
|
201 |
-
if len(unique_questions) >=
|
202 |
break
|
203 |
|
204 |
return unique_questions
|
@@ -364,7 +317,7 @@ if st.button("Process Video"):
|
|
364 |
os.makedirs(final_dir, exist_ok=True)
|
365 |
|
366 |
# Process tab layout
|
367 |
-
tab1, tab2, tab3, tab4, tab5 = st.tabs(["Transcript", "Questions", "Video", "Segments", "
|
368 |
|
369 |
# Tab 1: Extract transcript
|
370 |
with tab1:
|
@@ -423,7 +376,7 @@ if st.button("Process Video"):
|
|
423 |
|
424 |
# Create segments
|
425 |
st.subheader("Creating Segments")
|
426 |
-
segments = create_segment_timestamps(transcript, num_segments=min(
|
427 |
st.write(f"Created {len(segments)} segments")
|
428 |
else:
|
429 |
download_progress.progress(1.0, text="Failed to download video")
|
@@ -457,51 +410,32 @@ if st.button("Process Video"):
|
|
457 |
segment_progress.progress(1.0, text="Failed to create segments")
|
458 |
st.stop()
|
459 |
|
460 |
-
# Tab 5:
|
461 |
with tab5:
|
462 |
-
st.header("
|
463 |
|
464 |
-
if
|
465 |
-
|
466 |
|
467 |
-
#
|
468 |
-
|
|
|
|
|
|
|
|
|
|
|
469 |
|
470 |
-
|
471 |
-
|
472 |
-
|
473 |
-
|
474 |
-
|
475 |
-
|
476 |
-
|
477 |
-
st.write(f"Created Google Drive folder: {folder_name}")
|
478 |
-
|
479 |
-
# Upload files
|
480 |
-
uploaded_files = []
|
481 |
-
for i, file_path in enumerate(final_files):
|
482 |
-
upload_progress.progress((i + 1) / len(final_files),
|
483 |
-
text=f"Uploading file {i+1}/{len(final_files)}")
|
484 |
-
|
485 |
-
file_id = upload_to_google_drive(drive_service, file_path, folder_id)
|
486 |
-
if file_id:
|
487 |
-
uploaded_files.append((os.path.basename(file_path), file_id))
|
488 |
-
|
489 |
-
upload_progress.progress(1.0, text=f"Uploaded {len(uploaded_files)} files to Google Drive")
|
490 |
-
|
491 |
-
# Display uploaded files
|
492 |
-
st.subheader("Uploaded Files")
|
493 |
-
for filename, file_id in uploaded_files:
|
494 |
-
st.write(f"- {filename}")
|
495 |
-
|
496 |
-
st.success(f"All {len(uploaded_files)} files have been uploaded to Google Drive in folder '{folder_name}'")
|
497 |
-
else:
|
498 |
-
upload_progress.progress(1.0, text="Failed to authenticate with Google Drive")
|
499 |
-
st.error("Google Drive authentication failed. Please check your credentials file.")
|
500 |
else:
|
501 |
-
st.
|
502 |
|
503 |
-
# Cleanup
|
504 |
-
st.
|
505 |
|
506 |
# Additional info
|
507 |
with st.expander("About this app"):
|
@@ -511,27 +445,11 @@ with st.expander("About this app"):
|
|
511 |
2. Using LLM to identify key questions answered in the video
|
512 |
3. Splitting the video into segments
|
513 |
4. Naming each segment based on the questions
|
514 |
-
5.
|
515 |
|
516 |
To use the app, you need:
|
517 |
- A GROQ API key
|
518 |
- A YouTube video ID
|
519 |
-
- Google Drive credentials (if uploading to Drive)
|
520 |
|
521 |
The app works best with videos that have transcripts available.
|
522 |
-
""")
|
523 |
-
|
524 |
-
# Instructions for Google Drive setup
|
525 |
-
with st.expander("How to set up Google Drive Integration"):
|
526 |
-
st.write("""
|
527 |
-
### Setting up Google Drive API Access
|
528 |
-
|
529 |
-
1. Go to the [Google Cloud Console](https://console.cloud.google.com/)
|
530 |
-
2. Create a new project or select an existing one
|
531 |
-
3. Enable the Google Drive API
|
532 |
-
4. Create OAuth 2.0 credentials (Desktop app type)
|
533 |
-
5. Download the credentials JSON file
|
534 |
-
6. Upload the file to this app when prompted
|
535 |
-
|
536 |
-
After authentication, the app will be able to upload files to your Google Drive.
|
537 |
""")
|
|
|
6 |
import tempfile
|
7 |
import time
|
8 |
import re
|
9 |
+
import base64
|
10 |
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
|
11 |
from langchain_groq import ChatGroq
|
12 |
from langchain.schema import HumanMessage
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
# Constants
|
15 |
MAX_CHUNK_SIZE = 3000 # Characters per chunk
|
16 |
SEGMENT_DURATION = 120 # Seconds for each segment
|
17 |
MODEL_NAME = "deepseek-r1-distill-llama-70b"
|
18 |
|
|
|
|
|
|
|
19 |
st.set_page_config(page_title="YouTube Video Processor", layout="wide")
|
20 |
st.title("YouTube Video Processor")
|
21 |
|
|
|
28 |
["ar", "en", "fr", "es", "de", "it", "ja", "ko", "pt", "ru", "zh"],
|
29 |
index=0)
|
30 |
|
31 |
+
max_segments = st.slider("Maximum number of segments", 5, 30, 15)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
+
# Function to create a download link for a file
|
34 |
+
def get_download_link(file_path, link_text):
    """Build an HTML anchor that downloads ``file_path`` via a base64 data URI.

    The whole file is read into memory and embedded in the link, so the
    returned string is roughly 4/3 the size of the file on disk.

    Args:
        file_path: Path of the file to embed; opened in binary mode.
        link_text: Visible text of the link. It is HTML-escaped before being
            inserted, so it is treated as plain text, not markup.

    Returns:
        An ``<a>`` element (as a string) whose ``href`` is a
        ``data:video/mp4;base64,...`` URI and whose ``download`` attribute is
        the escaped base name of ``file_path``.

    Raises:
        OSError: If ``file_path`` cannot be opened or read.
    """
    import html  # local import: only this helper needs it

    with open(file_path, 'rb') as f:
        b64 = base64.b64encode(f.read()).decode()

    # The result is rendered with unsafe_allow_html, so a quote or angle
    # bracket in the file name or label would otherwise break out of the
    # attribute / inject markup.
    filename = html.escape(os.path.basename(file_path), quote=True)
    label = html.escape(link_text)

    # The MIME type is fixed to video/mp4, as in the original helper.
    return f'<a href="data:video/mp4;base64,{b64}" download="{filename}">{label}</a>'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
+
# Function to create a download link for all files as ZIP
|
43 |
+
def get_zip_download_link(files, output_name):
    """Build an HTML anchor that downloads all ``files`` as one ZIP archive.

    The archive is assembled in memory with the standard-library ``zipfile``
    module, so no external ``zip`` executable is required and no archive file
    is left behind in the working directory.

    Args:
        files: Paths of the files to bundle. Each file is stored under its
            base name only (directory components are dropped).
        output_name: Archive name without the ``.zip`` extension; used for
            the link's ``download`` attribute.

    Returns:
        An ``<a>`` element (as a string) whose ``href`` is a
        ``data:application/zip;base64,...`` URI, or ``""`` when ``files`` is
        empty or the archive could not be created (the error is reported
        through ``st.error``).
    """
    import html
    import io
    import zipfile

    if not files:
        return ""

    try:
        buffer = io.BytesIO()
        with zipfile.ZipFile(buffer, 'w', zipfile.ZIP_DEFLATED) as archive:
            for path in files:
                # Flat layout: keep only the file name inside the archive.
                archive.write(path, arcname=os.path.basename(path))
        b64 = base64.b64encode(buffer.getvalue()).decode()
    except Exception as e:
        # UI boundary: report any failure to the user instead of crashing
        # the Streamlit script; callers treat "" as "no link available".
        st.error(f"Error creating ZIP file: {e}")
        return ""

    # output_name is derived from user input and the link is rendered with
    # unsafe_allow_html, so escape it before placing it in the attribute.
    download_name = html.escape(f"{output_name}.zip", quote=True)
    return f'<a href="data:application/zip;base64,{b64}" download="{download_name}">Download All Segments (ZIP)</a>'
|
61 |
|
62 |
# Video processing functions from your original script
|
63 |
def get_video_transcript(video_id, language):
|
|
|
146 |
except Exception as e:
|
147 |
st.warning(f"Error extracting questions for chunk {i+1}: {e}")
|
148 |
|
149 |
+
# Remove duplicates and limit to max_segments questions
|
150 |
unique_questions = []
|
151 |
for q in all_questions:
|
152 |
if q not in unique_questions and q:
|
153 |
unique_questions.append(q)
|
154 |
+
if len(unique_questions) >= max_segments:
|
155 |
break
|
156 |
|
157 |
return unique_questions
|
|
|
317 |
os.makedirs(final_dir, exist_ok=True)
|
318 |
|
319 |
# Process tab layout
|
320 |
+
tab1, tab2, tab3, tab4, tab5 = st.tabs(["Transcript", "Questions", "Video", "Segments", "Download"])
|
321 |
|
322 |
# Tab 1: Extract transcript
|
323 |
with tab1:
|
|
|
376 |
|
377 |
# Create segments
|
378 |
st.subheader("Creating Segments")
|
379 |
+
segments = create_segment_timestamps(transcript, num_segments=min(max_segments, len(questions)))
|
380 |
st.write(f"Created {len(segments)} segments")
|
381 |
else:
|
382 |
download_progress.progress(1.0, text="Failed to download video")
|
|
|
410 |
segment_progress.progress(1.0, text="Failed to create segments")
|
411 |
st.stop()
|
412 |
|
413 |
+
# Tab 5: Download segments
|
414 |
with tab5:
|
415 |
+
st.header("Download Video Segments")
|
416 |
|
417 |
+
if final_files:
|
418 |
+
st.subheader("Individual Downloads")
|
419 |
|
420 |
+
# Create 3 columns for better layout
|
421 |
+
cols = st.columns(3)
|
422 |
+
for i, file_path in enumerate(final_files):
|
423 |
+
col_idx = i % 3
|
424 |
+
filename = os.path.basename(file_path)
|
425 |
+
download_link = get_download_link(file_path, f"Download: {filename}")
|
426 |
+
cols[col_idx].markdown(download_link, unsafe_allow_html=True)
|
427 |
|
428 |
+
st.subheader("Download All Segments")
|
429 |
+
zip_name = f"video_segments_{video_id}"
|
430 |
+
zip_link = get_zip_download_link(final_files, zip_name)
|
431 |
+
if zip_link:
|
432 |
+
st.markdown(zip_link, unsafe_allow_html=True)
|
433 |
+
st.info("The zip file contains all processed video segments with descriptive filenames.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
434 |
else:
|
435 |
+
st.warning("No segments available for download.")
|
436 |
|
437 |
+
# Completion message
|
438 |
+
st.success("Processing complete! You can download the video segments from the Download tab.")
|
439 |
|
440 |
# Additional info
|
441 |
with st.expander("About this app"):
|
|
|
445 |
2. Using LLM to identify key questions answered in the video
|
446 |
3. Splitting the video into segments
|
447 |
4. Naming each segment based on the questions
|
448 |
+
5. Providing download links for all segments
|
449 |
|
450 |
To use the app, you need:
|
451 |
- A GROQ API key
|
452 |
- A YouTube video ID
|
|
|
453 |
|
454 |
The app works best with videos that have transcripts available.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
455 |
""")
|