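"""Multi-modal RAG over a video: frames and an audio transcript are extracted,
embedded, and indexed in Milvus via LlamaIndex, then queried through a Gradio UI.

Assumes the MISTRAL_API_KEY and OPENAI_API_KEY environment variables are set
(used below for Mistral embeddings and Whisper transcription). The exact pip
packages required (e.g. llama-index-embeddings-mistralai,
llama-index-vector-stores-milvus, moviepy, opencv-python, gradio, openai)
depend on your environment.
"""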
import os
import base64
import json
import cv2
import moviepy.editor as mp
import gradio as gr
from pathlib import Path
from llama_index.core import Settings
from llama_index.core import StorageContext
from llama_index.core import SimpleDirectoryReader
from llama_index.core.indices.multi_modal.base import MultiModalVectorStoreIndex
from llama_index.embeddings.mistralai import MistralAIEmbedding
from llama_index.vector_stores.milvus import MilvusVectorStore
# Configure default embedding model
Settings.embed_model = MistralAIEmbedding(
    "mistral-embed",
    api_key=os.getenv('MISTRAL_API_KEY')
)
# Global variables for session state
index = None
metadata = None
# Functions for video and audio processing
def process_video(video_path, output_folder, output_audio_path):
    """Extract frames, audio, and a transcript from the video into output_folder."""
    Path(output_folder).mkdir(parents=True, exist_ok=True)
    video_to_images(video_path, output_folder)
    video_to_audio(video_path, output_audio_path)
    # Transcribe the extracted audio and keep the text next to the frames
    text_path = os.path.join(output_folder, "output_text.txt")
    with open(text_path, "w") as file:
        file.write(audio_to_text(output_audio_path))
    os.remove(output_audio_path)
    return f"Video path: {video_path}\nAudio path: {output_audio_path}\nText path: {text_path}"
def video_to_images(video_path, output_folder, frame_interval=30):
    """Save every frame_interval-th frame of the video as a JPEG in output_folder."""
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % frame_interval == 0:
            cv2.imwrite(f"{output_folder}/frame_{frame_count}.jpg", frame)
        frame_count += 1
    cap.release()
def audio_to_text(audio_path):
    """Transcribe the audio file with OpenAI's Whisper API."""
    from openai import OpenAI
    client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
    with open(audio_path, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file
        )
    return transcript.text
def video_to_audio(video_path, output_path):
    """Extract the video's audio track to a standalone audio file."""
    video = mp.VideoFileClip(video_path)
    video.audio.write_audiofile(output_path)
def create_index(output_folder):
    """Build a multi-modal vector index over the extracted frames and transcript."""
    # Text embeddings (Mistral) are 1024-dimensional; image embeddings use
    # LlamaIndex's default CLIP model, which produces 512-dimensional vectors.
    text_store = MilvusVectorStore(
        uri="milvus_local.db",
        collection_name="text_collection",
        overwrite=True,
        dim=1024
    )
    image_store = MilvusVectorStore(
        uri="milvus_local.db",
        collection_name="image_collection",
        overwrite=True,
        dim=512
    )
    storage_context = StorageContext.from_defaults(
        vector_store=text_store,
        image_store=image_store
    )
    documents = SimpleDirectoryReader(output_folder).load_data()
    return MultiModalVectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context
    )
# Gradio callbacks
def process_video_callback(video_file):
    global index, metadata
    output_folder = "output"
    output_audio_path = "output/audio.wav"
    video_path = video_file.name
    # Extract frames/audio/transcript, then build the multi-modal index over them
    metadata = process_video(video_path, output_folder, output_audio_path)
    index = create_index(output_folder)
    return "Video processed successfully!"
def query_video_callback(query):
    global index, metadata
    if not index:
        return "No video index found. Please upload and process a video first."
    # Retrieve relevant text and image nodes from the index
    retrieval_result = index.as_retriever().retrieve(query)
    text_contexts = []
    image_documents = []
    for node in retrieval_result:
        if hasattr(node.node, 'image'):
            image_documents.append(node.node)
        else:
            text_contexts.append(node.node.text)
    # Combine text contexts and processing metadata into the response
    context_str = "\n".join(text_contexts)
    metadata_str = json.dumps(metadata, indent=2)
    if image_documents:
        response = f"Text Context: {context_str}\nMetadata: {metadata_str}\nImage Documents Found: {len(image_documents)}"
    else:
        response = f"Text Context: {context_str}\nMetadata: {metadata_str}\nNo relevant images found for this query."
    return response
# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("## Multi-Modal RAG with Gradio")
    video_input = gr.File(label="Upload a Video", file_types=[".mp4", ".avi"])
    process_button = gr.Button("Process Video")
    query_input = gr.Textbox(label="Ask a Question About the Video")
    query_button = gr.Button("Submit Query")
    output_text = gr.Textbox(label="Response")
    process_button.click(process_video_callback, inputs=video_input, outputs=output_text)
    query_button.click(query_video_callback, inputs=query_input, outputs=output_text)

demo.launch(debug=True)