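# Multi-modal RAG over a video, served with Gradio: the app extracts frames and an
# audio transcript from an uploaded video, indexes them with LlamaIndex + Milvus,
# and answers queries against the retrieved text and image context.
#
# Assumes MISTRAL_API_KEY and OPENAI_API_KEY are set in the environment
# (both are read below via os.getenv).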
import os
import json
import cv2
import moviepy.editor as mp
import gradio as gr
from pathlib import Path
from llama_index.core import Settings, StorageContext, SimpleDirectoryReader
from llama_index.core.indices.multi_modal.base import MultiModalVectorStoreIndex
from llama_index.embeddings.mistralai import MistralAIEmbedding
from llama_index.vector_stores.milvus import MilvusVectorStore


# Configure the default text embedding model (mistral-embed, 1024-dimensional)
Settings.embed_model = MistralAIEmbedding(
    "mistral-embed", 
    api_key=os.getenv('MISTRAL_API_KEY')
)

# Global variables for session state
index = None
metadata = None

# Functions for video and audio processing
def process_video(video_path, output_folder, output_audio_path):
    """Extract frames and audio from the video, transcribe the audio, and save everything to output_folder."""
    Path(output_folder).mkdir(parents=True, exist_ok=True)
    video_to_images(video_path, output_folder)
    video_to_audio(video_path, output_audio_path)
    text_path = os.path.join(output_folder, "output_text.txt")
    with open(text_path, "w") as file:
        file.write(audio_to_text(output_audio_path))
    os.remove(output_audio_path)
    return f"Video path: {video_path}\nAudio path: {output_audio_path}\nTranscript path: {text_path}"

def video_to_images(video_path, output_folder, frame_interval=30):
    """Save every frame_interval-th frame of the video as a JPEG in output_folder."""
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % frame_interval == 0:
            cv2.imwrite(f"{output_folder}/frame_{frame_count}.jpg", frame)
        frame_count += 1
    cap.release()

def audio_to_text(audio_path):
    """Transcribe the audio file with OpenAI Whisper (whisper-1) and return the transcript text."""
    from openai import OpenAI  # imported lazily; only needed for transcription
    client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
    with open(audio_path, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file
        )
    return transcript.text

def video_to_audio(video_path, output_path):
    """Extract the audio track from the video and write it to output_path."""
    video = mp.VideoFileClip(video_path)
    video.audio.write_audiofile(output_path)
    video.close()

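# Build a multi-modal index over the extracted frames and transcript, backed by two
# local Milvus collections. dim=1024 matches mistral-embed text embeddings; dim=512
# is assumed to match the default CLIP image embedding that MultiModalVectorStoreIndex
# uses when no image embedding model is configured.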
def create_index(output_folder):
    text_store = MilvusVectorStore(
        uri="milvus_local.db",
        collection_name="text_collection",
        overwrite=True,
        dim=1024
    )
    image_store = MilvusVectorStore(
        uri="milvus_local.db",
        collection_name="image_collection",
        overwrite=True,
        dim=512
    )
    storage_context = StorageContext.from_defaults(
        vector_store=text_store,
        image_store=image_store
    )
    documents = SimpleDirectoryReader(output_folder).load_data()
    return MultiModalVectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context
    )

# Gradio callbacks
def process_video_callback(video_file):
    """Gradio callback: extract frames, audio, and transcript from the uploaded video, then build the index."""
    global index, metadata
    output_folder = "output"
    output_audio_path = "output/audio.wav"
    video_path = video_file.name

    # Process the video and build the multi-modal index over its frames and transcript
    metadata = process_video(video_path, output_folder, output_audio_path)
    index = create_index(output_folder)
    return "Video processed successfully!"

def query_video_callback(query):
    """Gradio callback: retrieve transcript text and frames relevant to the query and report them."""
    global index, metadata
    if not index:
        return "No video index found. Please upload and process a video first."

    # Retrieve relevant nodes from the index and split them into text and image results
    retrieval_result = index.as_retriever().retrieve(query)
    text_contexts = []
    image_documents = []
    for node in retrieval_result:
        if hasattr(node.node, 'image'):
            image_documents.append(node.node)
        else:
            text_contexts.append(node.node.text)

    # Combine text contexts
    context_str = "\n".join(text_contexts)
    metadata_str = json.dumps(metadata, indent=2)

    # Format a response from the retrieved context (this demo does not call an LLM)
    if image_documents:
        response = f"Text Context: {context_str}\nMetadata: {metadata_str}\nImage Documents Found: {len(image_documents)}"
    else:
        response = "No relevant images found to answer the query."

    return response

# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("## Multi-Modal RAG with Gradio")
    video_input = gr.File(label="Upload a Video", file_types=[".mp4", ".avi"])
    process_button = gr.Button("Process Video")
    query_input = gr.Textbox(label="Ask a Question About the Video")
    query_button = gr.Button("Submit Query")
    output_text = gr.Textbox(label="Response")

    process_button.click(process_video_callback, inputs=video_input, outputs=output_text)
    query_button.click(query_video_callback, inputs=query_input, outputs=output_text)

demo.launch(debug=True)