kvan1 committed
Commit 180c914 · verified · 1 Parent(s): 5f4aa24

Create app.py

Files changed (1)
  1. app.py +141 -0
app.py ADDED
@@ -0,0 +1,141 @@
+ # from google.colab import userdata
+ import os
+ import base64
+ import json
+ import cv2
+ import moviepy.editor as mp
+ import gradio as gr
+ from pathlib import Path
+ from llama_index.core import Settings
+ from llama_index.core import StorageContext
+ from llama_index.core import SimpleDirectoryReader
+ from llama_index.core.indices.multi_modal.base import MultiModalVectorStoreIndex
+ from llama_index.embeddings.mistralai import MistralAIEmbedding
+ from llama_index.vector_stores.milvus import MilvusVectorStore
+
+
+ # Configure default embedding model
+ Settings.embed_model = MistralAIEmbedding(
+     "mistral-embed",
+     api_key=os.getenv('MISTRAL_API_KEY')
+ )
+
+ # Global variables for session state
+ index = None
+ metadata = None
+
+ # Functions for video and audio processing
+ def process_video(video_path, output_folder, output_audio_path):
+     """Extract frames, audio and a transcript from the video and return its metadata."""
+     Path(output_folder).mkdir(parents=True, exist_ok=True)
+     video_to_images(video_path, output_folder)
+     video_to_audio(video_path, output_audio_path)
+     # Transcribe once and reuse the result (the audio file is deleted below).
+     text = audio_to_text(output_audio_path)
+     with open(os.path.join(output_folder, "output_text.txt"), "w") as file:
+         file.write(text)
+     os.remove(output_audio_path)
+     return {"Video path": video_path, "Audio path": output_audio_path, "Text": text}
+
+ def video_to_images(video_path, output_folder, frame_interval=30):
+     """Save every `frame_interval`-th frame of the video as a JPEG."""
+     cap = cv2.VideoCapture(video_path)
+     frame_count = 0
+     while cap.isOpened():
+         ret, frame = cap.read()
+         if not ret:
+             break
+         if frame_count % frame_interval == 0:
+             cv2.imwrite(f"{output_folder}/frame_{frame_count}.jpg", frame)
+         frame_count += 1
+     cap.release()
+
+ def audio_to_text(audio_path):
+     """Transcribe the extracted audio with OpenAI Whisper."""
+     from openai import OpenAI
+     client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
+     with open(audio_path, "rb") as audio_file:
+         transcript = client.audio.transcriptions.create(
+             model="whisper-1",
+             file=audio_file
+         )
+     return transcript.text
+
+ def video_to_audio(video_path, output_path):
+     """Write the video's audio track to a standalone file."""
+     video = mp.VideoFileClip(video_path)
+     video.audio.write_audiofile(output_path)
+
+ def create_index(output_folder):
+     """Build a multi-modal index over the extracted frames and transcript."""
+     # Text embeddings come from mistral-embed (1024 dims); images use the
+     # default CLIP image embedding (512 dims).
+     text_store = MilvusVectorStore(
+         uri="milvus_local.db",
+         collection_name="text_collection",
+         overwrite=True,
+         dim=1024
+     )
+     image_store = MilvusVectorStore(
+         uri="milvus_local.db",
+         collection_name="image_collection",
+         overwrite=True,
+         dim=512
+     )
+     storage_context = StorageContext.from_defaults(
+         vector_store=text_store,
+         image_store=image_store
+     )
+     documents = SimpleDirectoryReader(output_folder).load_data()
+     return MultiModalVectorStoreIndex.from_documents(
+         documents,
+         storage_context=storage_context
+     )
+
+ # Gradio callbacks
+ def process_video_callback(video_file):
+     global index, metadata
+     output_folder = "output"
+     output_audio_path = "output/audio.wav"
+     video_path = video_file.name
+
+     # Process video and create index
+     metadata = process_video(video_path, output_folder, output_audio_path)
+     index = create_index(output_folder)
+     return "Video processed successfully!"
+
+ def query_video_callback(query):
+     global index, metadata
+     if not index:
+         return "No video index found. Please upload and process a video first."
+
+     # Retrieve relevant context from the index
+     retrieval_result = index.as_retriever().retrieve(query)
+     text_contexts = []
+     image_documents = []
+     for node in retrieval_result:
+         if hasattr(node.node, 'image'):
+             image_documents.append(node.node)
+         else:
+             text_contexts.append(node.node.text)
+
+     # Combine text contexts
+     context_str = "\n".join(text_contexts)
+     metadata_str = json.dumps(metadata, indent=2)
+
+     # Generate response
+     if image_documents:
+         response = f"Text Context: {context_str}\nMetadata: {metadata_str}\nImage Documents Found: {len(image_documents)}"
+     else:
+         response = "No relevant images found to answer the query."
+
+     return response
+
+ # Gradio Interface
+ with gr.Blocks() as demo:
+     gr.Markdown("## Multi-Modal RAG with Gradio")
+     video_input = gr.File(label="Upload a Video", file_types=[".mp4", ".avi"])
+     process_button = gr.Button("Process Video")
+     query_input = gr.Textbox(label="Ask a Question About the Video")
+     query_button = gr.Button("Submit Query")
+     output_text = gr.Textbox(label="Response")
+
+     process_button.click(process_video_callback, inputs=video_input, outputs=output_text)
+     query_button.click(query_video_callback, inputs=query_input, outputs=output_text)
+
+ demo.launch(debug=True)
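
For local testing, the two API keys that app.py reads with os.getenv must be set before launch. Below is a minimal run sketch, not part of the commit: it assumes the file is saved as app.py and that the dependencies inferred from the imports (gradio, opencv-python, moviepy, openai, llama-index plus the MistralAI embedding and Milvus vector-store integrations) are installed; the key values are hypothetical placeholders.

# Local-run sketch (assumed workflow, not part of this commit).
import os
import subprocess

# app.py reads both keys via os.getenv, so set them before launching.
env = dict(os.environ)
env["MISTRAL_API_KEY"] = "<your-mistral-key>"  # hypothetical placeholder
env["OPENAI_API_KEY"] = "<your-openai-key>"    # hypothetical placeholder

# demo.launch(debug=True) at the bottom of app.py starts the Gradio server.
subprocess.run(["python", "app.py"], check=True, env=env)

On a Hugging Face Space, the same two keys would instead be stored as Space secrets, which are exposed to the app as environment variables.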