kvan1 committed
Commit 180c914 · verified · 1 Parent(s): 5f4aa24

Create app.py

Files changed (1)
  1. app.py +141 -0
app.py ADDED
@@ -0,0 +1,141 @@
+ # from google.colab import userdata
+ import os
+ import base64
+ import json
+ import cv2
+ import moviepy.editor as mp
+ import gradio as gr
+ from pathlib import Path
+ from llama_index.core import Settings
+ from llama_index.core import StorageContext
+ from llama_index.core import SimpleDirectoryReader
+ from llama_index.core.indices.multi_modal.base import MultiModalVectorStoreIndex
+ from llama_index.embeddings.mistralai import MistralAIEmbedding
+ from llama_index.vector_stores.milvus import MilvusVectorStore
+
+
+ # Configure default embedding model
+ Settings.embed_model = MistralAIEmbedding(
+     "mistral-embed",
+     api_key=os.getenv('MISTRAL_API_KEY')
+ )
+
+ # Global variables for session state
+ index = None
+ metadata = None
+
+ # Functions for video and audio processing
+ def process_video(video_path, output_folder, output_audio_path):
+     """Extract frames, audio and a transcript from the video and return its metadata."""
+     Path(output_folder).mkdir(parents=True, exist_ok=True)
+     video_to_images(video_path, output_folder)
+     video_to_audio(video_path, output_audio_path)
+     # Transcribe once and reuse the result (the audio file is deleted below).
+     text = audio_to_text(output_audio_path)
+     with open(os.path.join(output_folder, "output_text.txt"), "w") as file:
+         file.write(text)
+     os.remove(output_audio_path)
+     return {"Video path": video_path, "Audio path": output_audio_path, "Text": text}
+
+ def video_to_images(video_path, output_folder, frame_interval=30):
+     """Save every `frame_interval`-th frame of the video as a JPEG."""
+     cap = cv2.VideoCapture(video_path)
+     frame_count = 0
+     while cap.isOpened():
+         ret, frame = cap.read()
+         if not ret:
+             break
+         if frame_count % frame_interval == 0:
+             cv2.imwrite(f"{output_folder}/frame_{frame_count}.jpg", frame)
+         frame_count += 1
+     cap.release()
+
+ def audio_to_text(audio_path):
+     """Transcribe the extracted audio with OpenAI Whisper."""
+     from openai import OpenAI
+     client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
+     with open(audio_path, "rb") as audio_file:
+         transcript = client.audio.transcriptions.create(
+             model="whisper-1",
+             file=audio_file
+         )
+     return transcript.text
+
+ def video_to_audio(video_path, output_path):
+     """Write the video's audio track to a standalone file."""
+     video = mp.VideoFileClip(video_path)
+     video.audio.write_audiofile(output_path)
+
+ def create_index(output_folder):
+     """Build a multi-modal index over the extracted frames and transcript."""
+     # Text embeddings come from mistral-embed (1024 dims); images use the
+     # default CLIP image embedding (512 dims).
+     text_store = MilvusVectorStore(
+         uri="milvus_local.db",
+         collection_name="text_collection",
+         overwrite=True,
+         dim=1024
+     )
+     image_store = MilvusVectorStore(
+         uri="milvus_local.db",
+         collection_name="image_collection",
+         overwrite=True,
+         dim=512
+     )
+     storage_context = StorageContext.from_defaults(
+         vector_store=text_store,
+         image_store=image_store
+     )
+     documents = SimpleDirectoryReader(output_folder).load_data()
+     return MultiModalVectorStoreIndex.from_documents(
+         documents,
+         storage_context=storage_context
+     )
+
+ # Gradio callbacks
+ def process_video_callback(video_file):
+     global index, metadata
+     output_folder = "output"
+     output_audio_path = "output/audio.wav"
+     video_path = video_file.name
+
+     # Process video and create index
+     metadata = process_video(video_path, output_folder, output_audio_path)
+     index = create_index(output_folder)
+     return "Video processed successfully!"
+
+ def query_video_callback(query):
+     global index, metadata
+     if not index:
+         return "No video index found. Please upload and process a video first."
+
+     # Retrieve relevant context from the index
+     retrieval_result = index.as_retriever().retrieve(query)
+     text_contexts = []
+     image_documents = []
+     for node in retrieval_result:
+         if hasattr(node.node, 'image'):
+             image_documents.append(node.node)
+         else:
+             text_contexts.append(node.node.text)
+
+     # Combine text contexts
+     context_str = "\n".join(text_contexts)
+     metadata_str = json.dumps(metadata, indent=2)
+
+     # Generate response
+     if image_documents:
+         response = f"Text Context: {context_str}\nMetadata: {metadata_str}\nImage Documents Found: {len(image_documents)}"
+     else:
+         response = "No relevant images found to answer the query."
+
+     return response
+
+ # Gradio Interface
+ with gr.Blocks() as demo:
+     gr.Markdown("## Multi-Modal RAG with Gradio")
+     video_input = gr.File(label="Upload a Video", file_types=[".mp4", ".avi"])
+     process_button = gr.Button("Process Video")
+     query_input = gr.Textbox(label="Ask a Question About the Video")
+     query_button = gr.Button("Submit Query")
+     output_text = gr.Textbox(label="Response")
+
+     process_button.click(process_video_callback, inputs=video_input, outputs=output_text)
+     query_button.click(query_video_callback, inputs=query_input, outputs=output_text)
+
+ demo.launch(debug=True)
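
For local testing, the two API keys that app.py reads with os.getenv must be set before launch. Below is a minimal run sketch, not part of the commit: it assumes the file is saved as app.py and that the dependencies inferred from the imports (gradio, opencv-python, moviepy, openai, llama-index plus the MistralAI embedding and Milvus vector-store integrations) are installed; the key values are hypothetical placeholders.

# Local-run sketch (assumed workflow, not part of this commit).
import os
import subprocess

# app.py reads both keys via os.getenv, so set them before launching.
env = dict(os.environ)
env["MISTRAL_API_KEY"] = "<your-mistral-key>"  # hypothetical placeholder
env["OPENAI_API_KEY"] = "<your-openai-key>"    # hypothetical placeholder

# demo.launch(debug=True) at the bottom of app.py starts the Gradio server.
subprocess.run(["python", "app.py"], check=True, env=env)

On a Hugging Face Space, the same two keys would instead be stored as Space secrets, which are exposed to the app as environment variables.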