Update app.py
app.py
CHANGED
@@ -15,13 +15,18 @@ from rank_bm25 import BM25Okapi
 from gensim.models import Word2Vec
 from typing import List, Optional, Tuple
 import gradio as gr
-
+import moviepy.editor as mp
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
 
 api_key = os.getenv("MISTRAL_API_KEY")
 client = Mistral(api_key=api_key)
 
+from deepgram import Deepgram
+
+dg_api_key = os.getenv("DEEPGRAM_API_KEY")
+deepgram = Deepgram(dg_api_key)
+
 def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=10, max_delay=60):
     embeddings = []
     for text in text_list:
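Context only: `get_text_embedding_with_rate_limit` is unchanged by this commit and only its signature appears in these hunks. A minimal sketch of the exponential-backoff loop those parameters imply, assuming the mistralai v1 embeddings API (the real body in app.py may differ):

```python
import time
import logging

# Illustrative backoff loop only; the actual function body is not part of
# this diff. The embeddings call assumes the mistralai v1 client API.
def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=10, max_delay=60):
    embeddings = []
    for text in text_list:
        delay = initial_delay
        for attempt in range(max_retries):
            try:
                resp = client.embeddings.create(model="mistral-embed", inputs=[text])
                embeddings.append(resp.data[0].embedding)
                break
            except Exception as exc:  # e.g. HTTP 429 from rate limiting
                logging.warning(f"Embedding attempt {attempt + 1} failed: {exc}; retrying in {delay}s")
                time.sleep(delay)
                delay = min(delay * 2, max_delay)  # exponential backoff, capped at max_delay
    return embeddings
```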
@@ -48,30 +53,55 @@ def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=1
     return embeddings
 
 def store_embeddings_in_vector_db(
-    pdf_path: str,
+    file_path: str,
     vector_db_path: str,
     annoy_index_path: str,
     chunk_size: int = 2048,
     overlap: int = 200,
     num_trees: int = 10
 ):
-    doc = fitz.open(pdf_path)
-    all_embeddings = []
     all_texts = []
-    total_pages = doc.page_count
-    logging.info(f"Processing PDF: {pdf_path} with {total_pages} pages.")
-
-    for page_num in range(total_pages):
-        page = doc.load_page(page_num)
-        text = page.get_text()
-        if text.strip():
-            chunks = split_text_into_chunks(text, chunk_size, overlap)
-            embeddings = get_text_embedding_with_rate_limit(chunks)
-            all_embeddings.extend(embeddings)
-            all_texts.extend(chunks)
-            logging.info(f"Processed page {page_num + 1}/{total_pages}, extracted {len(chunks)} chunks.")
-        else:
-            logging.warning(f"No text found on page {page_num + 1}.")
+    if file_path.endswith(('.pdf', '.doc', '.docx', '.pptx', '.ppt', '.xls', '.xlsx', '.txt')):
+        doc = fitz.open(file_path)
+        all_embeddings = []
+        total_pages = doc.page_count
+        logging.info(f"Processing PDF/DOC: {file_path} with {total_pages} pages.")
+
+        for page_num in range(total_pages):
+            page = doc.load_page(page_num)
+            text = page.get_text()
+            if text.strip():
+                chunks = split_text_into_chunks(text, chunk_size, overlap)
+                embeddings = get_text_embedding_with_rate_limit(chunks)
+                all_embeddings.extend(embeddings)
+                all_texts.extend(chunks)
+                logging.info(f"Processed page {page_num + 1}/{total_pages}, extracted {len(chunks)} chunks.")
+            else:
+                logging.warning(f"No text found on page {page_num + 1}.")
+    elif file_path.endswith(('.mp3', '.wav', '.m4a')):
+        logging.info(f"Processing audio file: {file_path}")
+        with open(file_path, 'rb') as audio_file:
+            audio_content = audio_file.read()
+        response = asyncio.run(deepgram.transcription.prerecorded({'buffer': audio_content, 'mimetype': 'audio/wav'}, {'punctuate': True}))
+        text = response['results']['channels'][0]['alternatives'][0]['transcript']
+        chunks = split_text_into_chunks(text, chunk_size, overlap)
+        all_embeddings = get_text_embedding_with_rate_limit(chunks)
+        all_texts.extend(chunks)
+    elif file_path.endswith(('.mp4', '.avi', '.mov')):
+        logging.info(f"Processing video file: {file_path}")
+        video = mp.VideoFileClip(file_path)
+        audio_path = "temp_audio.wav"
+        video.audio.write_audiofile(audio_path)
+        with open(audio_path, 'rb') as audio_file:
+            audio_content = audio_file.read()
+        response = asyncio.run(deepgram.transcription.prerecorded({'buffer': audio_content, 'mimetype': 'audio/wav'}, {'punctuate': True}))
+        text = response['results']['channels'][0]['alternatives'][0]['transcript']
+        os.remove(audio_path)
+        chunks = split_text_into_chunks(text, chunk_size, overlap)
+        all_embeddings = get_text_embedding_with_rate_limit(chunks)
+        all_texts.extend(chunks)
+    else:
+        raise ValueError("Unsupported file format. Please upload a PDF, DOC, DOCX, MP3, WAV, M4A, MP4, AVI, or MOV file.")
 
     embeddings_np = np.array(all_embeddings).astype('float32')
     with open(vector_db_path, "wb") as f:
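Both media branches in the hunk above read bytes, call Deepgram, and drill into the same response path; the mimetype is also hard-coded to 'audio/wav' even for .mp3/.m4a input. A sketch of factoring that into one helper, reusing only the calls already present in the diff:

```python
import asyncio

# Sketch of a shared transcription helper for the audio and video branches.
# `deepgram` is the module-level Deepgram v2 client created above. The
# default mimetype mirrors the diff, which hard-codes 'audio/wav'.
def transcribe_audio_bytes(audio_content: bytes, mimetype: str = 'audio/wav') -> str:
    source = {'buffer': audio_content, 'mimetype': mimetype}
    # asyncio.run works from plain synchronous code, but raises RuntimeError
    # if called from inside an already-running event loop.
    response = asyncio.run(deepgram.transcription.prerecorded(source, {'punctuate': True}))
    return response['results']['channels'][0]['alternatives'][0]['transcript']
```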
@@ -327,7 +357,7 @@ def chatbot_interface(file, user_query, response_style):
         selected_reranking_methods=selected_reranking_methods_list
     ))
 
-    formatted_response = f"Response:\n\n{response}\n\n"
+    formatted_response = f"# **ChanceRAG Response:**\n\n{response}\n\n"
     formatted_response += "Retrieved and Reranked Documents:\n"
     for idx, doc_info in enumerate(source_info, start=1):
         formatted_response += f"\nDocument {idx}:\n"
@@ -335,7 +365,6 @@ def chatbot_interface(file, user_query, response_style):
         formatted_response += f"Retrieval Method: {doc_info['method']}\n"
         if 'score' in doc_info:
             formatted_response += f"Precision Score: {doc_info['score']:.4f}\n"
-
     return formatted_response
 
 iface = gr.Blocks(theme="Rabbitt-AI/ChanceRAG")
@@ -351,7 +380,7 @@ with iface:
             "Detailed", "Concise", "Creative", "Technical"], label="Response Style"
         ),
     ],
-    outputs= gr.
+    outputs= gr.Markdown(value="# **ChanceRAG Response**"),
 )
 
 iface.launch(share=True)
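All three ingestion branches funnel extracted text through `split_text_into_chunks(text, chunk_size, overlap)`, which this commit does not touch. A minimal sketch of a character-window chunker with that signature (the actual helper in app.py may differ):

```python
# Hypothetical reference implementation; the real split_text_into_chunks
# in app.py is not shown in this diff.
def split_text_into_chunks(text: str, chunk_size: int = 2048, overlap: int = 200):
    """Split text into chunk_size-character windows that overlap by `overlap`."""
    if chunk_size <= overlap:
        raise ValueError("chunk_size must be larger than overlap")
    chunks = []
    step = chunk_size - overlap
    for start in range(0, len(text), step):
        chunk = text[start:start + chunk_size]
        if chunk.strip():
            chunks.append(chunk)
    return chunks
```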
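`store_embeddings_in_vector_db` also takes `annoy_index_path` and `num_trees`, but the index-building step sits outside these hunks. Assuming the standard annoy package with an angular metric, that step presumably resembles:

```python
# Sketch only: assumes the annoy package and an angular-distance index,
# a common pairing with cosine-style embedding search.
from annoy import AnnoyIndex

def build_annoy_index(embeddings_np, annoy_index_path: str, num_trees: int = 10):
    dim = embeddings_np.shape[1]
    index = AnnoyIndex(dim, "angular")
    for i, vector in enumerate(embeddings_np):
        index.add_item(i, vector)
    index.build(num_trees)  # more trees -> better recall, larger index
    index.save(annoy_index_path)
    return index
```

At query time the matching lookup would be `index.get_nns_by_vector(query_vec, k)`.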