Rabbitt-AI committed
Commit b512d5e · verified · 1 parent: b1e1d68

Update app.py

Files changed (1):
  app.py (+50 -21)
app.py CHANGED

@@ -15,13 +15,18 @@ from rank_bm25 import BM25Okapi
 from gensim.models import Word2Vec
 from typing import List, Optional, Tuple
 import gradio as gr
-
+import moviepy.editor as mp
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
 
 api_key = os.getenv("MISTRAL_API_KEY")
 client = Mistral(api_key=api_key)
 
+from deepgram import Deepgram
+
+dg_api_key = os.getenv("DEEPGRAM_API_KEY")
+deepgram = Deepgram(dg_api_key)
+
 def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=10, max_delay=60):
     embeddings = []
     for text in text_list:
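Note on this hunk: the transcription calls added further down wrap the Deepgram client in asyncio.run(...), so unless asyncio is already imported earlier in app.py (not visible in this diff), the new audio and video branches will fail with a NameError. The Deepgram(dg_api_key) constructor matches the Deepgram Python SDK v2 (deepgram-sdk 2.x); newer v3 releases replaced it with DeepgramClient. A minimal sketch of the setup, assuming SDK v2; the fail-fast key check is an assumption, not part of this commit:

    import os
    import asyncio  # required by the transcription calls added below
    from deepgram import Deepgram

    dg_api_key = os.getenv("DEEPGRAM_API_KEY")
    if not dg_api_key:
        # Assumption, not in the commit: fail at startup rather than on the first upload.
        raise RuntimeError("DEEPGRAM_API_KEY is not set")
    deepgram = Deepgram(dg_api_key)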
@@ -48,30 +53,55 @@ def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=10, max_delay=60):
     return embeddings
 
 def store_embeddings_in_vector_db(
-    pdf_path: str,
+    file_path: str,
     vector_db_path: str,
     annoy_index_path: str,
     chunk_size: int = 2048,
     overlap: int = 200,
     num_trees: int = 10
 ):
-    doc = fitz.open(pdf_path)
-    all_embeddings = []
     all_texts = []
-    total_pages = doc.page_count
-    logging.info(f"Processing PDF: {pdf_path} with {total_pages} pages.")
-
-    for page_num in range(total_pages):
-        page = doc.load_page(page_num)
-        text = page.get_text()
-        if text.strip():
-            chunks = split_text_into_chunks(text, chunk_size, overlap)
-            embeddings = get_text_embedding_with_rate_limit(chunks)
-            all_embeddings.extend(embeddings)
-            all_texts.extend(chunks)
-            logging.info(f"Processed page {page_num + 1}/{total_pages}, extracted {len(chunks)} chunks.")
-        else:
-            logging.warning(f"No text found on page {page_num + 1}.")
+    if file_path.endswith(('.pdf', '.doc', '.docx', '.pptx', '.ppt', '.xls', '.xlsx', '.txt')):
+        doc = fitz.open(file_path)
+        all_embeddings = []
+        total_pages = doc.page_count
+        logging.info(f"Processing PDF/DOC: {file_path} with {total_pages} pages.")
+
+        for page_num in range(total_pages):
+            page = doc.load_page(page_num)
+            text = page.get_text()
+            if text.strip():
+                chunks = split_text_into_chunks(text, chunk_size, overlap)
+                embeddings = get_text_embedding_with_rate_limit(chunks)
+                all_embeddings.extend(embeddings)
+                all_texts.extend(chunks)
+                logging.info(f"Processed page {page_num + 1}/{total_pages}, extracted {len(chunks)} chunks.")
+            else:
+                logging.warning(f"No text found on page {page_num + 1}.")
+    elif file_path.endswith(('.mp3', '.wav', '.m4a')):
+        logging.info(f"Processing audio file: {file_path}")
+        with open(file_path, 'rb') as audio_file:
+            audio_content = audio_file.read()
+        response = asyncio.run(deepgram.transcription.prerecorded({'buffer': audio_content, 'mimetype': 'audio/wav'}, {'punctuate': True}))
+        text = response['results']['channels'][0]['alternatives'][0]['transcript']
+        chunks = split_text_into_chunks(text, chunk_size, overlap)
+        all_embeddings = get_text_embedding_with_rate_limit(chunks)
+        all_texts.extend(chunks)
+    elif file_path.endswith(('.mp4', '.avi', '.mov')):
+        logging.info(f"Processing video file: {file_path}")
+        video = mp.VideoFileClip(file_path)
+        audio_path = "temp_audio.wav"
+        video.audio.write_audiofile(audio_path)
+        with open(audio_path, 'rb') as audio_file:
+            audio_content = audio_file.read()
+        response = asyncio.run(deepgram.transcription.prerecorded({'buffer': audio_content, 'mimetype': 'audio/wav'}, {'punctuate': True}))
+        text = response['results']['channels'][0]['alternatives'][0]['transcript']
+        os.remove(audio_path)
+        chunks = split_text_into_chunks(text, chunk_size, overlap)
+        all_embeddings = get_text_embedding_with_rate_limit(chunks)
+        all_texts.extend(chunks)
+    else:
+        raise ValueError("Unsupported file format. Please upload a PDF, DOC, DOCX, MP3, WAV, M4A, MP4, AVI, or MOV file.")
 
     embeddings_np = np.array(all_embeddings).astype('float32')
     with open(vector_db_path, "wb") as f:
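Two caveats in the new dispatch. First, the document branch routes .doc, .docx, .pptx, .ppt, .xls, and .xlsx through fitz (PyMuPDF), which handles PDF, XPS, EPUB, TXT, and several e-book and image formats but not Microsoft Office files, so those extensions will likely raise at fitz.open. Second, both transcription calls hardcode 'mimetype': 'audio/wav' even for .mp3 and .m4a uploads. A sketch of a helper that guesses the mimetype from the filename instead, assuming the same SDK v2 client as above; the helper name and fallback are assumptions, not part of this commit:

    import asyncio
    import mimetypes

    def transcribe_file(path: str) -> str:
        # Guess the container type from the extension, e.g. '.mp3' -> 'audio/mpeg';
        # guess_type can return None, hence the fallback.
        mime, _ = mimetypes.guess_type(path)
        with open(path, 'rb') as f:
            source = {'buffer': f.read(), 'mimetype': mime or 'audio/wav'}
        # Deepgram SDK v2: transcription.prerecorded is a coroutine, hence
        # asyncio.run, mirroring the calls in the diff above.
        response = asyncio.run(deepgram.transcription.prerecorded(source, {'punctuate': True}))
        return response['results']['channels'][0]['alternatives'][0]['transcript']

Separately, the fixed temp_audio.wav name in the video branch can collide when two uploads are processed concurrently; Python's tempfile module sidesteps that.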
@@ -327,7 +357,7 @@ def chatbot_interface(file, user_query, response_style):
         selected_reranking_methods=selected_reranking_methods_list
     ))
 
-    formatted_response = f"Response:\n{response}\n\n"
+    formatted_response = f"# **ChanceRAG Response:**\n\n{response}\n\n"
     formatted_response += "Retrieved and Reranked Documents:\n"
     for idx, doc_info in enumerate(source_info, start=1):
         formatted_response += f"\nDocument {idx}:\n"
@@ -335,7 +365,6 @@ def chatbot_interface(file, user_query, response_style):
         formatted_response += f"Retrieval Method: {doc_info['method']}\n"
         if 'score' in doc_info:
             formatted_response += f"Precision Score: {doc_info['score']:.4f}\n"
-
     return formatted_response
 
 iface = gr.Blocks(theme="Rabbitt-AI/ChanceRAG")
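Since the final hunk below switches the output component to gr.Markdown, the single "\n"-separated lines built here may render as run-together text: Markdown collapses single newlines. A sketch of Markdown-friendly formatting for the same fields (field names come from the diff; the list layout is an assumption):

    lines = ["# **ChanceRAG Response:**", "", str(response), "", "## Retrieved and Reranked Documents"]
    for idx, doc_info in enumerate(source_info, start=1):
        # One bullet per document; Markdown keeps list items on separate lines.
        lines.append(f"- **Document {idx}**, method: {doc_info['method']}")
        if 'score' in doc_info:
            lines.append(f"  - precision score: {doc_info['score']:.4f}")
    formatted_response = "\n".join(lines)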
@@ -351,7 +380,7 @@ with iface:
             "Detailed", "Concise", "Creative", "Technical"], label="Response Style"
         ),
     ],
-    outputs=gr.Textbox(label="ChanceRAG Response"),
+    outputs=gr.Markdown(value="# **ChanceRAG Response**"),
 )
 
 iface.launch(share=True)
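gr.Markdown works as an Interface output: the string the wrapped function returns replaces the component's content, and value="..." only sets the placeholder shown before the first query. A minimal standalone sketch; the answer function is a hypothetical stand-in for chatbot_interface, not part of this commit:

    import gradio as gr

    def answer(query: str) -> str:
        # Hypothetical stand-in; returns a Markdown-formatted string.
        return f"# **ChanceRAG Response:**\n\nYou asked: {query}"

    demo = gr.Interface(
        fn=answer,
        inputs=gr.Textbox(label="Query"),
        outputs=gr.Markdown(value="# **ChanceRAG Response**"),
    )
    demo.launch()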
 