Spaces:

tcy6
/

VisRAG_Pipeline

Running

tcy6 committed on Nov 4, 2024

Commit

029aeaf

1 Parent(s): 1bc6d88

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -60,9 +60,11 @@ def get_image_md5(img: Image.Image):
     hex_digest = hash_md5.hexdigest()
     return hex_digest
-def calculate_md5_from_binary(binary_data):
     hash_md5 = hashlib.md5()
-    hash_md5.update(binary_data)
     return hash_md5.hexdigest()
 @spaces.GPU
@@ -71,16 +73,17 @@ def add_pdf_gradio(pdf_file_path, progress=gr.Progress()):
     model.eval()
     print(f"pdf_file_path: {pdf_file_path}")
-    knowledge_base_name = calculate_md5_from_binary(pdf_file_path)
     this_cache_dir = os.path.join(cache_dir, knowledge_base_name)
     os.makedirs(this_cache_dir, exist_ok=True)
-    with open(os.path.join(this_cache_dir, f"src.pdf"), 'wb') as file:
-        file.write(pdf_file_binary)
     dpi = 200
-    doc = fitz.open("pdf", pdf_file_binary)
     reps_list = []
     images = []
@@ -129,7 +132,7 @@ def retrieve_gradio(knowledge_base: str, query: str, topk: int):
     doc_reps = np.load(os.path.join(target_cache_dir, f"reps.npy"))
-    query_with_instruction = "Represent this query for retrieving relavant document: " + query
     with torch.no_grad():
         query_rep = torch.Tensor(encode([query_with_instruction]))
@@ -142,7 +145,7 @@ def retrieve_gradio(knowledge_base: str, query: str, topk: int):
     topk_values, topk_doc_ids = torch.topk(similarities, k=topk)
-    topk_values_np = topk_values.cpu().numpy()
     topk_doc_ids_np = topk_doc_ids.squeeze(0).cpu().numpy()

     hex_digest = hash_md5.hexdigest()
     return hex_digest
+def calculate_md5_from_pdf_path(pdf_file_path):
     hash_md5 = hashlib.md5()
+    with open(pdf_file_path, "rb") as f:
+        file_content = f.read()
+        hash_md5.update(file_content)
     return hash_md5.hexdigest()
 @spaces.GPU
     model.eval()
     print(f"pdf_file_path: {pdf_file_path}")
+    knowledge_base_name = calculate_md5_from_pdf_path(pdf_file_path)
     this_cache_dir = os.path.join(cache_dir, knowledge_base_name)
     os.makedirs(this_cache_dir, exist_ok=True)
+    with open(os.path.join(this_cache_dir, f"src.pdf"), 'wb') as file1:
+        with open(pdf_file_path, "rb") as file2:
+            file1.write(file2.read())
     dpi = 200
+    doc = fitz.open(pdf_file_path)
     reps_list = []
     images = []
     doc_reps = np.load(os.path.join(target_cache_dir, f"reps.npy"))
+    query_with_instruction = "Represent this query for retrieving relevant document: " + query
     with torch.no_grad():
         query_rep = torch.Tensor(encode([query_with_instruction]))
     topk_values, topk_doc_ids = torch.topk(similarities, k=topk)
+    topk_values_np = topk_values.squeeze(0).cpu().numpy()
     topk_doc_ids_np = topk_doc_ids.squeeze(0).cpu().numpy()