Spaces:

tcy6
/

VisRAG_Pipeline

Running

App Files Community

tcy6 committed on Nov 11, 2024

Commit

2573e4b

1 Parent(s): c5c6250

Update example

Browse files

Files changed (3) hide show

.gitattributes +1 -0
app.py +12 -25
car_owner_manual.pdf +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+car_owner_manual.pdf filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -52,13 +52,6 @@ def encode(text_or_image_list):
     embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
     return embeddings
-def get_image_md5(img: Image.Image):
-    img_byte_array = img.tobytes()
-    hash_md5 = hashlib.md5()
-    hash_md5.update(img_byte_array)
-    hex_digest = hash_md5.hexdigest()
-    return hex_digest
 @spaces.GPU
 def add_pdf_gradio(pdf_file_list, progress=gr.Progress()):
     global model, tokenizer
@@ -71,7 +64,7 @@ def add_pdf_gradio(pdf_file_list, progress=gr.Progress()):
     knowledge_base_name = str(int(time.time()))
     this_cache_dir = os.path.join(cache_dir, knowledge_base_name)
     os.makedirs(this_cache_dir, exist_ok=True)
-    global_image_md5s = []
     for pdf_file_path in pdf_file_list:
         with open(os.path.join(this_cache_dir, os.path.basename(pdf_file_path)), 'wb') as file1:
@@ -82,10 +75,11 @@ def add_pdf_gradio(pdf_file_list, progress=gr.Progress()):
         print(f"Processing {pdf_file_path}")
         dpi = 200
         doc = fitz.open(pdf_file_path)
-        image_md5s = []
         reps_list = []
         images = []
@@ -93,8 +87,6 @@ def add_pdf_gradio(pdf_file_list, progress=gr.Progress()):
             # with self.lock: # because we hope one 16G gpu only process one image at the same time
             pix = page.get_pixmap(dpi=dpi)
             image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-            image_md5 = get_image_md5(image)
-            image_md5s.append(image_md5)
             with torch.no_grad():
                 reps = encode([image])
             reps_list.append(reps)
@@ -102,17 +94,14 @@ def add_pdf_gradio(pdf_file_list, progress=gr.Progress()):
         for idx in range(len(images)):
             image = images[idx]
-            image_md5 = image_md5s[idx]
-            cache_image_path = os.path.join(this_cache_dir, f"{image_md5}.png")
             image.save(cache_image_path)
-        np.save(os.path.join(this_cache_dir, f"{os.path.basename(pdf_file_path).split('.')[0]}.npy"), reps_list)
-        global_image_md5s.extend(image_md5s)
-    with open(os.path.join(this_cache_dir, f"md5s.txt"), 'w') as f:
-        for item in global_image_md5s:
-            f.write(item+'\n')
     return knowledge_base_name
@@ -127,10 +116,8 @@ def retrieve_gradio(knowledge_base: str, query: str, topk: int):
     if not os.path.exists(target_cache_dir):
         return None
-    md5s = []
-    with open(os.path.join(target_cache_dir, f"md5s.txt"), 'r') as f:
-        for line in f:
-            md5s.append(line.rstrip('\n'))
     doc_list = [f for f in os.listdir(target_cache_dir) if f.endswith('.npy')]
     doc_list = sorted(doc_list)
@@ -155,14 +142,14 @@ def retrieve_gradio(knowledge_base: str, query: str, topk: int):
     similarities_np = similarities.cpu().numpy()
     print(f"topk_doc_ids_np: {topk_doc_ids_np}, topk_values_np: {topk_values_np}")
-    images_topk = [Image.open(os.path.join(target_cache_dir, f"{md5s[idx]}.png")) for idx in topk_doc_ids_np]
     with open(os.path.join(target_cache_dir, f"q-{query_md5}.json"), 'w') as f:
         f.write(json.dumps(
             {
                 "knowledge_base": knowledge_base,
                 "query": query,
-                "retrived_docs": [os.path.join(target_cache_dir, f"{md5s[idx]}.png") for idx in topk_doc_ids_np]
             }, indent=4, ensure_ascii=False
         ))

     embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
     return embeddings
 @spaces.GPU
 def add_pdf_gradio(pdf_file_list, progress=gr.Progress()):
     global model, tokenizer
     knowledge_base_name = str(int(time.time()))
     this_cache_dir = os.path.join(cache_dir, knowledge_base_name)
     os.makedirs(this_cache_dir, exist_ok=True)
+    index2img_filename = []
     for pdf_file_path in pdf_file_list:
         with open(os.path.join(this_cache_dir, os.path.basename(pdf_file_path)), 'wb') as file1:
         print(f"Processing {pdf_file_path}")
+        pdf_name = os.path.basename(pdf_file_path)
         dpi = 200
         doc = fitz.open(pdf_file_path)
         reps_list = []
         images = []
             # with self.lock: # because we hope one 16G gpu only process one image at the same time
             pix = page.get_pixmap(dpi=dpi)
             image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
             with torch.no_grad():
                 reps = encode([image])
             reps_list.append(reps)
         for idx in range(len(images)):
             image = images[idx]
+            cache_image_path = os.path.join(this_cache_dir, f"{pdf_name}_{idx}.png")
             image.save(cache_image_path)
+            index2img_filename.append(os.path.basename(cache_image_path))
+        np.save(os.path.join(this_cache_dir, f"{pdf_name.split('.')[0]}.npy"), reps_list)
+    with open(os.path.join(this_cache_dir, f"index2img_filename.txt"), 'w') as f:
+        f.write('\n'.join(index2img_filename))
     return knowledge_base_name
     if not os.path.exists(target_cache_dir):
         return None
+    with open(os.path.join(target_cache_dir, f"index2img_filename.txt"), 'r') as f:
+        index2img_filename = f.read().split('\n')
     doc_list = [f for f in os.listdir(target_cache_dir) if f.endswith('.npy')]
     doc_list = sorted(doc_list)
     similarities_np = similarities.cpu().numpy()
     print(f"topk_doc_ids_np: {topk_doc_ids_np}, topk_values_np: {topk_values_np}")
+    images_topk = [Image.open(os.path.join(target_cache_dir, f"{index2img_filename[idx]}.png")) for idx in topk_doc_ids_np]
     with open(os.path.join(target_cache_dir, f"q-{query_md5}.json"), 'w') as f:
         f.write(json.dumps(
             {
                 "knowledge_base": knowledge_base,
                 "query": query,
+                "retrived_docs": [os.path.join(target_cache_dir, f"{index2img_filename[idx]}.png") for idx in topk_doc_ids_np]
             }, indent=4, ensure_ascii=False
         ))

car_owner_manual.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2e0ee68f14306f3050e0729ef0c19988fc1f501ba4b81ad35aa2b254086bac38
+size 12360551