Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -60,9 +60,11 @@ def get_image_md5(img: Image.Image):
|
|
60 |
hex_digest = hash_md5.hexdigest()
|
61 |
return hex_digest
|
62 |
|
63 |
-
def
|
64 |
hash_md5 = hashlib.md5()
|
65 |
-
|
|
|
|
|
66 |
return hash_md5.hexdigest()
|
67 |
|
68 |
@spaces.GPU
|
@@ -71,16 +73,17 @@ def add_pdf_gradio(pdf_file_path, progress=gr.Progress()):
|
|
71 |
model.eval()
|
72 |
print(f"pdf_file_path: {pdf_file_path}")
|
73 |
|
74 |
-
knowledge_base_name =
|
75 |
|
76 |
this_cache_dir = os.path.join(cache_dir, knowledge_base_name)
|
77 |
os.makedirs(this_cache_dir, exist_ok=True)
|
78 |
|
79 |
-
with open(os.path.join(this_cache_dir, f"src.pdf"), 'wb') as
|
80 |
-
|
|
|
81 |
|
82 |
dpi = 200
|
83 |
-
doc = fitz.open(
|
84 |
|
85 |
reps_list = []
|
86 |
images = []
|
@@ -129,7 +132,7 @@ def retrieve_gradio(knowledge_base: str, query: str, topk: int):
|
|
129 |
|
130 |
doc_reps = np.load(os.path.join(target_cache_dir, f"reps.npy"))
|
131 |
|
132 |
-
query_with_instruction = "Represent this query for retrieving
|
133 |
with torch.no_grad():
|
134 |
query_rep = torch.Tensor(encode([query_with_instruction]))
|
135 |
|
@@ -142,7 +145,7 @@ def retrieve_gradio(knowledge_base: str, query: str, topk: int):
|
|
142 |
|
143 |
topk_values, topk_doc_ids = torch.topk(similarities, k=topk)
|
144 |
|
145 |
-
topk_values_np = topk_values.cpu().numpy()
|
146 |
|
147 |
topk_doc_ids_np = topk_doc_ids.squeeze(0).cpu().numpy()
|
148 |
|
|
|
60 |
hex_digest = hash_md5.hexdigest()
|
61 |
return hex_digest
|
62 |
|
63 |
+
def calculate_md5_from_pdf_path(pdf_file_path):
|
64 |
hash_md5 = hashlib.md5()
|
65 |
+
with open(pdf_file_path, "rb") as f:
|
66 |
+
file_content = f.read()
|
67 |
+
hash_md5.update(file_content)
|
68 |
return hash_md5.hexdigest()
|
69 |
|
70 |
@spaces.GPU
|
|
|
73 |
model.eval()
|
74 |
print(f"pdf_file_path: {pdf_file_path}")
|
75 |
|
76 |
+
knowledge_base_name = calculate_md5_from_pdf_path(pdf_file_path)
|
77 |
|
78 |
this_cache_dir = os.path.join(cache_dir, knowledge_base_name)
|
79 |
os.makedirs(this_cache_dir, exist_ok=True)
|
80 |
|
81 |
+
with open(os.path.join(this_cache_dir, f"src.pdf"), 'wb') as file1:
|
82 |
+
with open(pdf_file_path, "rb") as file2:
|
83 |
+
file1.write(file2.read())
|
84 |
|
85 |
dpi = 200
|
86 |
+
doc = fitz.open(pdf_file_path)
|
87 |
|
88 |
reps_list = []
|
89 |
images = []
|
|
|
132 |
|
133 |
doc_reps = np.load(os.path.join(target_cache_dir, f"reps.npy"))
|
134 |
|
135 |
+
query_with_instruction = "Represent this query for retrieving relevant document: " + query
|
136 |
with torch.no_grad():
|
137 |
query_rep = torch.Tensor(encode([query_with_instruction]))
|
138 |
|
|
|
145 |
|
146 |
topk_values, topk_doc_ids = torch.topk(similarities, k=topk)
|
147 |
|
148 |
+
topk_values_np = topk_values.squeeze(0).cpu().numpy()
|
149 |
|
150 |
topk_doc_ids_np = topk_doc_ids.squeeze(0).cpu().numpy()
|
151 |
|