tcy6 committed on
Commit
029aeaf
1 Parent(s): 1bc6d88

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -8
app.py CHANGED
@@ -60,9 +60,11 @@ def get_image_md5(img: Image.Image):
60
  hex_digest = hash_md5.hexdigest()
61
  return hex_digest
62
 
63
- def calculate_md5_from_binary(binary_data):
64
  hash_md5 = hashlib.md5()
65
- hash_md5.update(binary_data)
 
 
66
  return hash_md5.hexdigest()
67
 
68
  @spaces.GPU
@@ -71,16 +73,17 @@ def add_pdf_gradio(pdf_file_path, progress=gr.Progress()):
71
  model.eval()
72
  print(f"pdf_file_path: {pdf_file_path}")
73
 
74
- knowledge_base_name = calculate_md5_from_binary(pdf_file_path)
75
 
76
  this_cache_dir = os.path.join(cache_dir, knowledge_base_name)
77
  os.makedirs(this_cache_dir, exist_ok=True)
78
 
79
- with open(os.path.join(this_cache_dir, f"src.pdf"), 'wb') as file:
80
- file.write(pdf_file_binary)
 
81
 
82
  dpi = 200
83
- doc = fitz.open("pdf", pdf_file_binary)
84
 
85
  reps_list = []
86
  images = []
@@ -129,7 +132,7 @@ def retrieve_gradio(knowledge_base: str, query: str, topk: int):
129
 
130
  doc_reps = np.load(os.path.join(target_cache_dir, f"reps.npy"))
131
 
132
- query_with_instruction = "Represent this query for retrieving relavant document: " + query
133
  with torch.no_grad():
134
  query_rep = torch.Tensor(encode([query_with_instruction]))
135
 
@@ -142,7 +145,7 @@ def retrieve_gradio(knowledge_base: str, query: str, topk: int):
142
 
143
  topk_values, topk_doc_ids = torch.topk(similarities, k=topk)
144
 
145
- topk_values_np = topk_values.cpu().numpy()
146
 
147
  topk_doc_ids_np = topk_doc_ids.squeeze(0).cpu().numpy()
148
 
 
60
  hex_digest = hash_md5.hexdigest()
61
  return hex_digest
62
 
63
+ def calculate_md5_from_pdf_path(pdf_file_path):
64
  hash_md5 = hashlib.md5()
65
+ with open(pdf_file_path, "rb") as f:
66
+ file_content = f.read()
67
+ hash_md5.update(file_content)
68
  return hash_md5.hexdigest()
69
 
70
  @spaces.GPU
 
73
  model.eval()
74
  print(f"pdf_file_path: {pdf_file_path}")
75
 
76
+ knowledge_base_name = calculate_md5_from_pdf_path(pdf_file_path)
77
 
78
  this_cache_dir = os.path.join(cache_dir, knowledge_base_name)
79
  os.makedirs(this_cache_dir, exist_ok=True)
80
 
81
+ with open(os.path.join(this_cache_dir, f"src.pdf"), 'wb') as file1:
82
+ with open(pdf_file_path, "rb") as file2:
83
+ file1.write(file2.read())
84
 
85
  dpi = 200
86
+ doc = fitz.open(pdf_file_path)
87
 
88
  reps_list = []
89
  images = []
 
132
 
133
  doc_reps = np.load(os.path.join(target_cache_dir, f"reps.npy"))
134
 
135
+ query_with_instruction = "Represent this query for retrieving relevant document: " + query
136
  with torch.no_grad():
137
  query_rep = torch.Tensor(encode([query_with_instruction]))
138
 
 
145
 
146
  topk_values, topk_doc_ids = torch.topk(similarities, k=topk)
147
 
148
+ topk_values_np = topk_values.squeeze(0).cpu().numpy()
149
 
150
  topk_doc_ids_np = topk_doc_ids.squeeze(0).cpu().numpy()
151