tcy6 committed on
Commit
2573e4b
1 Parent(s): c5c6250

Update example

Browse files
Files changed (3) hide show
  1. .gitattributes +1 -0
  2. app.py +12 -25
  3. car_owner_manual.pdf +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ car_owner_manual.pdf filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -52,13 +52,6 @@ def encode(text_or_image_list):
52
  embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
53
  return embeddings
54
 
55
- def get_image_md5(img: Image.Image):
56
- img_byte_array = img.tobytes()
57
- hash_md5 = hashlib.md5()
58
- hash_md5.update(img_byte_array)
59
- hex_digest = hash_md5.hexdigest()
60
- return hex_digest
61
-
62
  @spaces.GPU
63
  def add_pdf_gradio(pdf_file_list, progress=gr.Progress()):
64
  global model, tokenizer
@@ -71,7 +64,7 @@ def add_pdf_gradio(pdf_file_list, progress=gr.Progress()):
71
  knowledge_base_name = str(int(time.time()))
72
  this_cache_dir = os.path.join(cache_dir, knowledge_base_name)
73
  os.makedirs(this_cache_dir, exist_ok=True)
74
- global_image_md5s = []
75
 
76
  for pdf_file_path in pdf_file_list:
77
  with open(os.path.join(this_cache_dir, os.path.basename(pdf_file_path)), 'wb') as file1:
@@ -82,10 +75,11 @@ def add_pdf_gradio(pdf_file_list, progress=gr.Progress()):
82
 
83
  print(f"Processing {pdf_file_path}")
84
 
 
 
85
  dpi = 200
86
  doc = fitz.open(pdf_file_path)
87
 
88
- image_md5s = []
89
  reps_list = []
90
  images = []
91
 
@@ -93,8 +87,6 @@ def add_pdf_gradio(pdf_file_list, progress=gr.Progress()):
93
  # with self.lock: # because we hope one 16G gpu only process one image at the same time
94
  pix = page.get_pixmap(dpi=dpi)
95
  image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
96
- image_md5 = get_image_md5(image)
97
- image_md5s.append(image_md5)
98
  with torch.no_grad():
99
  reps = encode([image])
100
  reps_list.append(reps)
@@ -102,17 +94,14 @@ def add_pdf_gradio(pdf_file_list, progress=gr.Progress()):
102
 
103
  for idx in range(len(images)):
104
  image = images[idx]
105
- image_md5 = image_md5s[idx]
106
- cache_image_path = os.path.join(this_cache_dir, f"{image_md5}.png")
107
  image.save(cache_image_path)
 
108
 
109
- np.save(os.path.join(this_cache_dir, f"{os.path.basename(pdf_file_path).split('.')[0]}.npy"), reps_list)
110
-
111
- global_image_md5s.extend(image_md5s)
112
 
113
- with open(os.path.join(this_cache_dir, f"md5s.txt"), 'w') as f:
114
- for item in global_image_md5s:
115
- f.write(item+'\n')
116
 
117
  return knowledge_base_name
118
 
@@ -127,10 +116,8 @@ def retrieve_gradio(knowledge_base: str, query: str, topk: int):
127
  if not os.path.exists(target_cache_dir):
128
  return None
129
 
130
- md5s = []
131
- with open(os.path.join(target_cache_dir, f"md5s.txt"), 'r') as f:
132
- for line in f:
133
- md5s.append(line.rstrip('\n'))
134
 
135
  doc_list = [f for f in os.listdir(target_cache_dir) if f.endswith('.npy')]
136
  doc_list = sorted(doc_list)
@@ -155,14 +142,14 @@ def retrieve_gradio(knowledge_base: str, query: str, topk: int):
155
 
156
  similarities_np = similarities.cpu().numpy()
157
  print(f"topk_doc_ids_np: {topk_doc_ids_np}, topk_values_np: {topk_values_np}")
158
- images_topk = [Image.open(os.path.join(target_cache_dir, f"{md5s[idx]}.png")) for idx in topk_doc_ids_np]
159
 
160
  with open(os.path.join(target_cache_dir, f"q-{query_md5}.json"), 'w') as f:
161
  f.write(json.dumps(
162
  {
163
  "knowledge_base": knowledge_base,
164
  "query": query,
165
- "retrived_docs": [os.path.join(target_cache_dir, f"{md5s[idx]}.png") for idx in topk_doc_ids_np]
166
  }, indent=4, ensure_ascii=False
167
  ))
168
 
 
52
  embeddings = F.normalize(reps, p=2, dim=1).detach().cpu().numpy()
53
  return embeddings
54
 
 
 
 
 
 
 
 
55
  @spaces.GPU
56
  def add_pdf_gradio(pdf_file_list, progress=gr.Progress()):
57
  global model, tokenizer
 
64
  knowledge_base_name = str(int(time.time()))
65
  this_cache_dir = os.path.join(cache_dir, knowledge_base_name)
66
  os.makedirs(this_cache_dir, exist_ok=True)
67
+ index2img_filename = []
68
 
69
  for pdf_file_path in pdf_file_list:
70
  with open(os.path.join(this_cache_dir, os.path.basename(pdf_file_path)), 'wb') as file1:
 
75
 
76
  print(f"Processing {pdf_file_path}")
77
 
78
+ pdf_name = os.path.basename(pdf_file_path)
79
+
80
  dpi = 200
81
  doc = fitz.open(pdf_file_path)
82
 
 
83
  reps_list = []
84
  images = []
85
 
 
87
  # with self.lock: # because we hope one 16G gpu only process one image at the same time
88
  pix = page.get_pixmap(dpi=dpi)
89
  image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
 
 
90
  with torch.no_grad():
91
  reps = encode([image])
92
  reps_list.append(reps)
 
94
 
95
  for idx in range(len(images)):
96
  image = images[idx]
97
+ cache_image_path = os.path.join(this_cache_dir, f"{pdf_name}_{idx}.png")
 
98
  image.save(cache_image_path)
99
+ index2img_filename.append(os.path.basename(cache_image_path))
100
 
101
+ np.save(os.path.join(this_cache_dir, f"{pdf_name.split('.')[0]}.npy"), reps_list)
 
 
102
 
103
+ with open(os.path.join(this_cache_dir, f"index2img_filename.txt"), 'w') as f:
104
+ f.write('\n'.join(index2img_filename))
 
105
 
106
  return knowledge_base_name
107
 
 
116
  if not os.path.exists(target_cache_dir):
117
  return None
118
 
119
+ with open(os.path.join(target_cache_dir, f"index2img_filename.txt"), 'r') as f:
120
+ index2img_filename = f.read().split('\n')
 
 
121
 
122
  doc_list = [f for f in os.listdir(target_cache_dir) if f.endswith('.npy')]
123
  doc_list = sorted(doc_list)
 
142
 
143
  similarities_np = similarities.cpu().numpy()
144
  print(f"topk_doc_ids_np: {topk_doc_ids_np}, topk_values_np: {topk_values_np}")
145
+ images_topk = [Image.open(os.path.join(target_cache_dir, f"{index2img_filename[idx]}.png")) for idx in topk_doc_ids_np]
146
 
147
  with open(os.path.join(target_cache_dir, f"q-{query_md5}.json"), 'w') as f:
148
  f.write(json.dumps(
149
  {
150
  "knowledge_base": knowledge_base,
151
  "query": query,
152
+ "retrived_docs": [os.path.join(target_cache_dir, f"{index2img_filename[idx]}.png") for idx in topk_doc_ids_np]
153
  }, indent=4, ensure_ascii=False
154
  ))
155
 
car_owner_manual.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e0ee68f14306f3050e0729ef0c19988fc1f501ba4b81ad35aa2b254086bac38
3
+ size 12360551