bokesyo committed on
Commit
20f7fea
1 Parent(s): 378eecb

Update pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline.py +5 -5
pipeline.py CHANGED
@@ -22,7 +22,7 @@ def get_image_md5(img: Image.Image):
22
  hex_digest = hash_md5.hexdigest()
23
  return hex_digest
24
 
25
- def pdf_to_images(pdf_path, dpi=100):
26
  doc = fitz.open(pdf_path)
27
  images = []
28
  for page in tqdm.tqdm(doc):
@@ -67,7 +67,7 @@ class PDFVisualRetrieval:
67
  images_topk = [all_images_doc_list[idx] for idx in topk_doc_ids_np]
68
  return topk_doc_ids_np, topk_values_np, images_topk
69
 
70
- def add_pdf(self, knowledge_base_name: str, pdf_file_path: str, dpi: int = 100):
71
  print("[1/2] rendering pdf to images..")
72
  images = pdf_to_images(pdf_file_path, dpi=dpi)
73
  print("[2/2] model encoding images..")
@@ -94,10 +94,10 @@ if __name__ == "__main__":
94
  retriever = PDFVisualRetrieval(model=model, tokenizer=tokenizer)
95
  retriever.add_pdf('test', pdf_path)
96
 
97
- topk_doc_ids_np, topk_values_np, images_topk = retriever.retrieve(knowledge_base='test', query='what is the number of VQ of this kind of codec method?', topk=1)
98
  # 2
99
- topk_doc_ids_np, topk_values_np, images_topk = retriever.retrieve(knowledge_base='test', query='the training loss curve of this paper?', topk=1)
100
  # 3
101
- topk_doc_ids_np, topk_values_np, images_topk = retriever.retrieve(knowledge_base='test', query='the experiment table?', topk=1)
102
  # 2
103
 
 
22
  hex_digest = hash_md5.hexdigest()
23
  return hex_digest
24
 
25
+ def pdf_to_images(pdf_path, dpi=200):
26
  doc = fitz.open(pdf_path)
27
  images = []
28
  for page in tqdm.tqdm(doc):
 
67
  images_topk = [all_images_doc_list[idx] for idx in topk_doc_ids_np]
68
  return topk_doc_ids_np, topk_values_np, images_topk
69
 
70
+ def add_pdf(self, knowledge_base_name: str, pdf_file_path: str, dpi: int = 200):
71
  print("[1/2] rendering pdf to images..")
72
  images = pdf_to_images(pdf_file_path, dpi=dpi)
73
  print("[2/2] model encoding images..")
 
94
  retriever = PDFVisualRetrieval(model=model, tokenizer=tokenizer)
95
  retriever.add_pdf('test', pdf_path)
96
 
97
+ topk_doc_ids_np, topk_values_np, images_topk = retriever.retrieve(knowledge_base='test', query='what is the number of VQ of this kind of codec method?', topk=5)
98
  # 2
99
+ topk_doc_ids_np, topk_values_np, images_topk = retriever.retrieve(knowledge_base='test', query='the training loss curve of this paper?', topk=5)
100
  # 3
101
+ topk_doc_ids_np, topk_values_np, images_topk = retriever.retrieve(knowledge_base='test', query='the experiment table?', topk=5)
102
  # 2
103