Update pipeline.py
Browse files- pipeline.py +5 -5
pipeline.py
CHANGED
@@ -22,7 +22,7 @@ def get_image_md5(img: Image.Image):
|
|
22 |
hex_digest = hash_md5.hexdigest()
|
23 |
return hex_digest
|
24 |
|
25 |
-
def pdf_to_images(pdf_path, dpi=
|
26 |
doc = fitz.open(pdf_path)
|
27 |
images = []
|
28 |
for page in tqdm.tqdm(doc):
|
@@ -67,7 +67,7 @@ class PDFVisualRetrieval:
|
|
67 |
images_topk = [all_images_doc_list[idx] for idx in topk_doc_ids_np]
|
68 |
return topk_doc_ids_np, topk_values_np, images_topk
|
69 |
|
70 |
-
def add_pdf(self, knowledge_base_name: str, pdf_file_path: str, dpi: int =
|
71 |
print("[1/2] rendering pdf to images..")
|
72 |
images = pdf_to_images(pdf_file_path, dpi=dpi)
|
73 |
print("[2/2] model encoding images..")
|
@@ -94,10 +94,10 @@ if __name__ == "__main__":
|
|
94 |
retriever = PDFVisualRetrieval(model=model, tokenizer=tokenizer)
|
95 |
retriever.add_pdf('test', pdf_path)
|
96 |
|
97 |
-
topk_doc_ids_np, topk_values_np, images_topk = retriever.retrieve(knowledge_base='test', query='what is the number of VQ of this kind of codec method?', topk=
|
98 |
# 2
|
99 |
-
topk_doc_ids_np, topk_values_np, images_topk = retriever.retrieve(knowledge_base='test', query='the training loss curve of this paper?', topk=
|
100 |
# 3
|
101 |
-
topk_doc_ids_np, topk_values_np, images_topk = retriever.retrieve(knowledge_base='test', query='the experiment table?', topk=
|
102 |
# 2
|
103 |
|
|
|
22 |
hex_digest = hash_md5.hexdigest()
|
23 |
return hex_digest
|
24 |
|
25 |
+
def pdf_to_images(pdf_path, dpi=200):
|
26 |
doc = fitz.open(pdf_path)
|
27 |
images = []
|
28 |
for page in tqdm.tqdm(doc):
|
|
|
67 |
images_topk = [all_images_doc_list[idx] for idx in topk_doc_ids_np]
|
68 |
return topk_doc_ids_np, topk_values_np, images_topk
|
69 |
|
70 |
+
def add_pdf(self, knowledge_base_name: str, pdf_file_path: str, dpi: int = 200):
|
71 |
print("[1/2] rendering pdf to images..")
|
72 |
images = pdf_to_images(pdf_file_path, dpi=dpi)
|
73 |
print("[2/2] model encoding images..")
|
|
|
94 |
retriever = PDFVisualRetrieval(model=model, tokenizer=tokenizer)
|
95 |
retriever.add_pdf('test', pdf_path)
|
96 |
|
97 |
+
topk_doc_ids_np, topk_values_np, images_topk = retriever.retrieve(knowledge_base='test', query='what is the number of VQ of this kind of codec method?', topk=5)
|
98 |
# 2
|
99 |
+
topk_doc_ids_np, topk_values_np, images_topk = retriever.retrieve(knowledge_base='test', query='the training loss curve of this paper?', topk=5)
|
100 |
# 3
|
101 |
+
topk_doc_ids_np, topk_values_np, images_topk = retriever.retrieve(knowledge_base='test', query='the experiment table?', topk=5)
|
102 |
# 2
|
103 |
|