Multimodal-PDF-RAG / utils.py
anand004's picture
bug fixes and improvement
e014b81 unverified
raw
history blame
1.39 kB
import pymupdf
from PIL import Image
import io
import gradio as gr
import base64
import pandas as pd
import pymupdf
def image_to_bytes(image):
img_byte_arr = io.BytesIO()
image.save(img_byte_arr, format="PNG")
return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
def extract_pdfs(docs, doc_collection):
if docs:
doc_collection = []
doc_collection.extend(docs)
return (
doc_collection,
gr.Tabs(selected=1),
pd.DataFrame([i.split("/")[-1] for i in list(docs)], columns=["Filename"]),
)
def extract_images(docs):
images = []
for doc_path in docs:
doc = pymupdf.open(doc_path)
for page_index in range(len(doc)):
page = doc[page_index]
image_list = page.get_images()
for _, img in enumerate(image_list, start=1):
xref = img[0]
pix = pymupdf.Pixmap(doc, xref)
if pix.n - pix.alpha > 3:
pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
images.append(Image.open(io.BytesIO(pix.pil_tobytes("JPEG"))))
return images
def clean_text(text):
text = text.strip()
cleaned_text = text.replace("\n", " ")
cleaned_text = cleaned_text.replace("\t", " ")
cleaned_text = cleaned_text.replace(" ", " ")
cleaned_text = cleaned_text.strip()
return cleaned_text