# Multimodal-PDF-RAG / utils.py
# anand004's picture
# bug fixes and improvement
# e014b81 unverified
import pymupdf
from PIL import Image
import io
import gradio as gr
import base64
import pandas as pd
import pymupdf
def image_to_bytes(image):
    """Encode *image* as PNG and return the result as base64 text.

    Args:
        image: Any object exposing ``save(fp, format=...)``; presumably a
            ``PIL.Image.Image`` -- verify against callers.

    Returns:
        The PNG payload, base64-encoded and decoded to a UTF-8 ``str``
        (despite the function name, the result is text, not ``bytes``).
    """
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    png_payload = buffer.getvalue()
    return str(base64.b64encode(png_payload), "utf-8")
def extract_pdfs(docs, doc_collection):
    """Replace the stored document list with a new upload and build UI outputs.

    Args:
        docs: Iterable of path strings for the uploaded files. May be
            ``None`` or empty when nothing was uploaded.
        doc_collection: The previously stored collection. It is never
            mutated; it is returned as-is when ``docs`` is falsy.

    Returns:
        A 3-tuple ``(doc_collection, tabs, filenames)`` where
        ``doc_collection`` is a fresh list of ``docs`` (or the incoming
        value when nothing was uploaded), ``tabs`` is ``gr.Tabs(selected=1)``
        and ``filenames`` is a one-column ``"Filename"`` DataFrame holding
        the last ``/``-separated component of each path.
    """
    # Materialise once: tolerates None (no upload) and one-shot iterables.
    uploaded = list(docs) if docs else []
    if uploaded:
        # Rebind to a new list so the caller's previous collection object
        # is left untouched.
        doc_collection = list(uploaded)
    filenames = [path.split("/")[-1] for path in uploaded]
    return (
        doc_collection,
        gr.Tabs(selected=1),
        pd.DataFrame(filenames, columns=["Filename"]),
    )
def extract_images(docs):
    """Collect the images embedded in the given PDF files.

    Args:
        docs: Iterable of document paths accepted by ``pymupdf.open``.

    Returns:
        A flat list of PIL images in document, page, then per-page
        image-list order. Every image is re-encoded as JPEG before being
        loaded. An image referenced by several pages appears once per page.
    """
    images = []
    for doc_path in docs:
        # The context manager closes the document (and its file handle)
        # even if decoding one of its images raises.
        with pymupdf.open(doc_path) as doc:
            for page in doc:
                for img in page.get_images():
                    # First item of each image entry is its xref number.
                    xref = img[0]
                    pix = pymupdf.Pixmap(doc, xref)
                    if pix.n - pix.alpha > 3:
                        # More than three colour components (e.g. CMYK):
                        # convert to RGB before JPEG encoding.
                        pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
                    jpeg_bytes = pix.pil_tobytes("JPEG")
                    images.append(Image.open(io.BytesIO(jpeg_bytes)))
    return images
def clean_text(text):
    """Flatten *text* onto one line with single spaces between words.

    Leading and trailing whitespace is stripped, newlines and tabs are
    turned into spaces, and every run of consecutive spaces is collapsed to
    exactly one space.

    Args:
        text: The string to normalise.

    Returns:
        The cleaned string; ``""`` when *text* is empty or whitespace-only.
    """
    flattened = text.strip().replace("\n", " ").replace("\t", " ")
    # split(" ") yields an empty string between adjacent spaces; dropping
    # those collapses runs of any length, which a single
    # replace("  ", " ") pass cannot do.
    return " ".join(word for word in flattened.split(" ") if word)