# indemo / app.py
# muradkhan's picture
# Update app.py
# aa71834 verified
# raw
# history blame
# 2.05 kB
from pprint import pprint
from PyPDF2 import PdfReader
import gradio as gr
from transformers import pipeline
import os
# Function to read PDF file content directly
def read_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF; passed straight to ``PdfReader``.

    Returns:
        The extracted text of all pages joined together with no separator.
    """
    reader = PdfReader(pdf_path)
    # Join once instead of growing a string page by page; a page that yields
    # no text contributes an empty string rather than aborting the read.
    return "".join(page.extract_text() or "" for page in reader.pages)
# Process and retrieve answers
def process_invoice(file, questions):
    """Answer comma-separated questions about an uploaded invoice PDF.

    Args:
        file: The uploaded PDF. Either an object exposing its path as
            ``.name`` or a plain path string is accepted.
        questions: Questions separated by commas. Whitespace around each
            question is stripped and empty entries are ignored.

    Returns:
        A dict mapping each (stripped) question to the ``'answer'`` field of
        the pipeline result, or ``{"error": message}`` if any step fails.
        Failures are reported in the returned dict rather than raised so the
        caller can display them.
    """
    try:
        # Validate the cheap inputs first so a bad request never pays for
        # PDF parsing or model loading.
        if file is None:
            raise ValueError("No PDF file was uploaded.")
        question_list = [
            part.strip() for part in (questions or "").split(',') if part.strip()
        ]
        if not question_list:
            raise ValueError("No questions were provided.")

        # Accept both an upload wrapper carrying ``.name`` and a raw path.
        pdf_path = getattr(file, "name", file)

        # Read the PDF content directly
        print("Reading PDF content...")
        pdf_content = read_pdf(pdf_path)
        # Print first 500 characters for verification
        print(f"PDF Content: {pdf_content[:500]}...")

        # Get the Hugging Face token from environment variables
        hf_token = os.getenv("HF_TOKEN")
        if not hf_token:
            raise ValueError("Hugging Face token not found in environment variables.")

        # Initialize the Hugging Face pipeline (rebuilt on every call).
        # NOTE(review): newer transformers releases appear to prefer ``token=``
        # over ``use_auth_token=`` -- confirm against the pinned version.
        print("Initializing the Hugging Face pipeline...")
        qa_pipeline = pipeline(
            "question-answering",
            model="mistralai/Mixtral-8x7B-Instruct-v0.1",
            use_auth_token=hf_token,
        )

        answers = {}
        for question in question_list:
            print(f"Asking question: {question}")
            result = qa_pipeline(question=question, context=pdf_content)
            # Key by the stripped question so "a, b" yields "b", not " b".
            answers[question] = result['answer']
            print(f"Answer: {result['answer']}")
        return answers
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        print(f"Error: {e}")
        return {"error": str(e)}
# Gradio interface
def gradio_interface(file, questions):
    """Gradio callback: hand the upload and questions to ``process_invoice``.

    Returns whatever ``process_invoice`` returns, unchanged.
    """
    return process_invoice(file, questions)
# Build the UI: one PDF upload and a multi-line box for the questions; the
# callback's return value is rendered as JSON.
interface = gr.Interface(
    title="Invoice Data Extraction",
    description="Upload an invoice PDF and get the extracted data based on your questions.",
    fn=gradio_interface,
    inputs=[
        gr.File(label="Upload Invoice (PDF)", file_count="single"),
        gr.Textbox(placeholder="Enter your questions separated by commas", lines=5),
    ],
    outputs="json",
)

# Launch the interface only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()