File size: 2,047 Bytes
ddb0d1e ac61621 ddb0d1e 86d3138 0a55ae4 ddb0d1e ac61621 d22351a ddb0d1e aa71834 954da24 aa71834 954da24 ddb0d1e aa71834 ddb0d1e bf9d3cd ddb0d1e aa71834 ddb0d1e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
from pprint import pprint
from PyPDF2 import PdfReader
import gradio as gr
from transformers import pipeline
import os
# Function to read PDF file content directly
def read_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF; passed straight to ``PdfReader``.

    Returns:
        One string holding the extracted text of all pages. The string is
        empty when no page yields any text.
    """
    reader = PdfReader(pdf_path)
    # ``or ""`` keeps a page that yields no text from breaking the
    # concatenation should the extractor hand back None instead of "".
    return "".join(page.extract_text() or "" for page in reader.pages)
# Process and retrieve answers
def process_invoice(file, questions):
    """Answer comma-separated questions about an uploaded invoice PDF.

    Args:
        file: The uploaded file, either an object exposing its path as
            ``.name`` or the path string itself.
        questions: Comma-separated questions. Whitespace around each
            question is ignored and blank entries are skipped.

    Returns:
        A dict mapping each stripped question to the answer returned by the
        question-answering pipeline, or ``{"error": message}`` when anything
        fails. Exceptions are never propagated, so the caller can always
        render the result.
    """
    try:
        # Validate the cheap inputs first so a bad request never pays for
        # reading the PDF or loading the model.
        if file is None:
            raise ValueError("No file provided.")
        stripped = (part.strip() for part in (questions or "").split(","))
        question_list = [question for question in stripped if question]
        if not question_list:
            raise ValueError("No questions provided.")

        # Accept both an upload object carrying ``.name`` and a bare path.
        pdf_path = getattr(file, "name", file)

        # Read the PDF content directly
        print("Reading PDF content...")
        pdf_content = read_pdf(pdf_path)
        print(f"PDF Content: {pdf_content[:500]}...")  # Print first 500 characters for verification
        if not pdf_content.strip():
            raise ValueError("No text could be extracted from the PDF.")

        # Get the Hugging Face token from environment variables
        hf_token = os.getenv("HF_TOKEN")
        if not hf_token:
            raise ValueError("Hugging Face token not found in environment variables.")

        # Initialize the Hugging Face pipeline
        # NOTE(review): the pipeline is rebuilt on every call; consider
        # caching it if request latency matters.
        # NOTE(review): confirm this checkpoint is suitable for the
        # "question-answering" task, and whether the installed transformers
        # version expects `token` instead of `use_auth_token`.
        print("Initializing the Hugging Face pipeline...")
        qa_pipeline = pipeline(
            "question-answering",
            model="mistralai/Mixtral-8x7B-Instruct-v0.1",
            use_auth_token=hf_token,
        )

        answers = {}
        for question in question_list:
            print(f"Asking question: {question}")
            result = qa_pipeline(question=question, context=pdf_content)
            # Key by the stripped question so result keys match what was asked.
            answers[question] = result['answer']
            print(f"Answer: {result['answer']}")
        return answers
    except Exception as e:
        # UI boundary: report the failure as data instead of crashing the app.
        print(f"Error: {e}")
        return {"error": str(e)}
# Gradio interface
def gradio_interface(file, questions):
    """Gradio callback: hand the upload and questions to ``process_invoice``.

    Returns whatever ``process_invoice`` returns, unchanged.
    """
    return process_invoice(file, questions)
# Web UI: one PDF upload plus a free-text box of questions in, JSON out.
interface = gr.Interface(
    title="Invoice Data Extraction",
    description="Upload an invoice PDF and get the extracted data based on your questions.",
    fn=gradio_interface,
    inputs=[
        # Positional order matches gradio_interface(file, questions).
        gr.File(label="Upload Invoice (PDF)", file_count="single"),
        gr.Textbox(placeholder="Enter your questions separated by commas", lines=5),
    ],
    outputs="json",
)

# Start the server only when the file is run as a script, not on import.
if __name__ == "__main__":
    interface.launch()
|