"""Gradio app that answers comma-separated questions about an uploaded invoice PDF."""

import os
from functools import lru_cache
from pprint import pprint

import gradio as gr
from PyPDF2 import PdfReader
from transformers import pipeline

# NOTE(review): this is an instruction-tuned generative checkpoint, while the
# pipeline below is the extractive "question-answering" task -- confirm the
# pairing is intended before relying on the answers.
QA_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"


def read_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    Args:
        pdf_path: Path (or file-like object) accepted by ``PdfReader``.

    Returns:
        A single string with the extracted text of all pages, in page order.
    """
    reader = PdfReader(pdf_path)
    # ``or ""`` guards against pages for which no text could be extracted;
    # ``join`` avoids repeated string reallocation on large documents.
    return "".join(page.extract_text() or "" for page in reader.pages)


@lru_cache(maxsize=1)
def _get_qa_pipeline(hf_token):
    """Build the question-answering pipeline once per token and reuse it."""
    print("Initializing the Hugging Face pipeline...")
    # NOTE(review): newer transformers releases prefer ``token=`` over
    # ``use_auth_token=`` -- confirm against the pinned version.
    return pipeline("question-answering", model=QA_MODEL, use_auth_token=hf_token)


def _split_questions(questions):
    """Split a comma-separated string into non-empty, stripped questions."""
    return [part.strip() for part in (questions or "").split(",") if part.strip()]


def process_invoice(file, questions):
    """Answer each question in *questions* using the text of the uploaded PDF.

    Args:
        file: The upload handed over by Gradio; either an object exposing a
            ``name`` attribute holding the file path, or the path itself.
        questions: Comma-separated questions. Blank entries are ignored.

    Returns:
        A dict mapping each (stripped) question to the pipeline's answer, or
        ``{"error": message}`` if anything goes wrong. Errors are reported in
        the result rather than raised so the UI can display them.
    """
    try:
        if file is None:
            raise ValueError("No PDF file was uploaded.")
        # Accept both a temp-file wrapper (``.name``) and a plain path string.
        pdf_path = getattr(file, "name", file)

        question_list = _split_questions(questions)
        if not question_list:
            raise ValueError("No questions were provided.")

        print("Reading PDF content...")
        pdf_content = read_pdf(pdf_path)
        # Print first 500 characters for verification
        print(f"PDF Content: {pdf_content[:500]}...")
        if not pdf_content.strip():
            raise ValueError("No extractable text was found in the PDF.")

        hf_token = os.getenv("HF_TOKEN")
        if not hf_token:
            raise ValueError("Hugging Face token not found in environment variables.")

        qa_pipeline = _get_qa_pipeline(hf_token)

        answers = {}
        for question in question_list:
            print(f"Asking question: {question}")
            result = qa_pipeline(question=question, context=pdf_content)
            # Key by the stripped question so the output has no stray whitespace.
            answers[question] = result["answer"]
            print(f"Answer: {result['answer']}")
        return answers
    except Exception as e:  # UI boundary: surface the failure instead of crashing
        print(f"Error: {e}")
        return {"error": str(e)}


def gradio_interface(file, questions):
    """Gradio callback: delegate to :func:`process_invoice` and return its dict."""
    return process_invoice(file, questions)


interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(file_count="single", label="Upload Invoice (PDF)"),
        gr.Textbox(lines=5, placeholder="Enter your questions separated by commas"),
    ],
    outputs="json",
    title="Invoice Data Extraction",
    description="Upload an invoice PDF and get the extracted data based on your questions.",
)

if __name__ == "__main__":
    interface.launch()