"""Gradio app that answers user-supplied questions about an uploaded invoice PDF.

The PDF text is extracted with PyPDF2 and passed as context to a Hugging Face
``question-answering`` pipeline; the answers are returned as JSON.
"""

import os
from getpass import getpass
from pprint import pprint

import gradio as gr
from PyPDF2 import PdfReader
from transformers import pipeline

# NOTE(review): this checkpoint is an instruction-tuned generative model;
# confirm that it actually loads under the "question-answering" task.
QA_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"


def read_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, concatenated.

    Args:
        pdf_path: Path (or other source accepted by ``PdfReader``) of the PDF.

    Returns:
        A single string with the extracted text of all pages in order.
    """
    reader = PdfReader(pdf_path)
    # ``or ""`` keeps the join safe should a page yield no text at all.
    return "".join(page.extract_text() or "" for page in reader.pages)


def process_invoice(file, hf_token, questions):
    """Answer each comma-separated question against the text of an invoice PDF.

    Args:
        file: The uploaded PDF, either as a path string or as an object whose
            ``name`` attribute holds the path (both shapes are accepted).
        hf_token: Hugging Face access token forwarded to ``pipeline``; a blank
            value is forwarded as ``None``.
        questions: Questions separated by commas. Surrounding whitespace is
            stripped and empty entries are ignored.

    Returns:
        A dict mapping each (stripped) question to the pipeline's ``answer``.

    Raises:
        ValueError: If no file was provided.
    """
    if file is None:
        raise ValueError("No PDF file was uploaded.")

    # Accept both a tempfile-like wrapper (``.name``) and a plain path string.
    pdf_path = getattr(file, "name", file)
    pdf_content = read_pdf(pdf_path)

    # Built once per request and reused for every question below.
    qa_pipeline = pipeline(
        "question-answering",
        model=QA_MODEL,
        token=hf_token or None,
    )

    answers = {}
    for raw_question in (questions or "").split(","):
        question = raw_question.strip()
        if not question:
            # Trailing or doubled commas produce empty fragments; skip them.
            continue
        result = qa_pipeline(question=question, context=pdf_content)
        # Key by the stripped question so the output has no stray whitespace.
        answers[question] = result["answer"]
    return answers


def gradio_interface(file, hf_token, questions):
    """Gradio callback: delegate to :func:`process_invoice` and return its dict."""
    return process_invoice(file, hf_token, questions)


# Components come from the top-level ``gr`` namespace instead of the
# deprecated ``gr.inputs``. The File component's ``type`` is left at its
# default; ``process_invoice`` handles both a path and a file-like object.
interface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(file_count="single", label="Upload Invoice (PDF)"),
        gr.Textbox(type="password", label="Enter your Hugging Face Token"),
        gr.Textbox(
            lines=5,
            placeholder="Enter your questions separated by commas",
        ),
    ],
    outputs="json",
    title="Invoice Data Extraction",
    description=(
        "Upload an invoice PDF, provide your Hugging Face token, and get the "
        "extracted data based on your questions."
    ),
)

if __name__ == "__main__":
    interface.launch()