from pprint import pprint
from PyPDF2 import PdfReader
import gradio as gr
from transformers import pipeline
import os

# Function to read PDF file content directly
def read_pdf(pdf_path):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_path: Location of the PDF; passed straight to ``PdfReader``.

    Returns:
        The extracted text of all pages in page order, joined with newlines
        so that the last word of one page does not fuse with the first word
        of the next. Pages that yield no text contribute an empty string.
    """
    reader = PdfReader(pdf_path)
    # ``or ""`` keeps a page without extractable text (e.g. a scanned image)
    # from breaking the join if the extractor hands back None.
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(page_texts)

# Process and retrieve answers
def _resolve_upload_path(file):
    """Return the filesystem path of a Gradio upload, or None if there is none."""
    if file is None:
        return None
    # Depending on the Gradio version/config the File component hands over
    # either a plain path or a tempfile-like object exposing ``.name``.
    if isinstance(file, (str, os.PathLike)):
        return os.fspath(file)
    return getattr(file, "name", None)


def _parse_questions(questions):
    """Split a comma-separated string into stripped, non-empty questions."""
    if not questions:
        return []
    return [part.strip() for part in questions.split(",") if part.strip()]


def process_invoice(file, questions):
    """Answer comma-separated questions about an uploaded invoice PDF.

    Args:
        file: The upload from the Gradio ``File`` component: either a path
            string or an object whose ``name`` attribute is the path.
        questions: Questions separated by commas. Surrounding whitespace is
            ignored and empty entries are skipped.

    Returns:
        A dict mapping each (stripped) question to the answer text returned
        by the question-answering pipeline. On any failure the dict is
        ``{"error": "<message>"}`` instead; no exception escapes.
    """
    try:
        # Validate the cheap inputs first so a bad request never pays for
        # PDF parsing or model loading.
        pdf_path = _resolve_upload_path(file)
        if not pdf_path:
            raise ValueError("No PDF file was uploaded.")

        question_list = _parse_questions(questions)
        if not question_list:
            raise ValueError("No questions were provided.")

        # Read the PDF content directly
        print("Reading PDF content...")
        pdf_content = read_pdf(pdf_path)
        print(f"PDF Content: {pdf_content[:500]}...")  # Print first 500 characters for verification
        if not pdf_content.strip():
            raise ValueError("No text could be extracted from the PDF.")

        # Get the Hugging Face token from environment variables
        hf_token = os.getenv("HF_TOKEN")
        if not hf_token:
            raise ValueError("Hugging Face token not found in environment variables.")

        # Initialize the Hugging Face pipeline
        # NOTE(review): newer transformers releases prefer ``token=`` over
        # ``use_auth_token=`` -- confirm against the pinned version before
        # switching.
        print("Initializing the Hugging Face pipeline...")
        qa_pipeline = pipeline("question-answering", model="mistralai/Mixtral-8x7B-Instruct-v0.1", use_auth_token=hf_token)

        answers = {}
        for question in question_list:
            print(f"Asking question: {question}")
            result = qa_pipeline(question=question, context=pdf_content)
            # Key by the same stripped text that was asked, so callers do
            # not get keys with stray leading/trailing spaces.
            answers[question] = result['answer']
            print(f"Answer: {result['answer']}")

        return answers
    except Exception as e:
        # UI boundary: report the failure as data rather than crashing the app.
        print(f"Error: {e}")
        return {"error": str(e)}

# Gradio interface
def gradio_interface(file, questions):
    """Gradio callback: hand the upload and question text to ``process_invoice``.

    Args:
        file: The value supplied by the file-upload input.
        questions: The text entered in the questions box.

    Returns:
        The result of ``process_invoice(file, questions)``, unchanged.
    """
    return process_invoice(file, questions)

# Web UI wiring: one PDF upload plus a free-text box of comma-separated
# questions go in; the callback's result is rendered as JSON.
interface = gr.Interface(
    title="Invoice Data Extraction",
    description="Upload an invoice PDF and get the extracted data based on your questions.",
    fn=gradio_interface,
    inputs=[
        gr.File(label="Upload Invoice (PDF)", file_count="single"),
        gr.Textbox(placeholder="Enter your questions separated by commas", lines=5),
    ],
    outputs="json",
)

# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    interface.launch()