import io
import os
import re
import tempfile

import PyPDF2
import pytesseract
import streamlit as st
from fpdf import FPDF
from pdf2image import convert_from_path
from transformers import pipeline

# Load pre-trained model for question-answering.
# Streamlit re-executes this script from the top on every widget interaction,
# so the loader is wrapped in st.cache_resource to build the model only once
# per server process instead of on every rerun.
# NOTE(review): st.cache_resource needs Streamlit >= 1.18 -- confirm the
# deployed version.
@st.cache_resource
def _load_qa_pipeline():
    """Build the DistilBERT (SQuAD-distilled) question-answering pipeline."""
    return pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")


qa_pipeline = _load_qa_pipeline()

# Extract text from PDF (text-based and image-based)
def extract_text_from_pdf(pdf_path):
    """Return the embedded text of every page of a PDF, concatenated.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The text of all pages joined in page order with no separator. The
        string is empty when no page yields any text.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction stays inside the `with` block so the file handle is
        # still open while pages are being read.
        # `or ''` is a defensive guard: a page that yields None instead of a
        # string would otherwise make the join raise TypeError.
        return ''.join(page.extract_text() or '' for page in reader.pages)

def extract_text_from_image_pdf(pdf_path):
    """Run OCR over a PDF and return the recognized text of all pages.

    The PDF at ``pdf_path`` is converted to images with
    ``convert_from_path``; each image is passed to
    ``pytesseract.image_to_string`` and the results are concatenated in
    order with no separator.
    """
    page_images = convert_from_path(pdf_path)
    recognized_parts = (
        pytesseract.image_to_string(page_image) for page_image in page_images
    )
    return ''.join(recognized_parts)

# Process the extracted text
def preprocess_text(text):
    """Collapse every run of whitespace to one space and trim the ends.

    Args:
        text: The raw string to normalize.

    Returns:
        ``text`` with each whitespace run (spaces, tabs, newlines, ...)
        replaced by a single space and no leading or trailing whitespace.
    """
    # `\s` already matches newlines, so one substitution covers what the
    # former separate newline pass was meant to handle.
    return re.sub(r'\s+', ' ', text).strip()

# Grading function using the question-answering model
def grade_answer(question, student_answer, *, threshold=0.5):
    """Grade an answer using the question-answering pipeline's score.

    The pipeline is called with ``question`` as the question and
    ``student_answer`` as the context; the ``score`` field of its result is
    compared against ``threshold``.

    Args:
        question: The question text.
        student_answer: The text in which the pipeline looks for an answer.
        threshold: Scores strictly greater than this value are labelled
            "Correct". Defaults to 0.5.

    Returns:
        A ``(score, feedback)`` tuple where feedback is "Correct" or
        "Incorrect". Blank or missing input yields ``(0.0, "Incorrect")``.
    """
    # The question-answering pipeline rejects empty question/context input,
    # so treat blank input as an unanswered question instead of crashing.
    if not (question and question.strip()):
        return 0.0, "Incorrect"
    if not (student_answer and student_answer.strip()):
        return 0.0, "Incorrect"

    result = qa_pipeline(question=question, context=student_answer)
    answer_score = result['score']
    # NOTE(review): presumably this score is the model's confidence in the
    # extracted span rather than a measure of factual correctness -- confirm
    # this is the intended grading signal.
    feedback = "Correct" if answer_score > threshold else "Incorrect"
    return answer_score, feedback

# Function to extract student name from text
def extract_student_name(text):
    """Find the student's name following a ``Name`` label in the text.

    The label may be followed by an optional ``:``, ``|`` or ``-`` separator.
    The name is taken from the same line as the label only, so a following
    line such as "Roll No: 12" is never absorbed into it.

    Args:
        text: Raw extracted document text, with line breaks intact.

    Returns:
        The name with surrounding whitespace removed, or "Unknown Student"
        when no label followed by a name is found.
    """
    if not text:
        return "Unknown Student"
    # [ \t] instead of \s keeps the match on one line; the leading \w
    # rejects a label followed only by blanks, so an empty name is never
    # returned.
    match = re.search(r"Name[ \t]*[:|-]?[ \t]*(\w[\w \t]*)", text)
    if match:
        return match.group(1).strip()
    return "Unknown Student"

# Function to extract questions from the text
def extract_questions_from_text(text):
    """Extract candidate questions from raw document text.

    Two kinds of candidates are collected, each confined to a single line:
    text introduced by a ``Question`` label (optionally followed by ``:``,
    ``|`` or ``-``), and any fragment ending in ``?``.

    Candidates are stripped, de-duplicated in first-seen order (labelled
    candidates first), and dropped when they contain one of the metadata
    keywords 'name', 'roll no' or 'school'.

    Args:
        text: Raw extracted document text, with line breaks intact.

    Returns:
        A list of unique question strings in a deterministic order; empty
        when nothing is found.
    """
    if not text:
        return []

    # Both patterns exclude '\n' so a candidate never spans lines; otherwise
    # header lines get glued to the first question and the keyword filter
    # below discards the whole thing.
    labelled = re.findall(r'Question[ \t]*[:|-]?[ \t]*\w[\w \t?]*', text)
    # Excluding '?' from the run makes each question mark end its own match
    # rather than merging several questions into one.
    marked = re.findall(r'[^.?\n]*\?', text)

    # NOTE(review): substring matching also drops genuine questions that
    # merely contain these words (e.g. "What is your name?") -- kept as in
    # the original; confirm this is intended.
    metadata_keywords = ('name', 'roll no', 'school')

    unique = {}  # a dict keeps first-seen order, unlike a set
    for candidate in labelled + marked:
        question = candidate.strip()
        # Skip blanks and punctuation-only fragments such as a lone "?".
        if not any(ch.isalnum() for ch in question):
            continue
        lowered = question.lower()
        if any(keyword in lowered for keyword in metadata_keywords):
            continue
        unique.setdefault(question, None)
    return list(unique)

# Streamlit Interface
st.title('Student Answer Grading System')
st.write('Upload a PDF containing student details and their answers.')

# Upload file
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
    # The extraction helpers take a filesystem path, so the upload has to be
    # written to disk. A uniquely named temp file is used (rather than a
    # fixed name in the working directory) so concurrent sessions cannot
    # overwrite each other's upload.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
        tmp_file.write(uploaded_file.getbuffer())
        pdf_path = tmp_file.name

    try:
        # Extract embedded text first; fall back to OCR when the PDF yields
        # nothing but whitespace.
        text = extract_text_from_pdf(pdf_path)
        if not text.strip():
            text = extract_text_from_image_pdf(pdf_path)
    finally:
        # Always remove the temp file, even if extraction raised.
        os.remove(pdf_path)

    # Show the extracted text so it can be checked manually
    st.subheader("Extracted Text:")
    st.text(text)

    # Whitespace-normalized copy used as the grading context
    preprocessed_text = preprocess_text(text)

    # Name and questions are taken from the raw text (line breaks intact)
    student_name = extract_student_name(text)
    questions = extract_questions_from_text(text)

    # Display student name
    st.subheader(f"Student Name: {student_name}")

    if not questions:
        # Make the empty case visible instead of rendering nothing
        st.info("No questions were found in the uploaded document.")

    # Grade each question against the whole preprocessed document
    results = {}
    for question in questions:
        score, feedback = grade_answer(question, preprocessed_text)
        results[question] = {"score": score, "feedback": feedback}

    # Display results
    for question, result in results.items():
        st.write(f"**Question**: {question}")
        st.write(f"**Score**: {result['score']:.2f}")
        st.write(f"**Feedback**: {result['feedback']}")
        st.write("---")