import os
import logging
import tempfile
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
import gradio as gr
import fitz  # PyMuPDF
import requests
from PIL import Image
import pytesseract
import langid
from deep_translator import GoogleTranslator
import torch

logging.basicConfig(level=logging.INFO)

# Run on the first GPU if available, otherwise fall back to CPU
device = 0 if torch.cuda.is_available() else -1

# Initialize multilingual QA pipeline
model_name = "mrm8488/bert-multi-cased-finetuned-xquadv1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer, device=device)

INDIAN_LANGUAGES = {
    'hi': 'Hindi',
    'pa': 'Punjabi',
    'bn': 'Bengali',
    'gu': 'Gujarati',
    'mr': 'Marathi',
    'ta': 'Tamil',
    'te': 'Telugu',
    'kn': 'Kannada',
    'ml': 'Malayalam',
    'en': 'English'
}

def download_pdf_from_url(url):
    try:
        # Fail early on HTTP errors instead of writing an error page to disk as a ".pdf"
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
            temp_pdf.write(response.content)
            return temp_pdf.name
    except Exception as e:
        logging.error(f"Error downloading PDF: {e}")
        return None

def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        doc = fitz.open(pdf_path)
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text += page.get_text("text") or ""

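        # No selectable text layer (e.g. a scanned PDF), so fall back to OCR on rendered page images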
        if not text.strip():
            images = []
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                pix = page.get_pixmap()
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                images.append(img)

            for image in images:
                ocr_text = pytesseract.image_to_string(
                    image,
                    lang='+'.join(['eng', 'hin', 'pan', 'ben', 'guj', 'mar', 'tam', 'tel', 'kan', 'mal'])
                )
                text += ocr_text
    except Exception as e:
        logging.error(f"Error extracting text: {e}")
    return text

def detect_language(text):
    if not text.strip():
        return 'en'
    try:
        lang_code, _ = langid.classify(text)
        if lang_code in INDIAN_LANGUAGES:
            return lang_code
        else:
            return 'en'
    except Exception as e:
        logging.error(f"Language detection error: {e}")
        return 'en'

def process_qa(question, context, output_lang):
    try:
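        # The extractive QA model returns a verbatim span from the context, so the answer is in the document's language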
        result = qa_pipeline(question=question, context=context)
        answer = result['answer']

        # Translate the answer into the requested output language (auto-detect the source,
        # since the extracted span may not be in English)
        if output_lang != 'en':
            answer = GoogleTranslator(source='auto', target=output_lang).translate(answer)

        return answer
    except Exception as e:
        logging.error(f"QA processing error: {e}")
        return str(e)

def analyze_input(input_source, question, output_lang):
    try:
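        # input_source may be a PDF URL (string) or a file object uploaded through the Gradio UI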
        if isinstance(input_source, str) and input_source.startswith(('http://', 'https://')):
            pdf_path = download_pdf_from_url(input_source)
        elif isinstance(input_source, str):
            pdf_path = input_source  # already a local file path
        else:
            pdf_path = input_source.name  # uploaded file object from gr.File

        if not pdf_path:
            return "Error: Invalid input source"

        text = extract_text_from_pdf(pdf_path)
        if not text.strip():
            return "No text extracted from document"

        question_lang = detect_language(question)
        logging.info(f"Detected question language: {question_lang}")

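        # Split long documents into ~1000-character chunks so each QA call stays within the model's input limit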
        chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
        answers = [process_qa(question, chunk, output_lang) for chunk in chunks if chunk.strip()]

        final_answer = " ".join(filter(None, answers))
        return f"Answer ({INDIAN_LANGUAGES.get(output_lang, 'English')}): {final_answer}"

    except Exception as e:
        logging.error(f"Analysis error: {e}")
        return f"Error: {str(e)}"

# Gradio Interface
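# Note: gr.File only accepts uploaded files; the URL branch in analyze_input applies when
# the function is called programmatically with a PDF URL string.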
def create_interface():
    output_lang_list = list(INDIAN_LANGUAGES.keys())
    return gr.Interface(
        fn=analyze_input,
        inputs=[
            gr.File(label="Upload PDF"),
            gr.Textbox(label="Enter your question"),
            gr.Dropdown(choices=output_lang_list, label="Select Output Language", value='en')
        ],
        outputs="text",
        title="Indian Languages PDF QA System",
        description="Support for Hindi, Punjabi, Bengali, Gujarati, Marathi, Tamil, Telugu, Kannada, Malayalam, and English"
    )

if __name__ == "__main__":
    interface = create_interface()
    interface.launch()