andreeabodea
/

Extraction

Model card Files Files and versions Community

andreeabodea committed on Apr 11, 2024

Commit

e993c2b

·

verified ·

1 Parent(s): c95fb59

Create app.py

Files changed (1) hide show

app.py +50 -0

app.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import gradio as gr
+import pdfplumber
+from transformers import pipeline
+from io import BytesIO
+import re
+# Build the question-answering pipeline a single time at import so that every
+# call to answer_questions() reuses the same pipeline object.
+qa_pipeline = pipeline(
+    task="question-answering",
+    model="deepset/gelectra-large-germanquad",
+)
+def extract_text_from_pdf(file_obj):
+    """Return the text of every page of a PDF, joined by single spaces.
+
+    ``file_obj`` is passed unchanged to ``pdfplumber.open``. Pages for which
+    ``extract_text()`` yields nothing (``None`` or an empty string) are skipped.
+    """
+    with pdfplumber.open(file_obj) as document:
+        extracted = (page.extract_text() for page in document.pages)
+        # The generator is lazy, so it must be consumed while the PDF is open.
+        return " ".join(chunk for chunk in extracted if chunk)
+def answer_questions(context):
+    """Answer the fixed set of German metadata questions from ``context``.
+
+    Args:
+        context: Plain text to search for answers (here, the extracted PDF text).
+
+    Returns:
+        A dict mapping each question string to the ``'answer'`` value returned
+        by ``qa_pipeline``. When ``context`` is empty or whitespace-only, every
+        question maps to an empty string and the model is not called.
+    """
+    questions = [
+        "Welches ist das Titel des Moduls?",
+        "Welches ist das Sektor oder das Kernthema?",
+        "Welches ist das Land?",
+        "Zu welchem Program oder EZ-Programm gehört das Projekt?"
+    ]
+    # The question-answering pipeline rejects an empty context (e.g. a scanned
+    # PDF without a text layer), so short-circuit instead of raising.
+    if not context or not context.strip():
+        return {q: "" for q in questions}
+    return {q: qa_pipeline(question=q, context=context)["answer"] for q in questions}
+def process_pdf(file):
+    """Extract the text of an uploaded PDF and answer the predefined questions.
+
+    Args:
+        file: The upload as delivered by the Gradio file input. Depending on
+            the Gradio version and the component's ``type`` this may be a
+            filesystem path (``str``), raw ``bytes``, or a (temporary) file
+            object; all of these are accepted.
+
+    Returns:
+        One ``"question: answer"`` line per question, or a short message when
+        nothing was uploaded or the PDF has no extractable text.
+    """
+    import os  # local import: only needed to check for an on-disk path
+
+    if file is None:
+        return "No PDF file was uploaded."
+    if isinstance(file, (bytes, bytearray)):
+        source = BytesIO(file)
+    elif isinstance(file, str):
+        # pdfplumber.open accepts a path directly.
+        source = file
+    elif isinstance(getattr(file, "name", None), str) and os.path.isfile(file.name):
+        # NOTE(review): some Gradio versions appear to pass a temp-file wrapper
+        # whose ``name`` is the uploaded file's path; prefer that path when it
+        # exists on disk. Confirm against the deployed Gradio version.
+        source = file.name
+    else:
+        # Generic file-like object: read it fully, as the original code did.
+        with file as handle:
+            source = BytesIO(handle.read())
+    text = extract_text_from_pdf(source)
+    if not text.strip():
+        return "No extractable text was found in the PDF."
+    results = answer_questions(text)
+    return "\n".join(f"{q}: {a}" for q, a in results.items())
+# Define the Gradio interface.
+# Uses the top-level gr.File / gr.Textbox components: the gr.inputs / gr.outputs
+# namespaces are deprecated, and "pdf" is not a valid value for File's `type`
+# argument. The upload is restricted to PDFs through `file_types` instead.
+iface = gr.Interface(
+    fn=process_pdf,
+    inputs=gr.File(file_types=[".pdf"], label="Upload your PDF file"),
+    outputs=gr.Textbox(label="Extracted Information and Answers"),
+    title="PDF Text Extractor and Question Answerer",
+    description="Upload a PDF file to extract text and answer predefined questions based on the content."
+)
+# Start the web app only when this file is run as a script.
+if __name__ == "__main__":
+    iface.launch()