sinan7 committed
Commit 94a35f8 · verified · 1 Parent(s): 176cd0a

Upload main.py

Files changed (1)
  1. main.py +27 -21
main.py CHANGED
@@ -1,10 +1,7 @@
 from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
 from pydantic import BaseModel
-import fitz # PyMuPDF for PDF handling
-import tempfile
-import os
-import json
-import logging
+import fitz # PyMuPDF
+import tempfile, os, json, logging
 from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
 
 # Initialize FastAPI app and logging
@@ -14,10 +11,10 @@ logger = logging.getLogger(__name__)
 
 # Load model and tokenizer
 model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
-tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
-qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=-1) # Use CPU
+tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base", legacy=False)
+qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=-1) # CPU-based inference
 
-# Define response model for structured data
+# Define the structured response model
 class Education(BaseModel):
     degree: str
     university: str
@@ -33,18 +30,18 @@ class ExtractedInfo(BaseModel):
     good_conduct_certificate: str
 
 def extract_text_from_pdf(pdf_path: str) -> str:
-    """Extracts text from a PDF."""
+    """Extracts text from the uploaded PDF."""
     with fitz.open(pdf_path) as doc:
         return "".join(page.get_text() for page in doc).strip()
 
 def chunk_text(text: str, max_tokens: int = 512) -> list:
-    """Breaks text into manageable chunks within token limits."""
+    """Splits the text into manageable chunks that fit within the token limit."""
     tokens = tokenizer.encode(text, add_special_tokens=False)
     return [tokenizer.decode(tokens[i:i + max_tokens], skip_special_tokens=True)
             for i in range(0, len(tokens), max_tokens)]
 
 def process_chunk(chunk: str) -> dict:
-    """Uses the model to generate structured JSON data from a text chunk."""
+    """Uses the model to extract structured JSON data from a text chunk."""
     prompt = f"""
     Extract the following information in JSON format:
     {{
@@ -62,12 +59,16 @@ def process_chunk(chunk: str) -> dict:
     }}
     Resume text: {chunk}
     """
-    response = qa_pipeline(prompt, max_new_tokens=150)
-    generated_text = response[0].get("generated_text", "")
+
     try:
-        return json.loads(generated_text[generated_text.find("{"):generated_text.rfind("}") + 1])
-    except json.JSONDecodeError as e:
-        logger.warning(f"JSON parsing failed: {e}")
+        response = qa_pipeline(prompt, max_new_tokens=150)
+        generated_text = response[0]["generated_text"]
+        # Extract JSON from the generated text
+        json_start = generated_text.find("{")
+        json_end = generated_text.rfind("}") + 1
+        return json.loads(generated_text[json_start:json_end])
+    except (json.JSONDecodeError, IndexError) as e:
+        logger.warning(f"Failed to parse JSON: {e}")
         return {}
 
 def merge_outputs(chunks: list) -> dict:
@@ -81,31 +82,36 @@ def merge_outputs(chunks: list) -> dict:
         "hse_description": "",
         "good_conduct_certificate": ""
     }
+
     for chunk in chunks:
-        for key, value in chunk.items():
+        chunk_output = process_chunk(chunk)
+        for key, value in chunk_output.items():
             if isinstance(value, dict):
                 merged[key].update(value)
             elif not merged[key]:
                 merged[key] = value
+
     return merged
 
 @app.post("/process_cv/", response_model=ExtractedInfo)
 async def process_cv(file: UploadFile = File(...), background_tasks: BackgroundTasks = BackgroundTasks()):
-    """Processes a PDF resume and extracts structured information."""
+    """Processes a PDF resume and returns structured information in JSON format."""
     if not file.filename.endswith(".pdf"):
-        raise HTTPException(status_code=400, detail="Only PDF files are allowed")
+        raise HTTPException(status_code=400, detail="Only PDF files are allowed.")
 
     temp_path = tempfile.mktemp(suffix=".pdf")
     with open(temp_path, "wb") as f:
         f.write(await file.read())
 
     try:
+        # Extract text from the uploaded PDF
        text = extract_text_from_pdf(temp_path)
         if not text:
-            raise HTTPException(status_code=400, detail="No extractable text found in PDF.")
+            raise HTTPException(status_code=400, detail="No extractable text found in the PDF.")
 
+        # Process the text in chunks and merge the output
         chunks = chunk_text(text)
-        structured_data = merge_outputs([process_chunk(chunk) for chunk in chunks])
+        structured_data = merge_outputs(chunks)
         return ExtractedInfo(**structured_data)
     finally:
         os.remove(temp_path)
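
For quick verification of the updated endpoint, here is a minimal client sketch. It assumes the app is served locally (for example with uvicorn main:app --port 8000); the host, port, and the sample file name resume.pdf are illustrative assumptions, not part of this commit.

# Minimal client sketch for POST /process_cv/ (assumed local deployment).
# "resume.pdf", the host, and the port are hypothetical placeholders.
import requests

with open("resume.pdf", "rb") as f:
    response = requests.post(
        "http://localhost:8000/process_cv/",
        files={"file": ("resume.pdf", f, "application/pdf")},
    )

response.raise_for_status()
print(response.json())  # Validated against the ExtractedInfo response model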
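One caveat the commit leaves in place: tempfile.mktemp only generates a path, which the standard library documents as insecure because another process can create the file between name generation and use. A possible alternative, not part of this commit, is tempfile.NamedTemporaryFile with delete=False; the helper name save_upload_to_temp and the pdf_bytes argument below are illustrative.

# Sketch: race-free temp-file handling as an alternative to tempfile.mktemp.
# Not part of this commit; save_upload_to_temp and pdf_bytes are illustrative.
import os
import tempfile

def save_upload_to_temp(pdf_bytes: bytes) -> str:
    """Writes uploaded bytes to a uniquely named temp file and returns its path."""
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(pdf_bytes)
        return tmp.name

NamedTemporaryFile creates and opens the file in one step, so the path cannot be claimed by another process first; the caller still removes the file in a finally block, as the endpoint already does.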