Spaces: Runtime error

Upload main.py

main.py CHANGED
@@ -1,10 +1,7 @@
 from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
 from pydantic import BaseModel
-import fitz # PyMuPDF
-import tempfile
-import os
-import json
-import logging
+import fitz # PyMuPDF
+import tempfile, os, json, logging
 from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
 
 # Initialize FastAPI app and logging
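Note on the import hunk: fitz is the import name of the PyMuPDF distribution, the slow T5 tokenizer needs SentencePiece, the pipeline needs a backend such as torch, and FastAPI's UploadFile needs python-multipart. A plausible requirements.txt for these imports (package set inferred from the code, not taken from the repo):

    fastapi
    python-multipart
    pymupdf
    transformers
    torch
    sentencepiece

A missing entry here is a common cause of the "Runtime error" status shown above.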
@@ -14,10 +11,10 @@ logger = logging.getLogger(__name__)
 
 # Load model and tokenizer
 model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
-tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
-qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=-1) #
+tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base", legacy=False)
+qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=-1) # CPU-based inference
 
-# Define response model
+# Define the structured response model
 class Education(BaseModel):
     degree: str
     university: str
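The functional change in this hunk is legacy=False, which opts the T5 tokenizer into the corrected SentencePiece handling and silences the legacy-behaviour warning that recent transformers versions emit at load time; device=-1 pins the pipeline to CPU. A minimal smoke test of the same setup (a sketch only; the prompt is illustrative):

    from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration

    model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
    tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base", legacy=False)
    qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=-1)

    # a short generation confirms that model, tokenizer, and CPU device wire up
    print(qa_pipeline("Answer yes or no: is the sky blue?", max_new_tokens=10)[0]["generated_text"])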
@@ -33,18 +30,18 @@ class ExtractedInfo(BaseModel):
     good_conduct_certificate: str
 
 def extract_text_from_pdf(pdf_path: str) -> str:
-    """Extracts text from
+    """Extracts text from the uploaded PDF."""
     with fitz.open(pdf_path) as doc:
         return "".join(page.get_text() for page in doc).strip()
 
 def chunk_text(text: str, max_tokens: int = 512) -> list:
-    """
+    """Splits the text into manageable chunks that fit within the token limit."""
     tokens = tokenizer.encode(text, add_special_tokens=False)
     return [tokenizer.decode(tokens[i:i + max_tokens], skip_special_tokens=True)
             for i in range(0, len(tokens), max_tokens)]
 
 def process_chunk(chunk: str) -> dict:
-    """Uses the model to
+    """Uses the model to extract structured JSON data from a text chunk."""
     prompt = f"""
     Extract the following information in JSON format:
     {{
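chunk_text encodes the whole text once and decodes fixed-size token windows, so every chunk fits the 512-token budget by construction. A standalone sketch of the same technique (input string illustrative; decode/re-encode round-trips can differ slightly at chunk boundaries):

    from transformers import T5Tokenizer

    tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base", legacy=False)

    def chunk_text(text: str, max_tokens: int = 512) -> list:
        tokens = tokenizer.encode(text, add_special_tokens=False)
        return [tokenizer.decode(tokens[i:i + max_tokens], skip_special_tokens=True)
                for i in range(0, len(tokens), max_tokens)]

    chunks = chunk_text("experience in HSE compliance and safety audits " * 200)
    print([len(tokenizer.encode(c, add_special_tokens=False)) for c in chunks])
    # roughly: several 512s followed by one shorter final chunk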
@@ -62,12 +59,16 @@ def process_chunk(chunk: str) -> dict:
     }}
     Resume text: {chunk}
     """
-
-    generated_text = response[0].get("generated_text", "")
+
     try:
-
-
-
+        response = qa_pipeline(prompt, max_new_tokens=150)
+        generated_text = response[0]["generated_text"]
+        # Extract JSON from the generated text
+        json_start = generated_text.find("{")
+        json_end = generated_text.rfind("}") + 1
+        return json.loads(generated_text[json_start:json_end])
+    except (json.JSONDecodeError, IndexError) as e:
+        logger.warning(f"Failed to parse JSON: {e}")
     return {}
 
 def merge_outputs(chunks: list) -> dict:
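The rewritten parsing no longer assumes the model emits bare JSON: it slices from the first "{" to the last "}" before calling json.loads, and logs instead of crashing when parsing fails. The same salvage logic in isolation (salvage_json is a hypothetical name; the example strings are illustrative):

    import json, logging

    logger = logging.getLogger(__name__)

    def salvage_json(generated_text: str) -> dict:
        try:
            # take everything from the first "{" to the last "}" and parse it
            json_start = generated_text.find("{")
            json_end = generated_text.rfind("}") + 1
            return json.loads(generated_text[json_start:json_end])
        except (json.JSONDecodeError, IndexError) as e:
            logger.warning(f"Failed to parse JSON: {e}")
            return {}

    print(salvage_json('Sure! {"degree": "BSc"} Hope this helps.'))  # {'degree': 'BSc'}
    print(salvage_json("no braces here"))                            # {} (warning logged)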
@@ -81,31 +82,36 @@ def merge_outputs(chunks: list) -> dict:
         "hse_description": "",
         "good_conduct_certificate": ""
     }
+
     for chunk in chunks:
-
+        chunk_output = process_chunk(chunk)
+        for key, value in chunk_output.items():
             if isinstance(value, dict):
                 merged[key].update(value)
             elif not merged[key]:
                 merged[key] = value
+
     return merged
 
 @app.post("/process_cv/", response_model=ExtractedInfo)
 async def process_cv(file: UploadFile = File(...), background_tasks: BackgroundTasks = BackgroundTasks()):
-    """Processes a PDF resume and
+    """Processes a PDF resume and returns structured information in JSON format."""
     if not file.filename.endswith(".pdf"):
-        raise HTTPException(status_code=400, detail="Only PDF files are allowed")
+        raise HTTPException(status_code=400, detail="Only PDF files are allowed.")
 
     temp_path = tempfile.mktemp(suffix=".pdf")
     with open(temp_path, "wb") as f:
         f.write(await file.read())
 
     try:
+        # Extract text from the uploaded PDF
         text = extract_text_from_pdf(temp_path)
         if not text:
-            raise HTTPException(status_code=400, detail="No extractable text found in PDF.")
+            raise HTTPException(status_code=400, detail="No extractable text found in the PDF.")
 
+        # Process the text in chunks and merge the output
         chunks = chunk_text(text)
-        structured_data = merge_outputs(
+        structured_data = merge_outputs(chunks)
         return ExtractedInfo(**structured_data)
     finally:
         os.remove(temp_path)
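One caveat this commit leaves in place: tempfile.mktemp is deprecated because another process can claim the returned path before it is opened. A sketch of a race-free replacement for the three temp-file lines inside process_cv (same cleanup contract, so the existing finally: os.remove(temp_path) still applies):

    # inside the async endpoint; NamedTemporaryFile creates the file atomically,
    # and delete=False hands cleanup back to the finally block
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(await file.read())
        temp_path = tmp.name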
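To exercise the fixed endpoint locally, something like the following should work (a sketch: assumes the file is importable as main and that a sample.pdf exists next to it):

    from fastapi.testclient import TestClient
    from main import app

    client = TestClient(app)
    with open("sample.pdf", "rb") as f:
        response = client.post("/process_cv/",
                               files={"file": ("sample.pdf", f, "application/pdf")})
    print(response.status_code)
    print(response.json())  # the ExtractedInfo fields as JSON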