sinan7 committed
Commit 94a35f8 · verified · 1 Parent(s): 176cd0a

Upload main.py

Files changed (1)
  1. main.py +27 -21
main.py CHANGED
@@ -1,10 +1,7 @@
 from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
 from pydantic import BaseModel
-import fitz # PyMuPDF for PDF handling
-import tempfile
-import os
-import json
-import logging
+import fitz # PyMuPDF
+import tempfile, os, json, logging
 from transformers import pipeline, T5Tokenizer, T5ForConditionalGeneration
 
 # Initialize FastAPI app and logging
@@ -14,10 +11,10 @@ logger = logging.getLogger(__name__)
 
 # Load model and tokenizer
 model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
-tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
-qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=-1) # Use CPU
+tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base", legacy=False)
+qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=-1) # CPU-based inference
 
-# Define response model for structured data
+# Define the structured response model
 class Education(BaseModel):
     degree: str
     university: str
@@ -33,18 +30,18 @@ class ExtractedInfo(BaseModel):
     good_conduct_certificate: str
 
 def extract_text_from_pdf(pdf_path: str) -> str:
-    """Extracts text from a PDF."""
+    """Extracts text from the uploaded PDF."""
     with fitz.open(pdf_path) as doc:
         return "".join(page.get_text() for page in doc).strip()
 
 def chunk_text(text: str, max_tokens: int = 512) -> list:
-    """Breaks text into manageable chunks within token limits."""
+    """Splits the text into manageable chunks that fit within the token limit."""
     tokens = tokenizer.encode(text, add_special_tokens=False)
     return [tokenizer.decode(tokens[i:i + max_tokens], skip_special_tokens=True)
             for i in range(0, len(tokens), max_tokens)]
 
 def process_chunk(chunk: str) -> dict:
-    """Uses the model to generate structured JSON data from a text chunk."""
+    """Uses the model to extract structured JSON data from a text chunk."""
     prompt = f"""
     Extract the following information in JSON format:
     {{
@@ -62,12 +59,16 @@ def process_chunk(chunk: str) -> dict:
     }}
     Resume text: {chunk}
     """
-    response = qa_pipeline(prompt, max_new_tokens=150)
-    generated_text = response[0].get("generated_text", "")
+
     try:
-        return json.loads(generated_text[generated_text.find("{"):generated_text.rfind("}") + 1])
-    except json.JSONDecodeError as e:
-        logger.warning(f"JSON parsing failed: {e}")
+        response = qa_pipeline(prompt, max_new_tokens=150)
+        generated_text = response[0]["generated_text"]
+        # Extract JSON from the generated text
+        json_start = generated_text.find("{")
+        json_end = generated_text.rfind("}") + 1
+        return json.loads(generated_text[json_start:json_end])
+    except (json.JSONDecodeError, IndexError) as e:
+        logger.warning(f"Failed to parse JSON: {e}")
         return {}
 
 def merge_outputs(chunks: list) -> dict:
@@ -81,31 +82,36 @@ def merge_outputs(chunks: list) -> dict:
         "hse_description": "",
         "good_conduct_certificate": ""
     }
+
     for chunk in chunks:
-        for key, value in chunk.items():
+        chunk_output = process_chunk(chunk)
+        for key, value in chunk_output.items():
             if isinstance(value, dict):
                 merged[key].update(value)
             elif not merged[key]:
                 merged[key] = value
+
     return merged
 
 @app.post("/process_cv/", response_model=ExtractedInfo)
 async def process_cv(file: UploadFile = File(...), background_tasks: BackgroundTasks = BackgroundTasks()):
-    """Processes a PDF resume and extracts structured information."""
+    """Processes a PDF resume and returns structured information in JSON format."""
     if not file.filename.endswith(".pdf"):
-        raise HTTPException(status_code=400, detail="Only PDF files are allowed")
+        raise HTTPException(status_code=400, detail="Only PDF files are allowed.")
 
     temp_path = tempfile.mktemp(suffix=".pdf")
     with open(temp_path, "wb") as f:
         f.write(await file.read())
 
     try:
+        # Extract text from the uploaded PDF
        text = extract_text_from_pdf(temp_path)
         if not text:
-            raise HTTPException(status_code=400, detail="No extractable text found in PDF.")
+            raise HTTPException(status_code=400, detail="No extractable text found in the PDF.")
 
+        # Process the text in chunks and merge the output
         chunks = chunk_text(text)
-        structured_data = merge_outputs([process_chunk(chunk) for chunk in chunks])
+        structured_data = merge_outputs(chunks)
         return ExtractedInfo(**structured_data)
     finally:
         os.remove(temp_path)
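
For quick verification of the updated endpoint, here is a minimal client sketch. It assumes the app is served locally (for example with uvicorn main:app --port 8000); the host, port, and the sample file name resume.pdf are illustrative assumptions, not part of this commit.

# Minimal client sketch for POST /process_cv/ (assumed local deployment).
# "resume.pdf", the host, and the port are hypothetical placeholders.
import requests

with open("resume.pdf", "rb") as f:
    response = requests.post(
        "http://localhost:8000/process_cv/",
        files={"file": ("resume.pdf", f, "application/pdf")},
    )

response.raise_for_status()
print(response.json())  # Validated against the ExtractedInfo response model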
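One caveat the commit leaves in place: tempfile.mktemp only generates a path, which the standard library documents as insecure because another process can create the file between name generation and use. A possible alternative, not part of this commit, is tempfile.NamedTemporaryFile with delete=False; the helper name save_upload_to_temp and the pdf_bytes argument below are illustrative.

# Sketch: race-free temp-file handling as an alternative to tempfile.mktemp.
# Not part of this commit; save_upload_to_temp and pdf_bytes are illustrative.
import os
import tempfile

def save_upload_to_temp(pdf_bytes: bytes) -> str:
    """Writes uploaded bytes to a uniquely named temp file and returns its path."""
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(pdf_bytes)
        return tmp.name

NamedTemporaryFile creates and opens the file in one step, so the path cannot be claimed by another process first; the caller still removes the file in a finally block, as the endpoint already does.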