# Resume Chatbot — Hugging Face Spaces app file.
# (Web-scrape header/gutter residue removed; it was not part of the program.)
import os
import openai
import gradio as gr
import pdfplumber
import boto3
from llama_index.core import Document, VectorStoreIndex, Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core.node_parser import SentenceWindowNodeParser
from dotenv import load_dotenv
import re
# Load environment variables (expects OPENAI_API_KEY in config.env).
load_dotenv("config.env")
# Configure the OpenAI client with the key from the environment.
# NOTE(review): no guard if the key is missing — openai.api_key is set to None
# and every later LLM call will fail; confirm deployment always provides it.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
openai.api_key = OPENAI_API_KEY
# AWS S3 setup: bucket that holds the resume PDFs. Credentials come from the
# default boto3 chain (env vars / shared config / instance role).
s3_bucket_name = "sagemaker-studio-gm4vm5dimae"
s3_client = boto3.client('s3')
# Local directory the PDFs are downloaded into before indexing.
resume_path = 'resumes'
os.makedirs(resume_path, exist_ok=True)
# Function to download PDFs from S3
def download_pdfs_from_s3(bucket_name, local_path):
    """Download every object in *bucket_name* into *local_path*.

    Fixes over the original version:
    - paginates the listing, so buckets with more than 1000 objects are
      fully downloaded (list_objects_v2 returns at most 1000 keys per call);
    - creates intermediate directories for keys containing '/' (the original
      raised FileNotFoundError on any nested key);
    - skips zero-byte "folder" placeholder keys that end in '/'.
    """
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name):
        for obj in page.get('Contents', []):
            key = obj['Key']
            if key.endswith('/'):
                continue  # folder placeholder object — nothing to download
            local_file_path = os.path.join(local_path, key)
            # Ensure the parent directory exists for nested keys.
            os.makedirs(os.path.dirname(local_file_path) or local_path, exist_ok=True)
            s3_client.download_file(bucket_name, key, local_file_path)
            print(f"Downloaded {key} to {local_file_path}")
# Download PDFs at import time (module-level side effect: hits S3 and
# writes into the local `resumes` directory before the index is built).
download_pdfs_from_s3(s3_bucket_name, resume_path)
# Function to load PDFs using pdfplumber
def load_pdfs_with_pdfplumber(directory):
    """Extract text from every ``.pdf`` file in *directory*.

    Returns a list of llama_index ``Document`` objects, one per readable
    PDF. A file that fails to parse is reported and skipped rather than
    aborting the whole load. Non-PDF files are ignored.
    """
    documents = []
    for filename in os.listdir(directory):
        if not filename.endswith(".pdf"):
            continue
        try:
            with pdfplumber.open(os.path.join(directory, filename)) as pdf:
                # Join pages with a newline so words at page boundaries do
                # not run together; pages with no extractable text yield "".
                text = "\n".join(page.extract_text() or "" for page in pdf.pages)
            documents.append(Document(text=text))
        except Exception as e:
            # Bug fix: the original printed the literal "(unknown)" instead
            # of the name of the file that failed.
            print(f"Error processing {filename}: {e}")
    return documents
# Load documents from the resume directory using pdfplumber
# (one Document per successfully parsed PDF; failures were already logged).
documents = load_pdfs_with_pdfplumber(resume_path)
print(f"Number of documents: {len(documents)}")
# LLM used both for answering and for LLM-based reranking below.
# NOTE(review): temperature=0.9 is high for factual resume Q&A — confirm
# this much randomness is intended.
llm = OpenAI(model="gpt-4o", temperature=0.9)
# Embedding model used to vectorize nodes for the index.
embed_model = OpenAIEmbedding(model="text-embedding-3-large")
# Sentence-window parser: one node per sentence, with the surrounding
# 3-sentence window stored under the "window" metadata key so the
# MetadataReplacementPostProcessor can swap it in at query time.
sentence_node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text"
)
# Configure global settings so from_documents() picks these up implicitly.
Settings.llm = llm
Settings.embed_model = embed_model
Settings.node_parser = sentence_node_parser
# Build the in-memory vector index over all resume documents.
index = VectorStoreIndex.from_documents(documents)
# PII filter applied to every chatbot reply before it reaches the user.
def filter_pii(text):
    """Redact e-mail addresses and phone numbers from *text*.

    Every match is replaced with the literal token "[REDACTED]"; all
    other text is returned unchanged.
    """
    redaction_patterns = (
        r"\S+@\S+\.\S+",          # e-mail addresses
        r"\+?\d[\d\s()-]{8,}\d",  # phone-number-like digit runs (10+ chars)
    )
    for pattern in redaction_patterns:
        text = re.sub(pattern, "[REDACTED]", text)
    return text
# Custom reranking function
def custom_rerank(nodes, query, top_n=5):
    """Rerank *nodes* by LLM-judged relevance to *query*.

    Builds a prompt listing every chunk, asks the LLM for comma-separated
    1-10 relevance ratings, and returns the *top_n* highest-rated nodes.
    If the ratings cannot be parsed, or their count does not match the
    number of nodes, falls back to the first *top_n* nodes in their
    original order (best-effort, never raises).

    ``top_n`` is new and defaults to 5, matching the original behavior.
    """
    if not nodes:
        return []  # nothing to rank; also avoids a pointless LLM call
    rerank_prompt = (
        "Given the following query and text chunks, rate each chunk's relevance "
        "to the query on a scale of 1-10, where 10 is most relevant.\n\n"
        f"Query: {query}\n\n"
    )
    for i, node in enumerate(nodes):
        rerank_prompt += f"Chunk {i+1}:\n{node.get_content()}\n\n"
    rerank_prompt += "Provide your ratings as a comma-separated list of numbers, e.g., '7,4,9,2,6'"
    response = llm.complete(rerank_prompt)
    try:
        # Robustness fix: extract every integer via regex so surrounding
        # prose (e.g. "Ratings: 7, 4, 9") no longer aborts the rerank.
        ratings = [int(r) for r in re.findall(r"\d+", response.text)]
        if len(ratings) != len(nodes):
            raise ValueError("Number of ratings does not match number of nodes")
        # key= compares only ratings, so nodes never need to be comparable.
        sorted_nodes = [node for _, node in sorted(zip(ratings, nodes), key=lambda x: x[0], reverse=True)]
        return sorted_nodes[:top_n]
    except Exception as e:
        print(f"Error in reranking: {e}, returning original order")
        return nodes[:top_n]
# Query engine: retrieve a wide candidate pool (top 20 by embedding
# similarity) for the custom reranker; the postprocessor replaces each
# sentence node's text with the sentence window stored under "window".
query_engine = index.as_query_engine(
    similarity_top_k=20,
    node_postprocessors=[
        MetadataReplacementPostProcessor("window")
    ],
)
# Chatbot function with PII filter
def chatbot(message, history):
    """Answer *message* using retrieved resume context.

    Folds the Gradio chat *history* (list of [user, assistant] pairs) into
    the query, retrieves candidate nodes, reranks them with the LLM, asks
    the LLM to answer from that context, and redacts PII from the reply.
    """
    turns = [f"Human: {user}\nAI: {bot}" for user, bot in history]
    history_text = "\n".join(turns)
    full_query = f"Given the following chat history:\n{history_text}\n\nHuman: {message}\nAI:"
    # Wide retrieval, then LLM-based rerank down to the best few chunks.
    candidates = query_engine.retrieve(full_query)
    best_nodes = custom_rerank(candidates, full_query)
    context = "\n".join(node.get_content() for node in best_nodes)
    # Ask the LLM to answer strictly from the reranked context.
    answer = llm.complete(
        f"Using the following context, answer the query:\n\nContext: {context}\n\nQuery: {full_query}"
    )
    # Strip emails/phone numbers before the reply reaches the user.
    return filter_pii(answer.text)
# Create Gradio chat interface with usage instructions.
# Bug fix: the original final line carried a stray trailing "|" (scrape
# artifact) that made the file a syntax error.
iface = gr.ChatInterface(
    chatbot,
    title="Resume Chatbot - Secure Candidate Query",
    description=(
        "This is a Resume Chatbot that answers questions about candidate experience and qualifications. "
        "It will not reveal any private information beyond names of candidates. Please ask questions about skills, "
        "experience, or qualifications without requesting sensitive personal information."
    ),
    theme="soft",
    examples=[
        "Out of all the resumes, tell me three of them who have experience in SQL?",
        "Give me key summary takeaways of the resumes who have experience in Project Management?",
        "Give me the names of 10 candidates who have more than two years of experience in general?",
    ],
    # NOTE(review): retry_btn/undo_btn/clear_btn were removed in Gradio 5.x —
    # confirm the pinned gradio version still accepts these keyword args.
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
    chatbot=gr.Chatbot(height=400),
    textbox=gr.Textbox(scale=5),
)
# Launch the app; share=True exposes a temporary public URL.
iface.launch(share=True)