Spaces:
Runtime error
Runtime error
Initial commit
Browse files- .ipynb_checkpoints/app-checkpoint.py +129 -0
- .ipynb_checkpoints/config-checkpoint.env +1 -0
- .ipynb_checkpoints/requirements-checkpoint.txt +1 -0
- app.py +127 -58
- config.env +1 -0
- requirements.txt +4 -1
.ipynb_checkpoints/app-checkpoint.py
ADDED
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
import openai
import gradio as gr
import pdfplumber
from llama_index.core import Document, VectorStoreIndex, Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core.node_parser import SentenceWindowNodeParser

# Set your OpenAI API key here (read from the environment; never hard-code it)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
openai.api_key = OPENAI_API_KEY

# Resumes are expected in a 'resumes' directory next to this script.
resume_path = 'resumes'
if not os.path.exists(resume_path):
    raise ValueError("Directory 'resumes' not found")


def load_pdfs_with_pdfplumber(directory):
    """Extract text from every *.pdf in *directory*; return a list of Documents.

    Files that fail to parse are skipped with a diagnostic that names the
    file. (BUG FIX: the original printed a literal "(unknown)" placeholder,
    so failures could not be traced to a file.)
    """
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            try:
                with pdfplumber.open(os.path.join(directory, filename)) as pdf:
                    text = ""
                    for page in pdf.pages:
                        # extract_text() may return None for image-only pages.
                        text += page.extract_text() or ""
                documents.append(Document(text=text))
            except Exception as e:
                print(f"Error processing {filename}: {e}")
    return documents


# Load documents from the resume directory using pdfplumber
documents = load_pdfs_with_pdfplumber(resume_path)
print(f"Number of documents: {len(documents)}")

# LLM used for both reranking and final answer synthesis.
llm = OpenAI(model="gpt-4o", temperature=0.9)

# Embedding model used to build the vector index.
embed_model = OpenAIEmbedding(model="text-embedding-3-large")

# Sentence-window parsing: each sentence node stores a 3-sentence context
# window in metadata ("window"); MetadataReplacementPostProcessor restores
# it at query time.
sentence_node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

# Configure global settings
Settings.llm = llm
Settings.embed_model = embed_model
Settings.node_parser = sentence_node_parser

# Create the in-memory vector index over the loaded resumes.
index = VectorStoreIndex.from_documents(documents)


def custom_rerank(nodes, query):
    """Ask the LLM to score each node's relevance to *query*; return top 5.

    Falls back to the original order (truncated to 5) if the ratings cannot
    be parsed or their count does not match the number of nodes.
    """
    rerank_prompt = (
        "Given the following query and text chunks, rate each chunk's relevance "
        "to the query on a scale of 1-10, where 10 is most relevant.\n\n"
        f"Query: {query}\n\n"
    )
    for i, node in enumerate(nodes):
        rerank_prompt += f"Chunk {i+1}:\n{node.get_content()}\n\n"

    rerank_prompt += "Provide your ratings as a comma-separated list of numbers, e.g., '7,4,9,2,6'"

    response = llm.complete(rerank_prompt)
    try:
        ratings = [int(r.strip()) for r in response.text.split(',')]
        if len(ratings) != len(nodes):
            raise ValueError("Number of ratings does not match number of nodes")
        # sorted() with a key only compares ratings, never the node objects.
        sorted_nodes = [node for _, node in sorted(zip(ratings, nodes), key=lambda x: x[0], reverse=True)]
        return sorted_nodes[:5]  # Return top 5 reranked nodes
    except Exception as e:
        print(f"Error in reranking: {e}, returning original order")
        return nodes[:5]


# Query engine: retrieve a generous top-20, then swap each node's text for
# its stored sentence window before synthesis.
query_engine = index.as_query_engine(
    similarity_top_k=20,
    node_postprocessors=[
        MetadataReplacementPostProcessor("window")
    ],
)


def chatbot(message, history):
    """Gradio chat callback: answer *message* given (human, ai) *history* pairs."""
    history_text = "\n".join([f"Human: {h[0]}\nAI: {h[1]}" for h in history])
    full_query = f"Given the following chat history:\n{history_text}\n\nHuman: {message}\nAI:"

    # Retrieve nodes
    retrieved_nodes = query_engine.retrieve(full_query)

    # Apply custom reranking
    reranked_nodes = custom_rerank(retrieved_nodes, full_query)

    # Synthesize answer from reranked nodes
    context = "\n".join([node.get_content() for node in reranked_nodes])
    response = llm.complete(
        f"Using the following context, answer the query:\n\nContext: {context}\n\nQuery: {full_query}"
    )

    return response.text


# Create Gradio interface
iface = gr.ChatInterface(
    chatbot,
    title="Resume Chatbot",
    description="Ask questions about resumes in the database.",
    theme="soft",
    examples=[
        "Out of all the resumes tell me three of them who have experience in SQL?",
        "Give me key summary takeaways of the resumes who have experience in Project Management?",
        "Give me the names of 10 candidates who have more than two years of experience in general?",
    ],
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
    chatbot=gr.Chatbot(height=400),
    textbox=gr.Textbox(scale=5)
)

# Launch the interface
iface.launch(share=True)
|
.ipynb_checkpoints/config-checkpoint.env
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
OPENAI_API_KEY=REDACTED  # SECURITY: a live OpenAI key was committed here — revoke it immediately and supply the key via the platform's secrets store, never in the repo.
|
.ipynb_checkpoints/requirements-checkpoint.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
huggingface_hub==0.22.2
|
app.py
CHANGED
@@ -1,63 +1,132 @@
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
""
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
],
|
59 |
)
|
60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
|
62 |
-
|
63 |
-
|
|
|
1 |
+
import os

import openai
import gradio as gr
import pdfplumber
from dotenv import load_dotenv
from llama_index.core import Document, VectorStoreIndex, Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core.node_parser import SentenceWindowNodeParser

# Load environment variables (OPENAI_API_KEY) from config.env.
# NOTE(review): config.env is committed to the repo with a live key —
# revoke that key and move it to the platform's secrets store.
load_dotenv("config.env")

# Set your OpenAI API key here
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
openai.api_key = OPENAI_API_KEY

# Resumes are expected in a 'resumes' directory next to this script.
resume_path = 'resumes'
if not os.path.exists(resume_path):
    # Plain string literal: the original used an f-string with no placeholders.
    raise ValueError("Directory 'resumes' not found")
|
22 |
+
|
23 |
+
# Function to load PDFs using pdfplumber
def load_pdfs_with_pdfplumber(directory):
    """Extract text from every *.pdf in *directory*; return a list of Documents.

    Parameters:
        directory: path to a folder containing PDF files.

    Returns:
        list[Document] — one Document per successfully parsed PDF.

    Files that fail to parse are skipped with a diagnostic that names the
    file. (BUG FIX: the original printed a literal "(unknown)" placeholder,
    so failures could not be traced to a specific file.)
    """
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            try:
                with pdfplumber.open(os.path.join(directory, filename)) as pdf:
                    text = ""
                    for page in pdf.pages:
                        # extract_text() may return None for image-only pages.
                        text += page.extract_text() or ""
                documents.append(Document(text=text))
            except Exception as e:
                print(f"Error processing {filename}: {e}")
    return documents
|
37 |
+
|
38 |
+
# Ingest every resume PDF and report how many documents were produced.
documents = load_pdfs_with_pdfplumber(resume_path)
print(f"Number of documents: {len(documents)}")

# LLM used for both reranking and final answer synthesis.
llm = OpenAI(model="gpt-4o", temperature=0.9)

# Embedding model used to vectorize the resume text.
embed_model = OpenAIEmbedding(model="text-embedding-3-large")

# Sentence-window parser: each node keeps a 3-sentence context window in
# metadata under "window"; the bare sentence lives under "original_text".
sentence_node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

# Register the components globally, then build the in-memory vector index.
Settings.llm = llm
Settings.embed_model = embed_model
Settings.node_parser = sentence_node_parser

index = VectorStoreIndex.from_documents(documents)
|
62 |
+
|
63 |
+
# Custom reranking function
def custom_rerank(nodes, query, llm_client=None):
    """Rerank retrieved nodes by LLM-scored relevance; return the top 5.

    Parameters:
        nodes: retrieved nodes; each must expose ``get_content()``.
        query: the user query the chunks are rated against.
        llm_client: optional object with ``complete(prompt)`` returning a
            response with a ``.text`` attribute. Defaults to the module-level
            ``llm`` (backward-compatible addition; also enables testing).

    Falls back to the original order (truncated to 5) if the ratings cannot
    be parsed or their count does not match the number of nodes.
    """
    import re  # local import: only needed for rating extraction

    if not nodes:
        # Nothing to rank — avoid a pointless LLM round-trip on empty input.
        return []

    client = llm_client if llm_client is not None else llm

    rerank_prompt = (
        "Given the following query and text chunks, rate each chunk's relevance "
        "to the query on a scale of 1-10, where 10 is most relevant.\n\n"
        f"Query: {query}\n\n"
    )
    for i, node in enumerate(nodes):
        rerank_prompt += f"Chunk {i+1}:\n{node.get_content()}\n\n"

    rerank_prompt += "Provide your ratings as a comma-separated list of numbers, e.g., '7,4,9,2,6'"

    response = client.complete(rerank_prompt)
    try:
        # BUG FIX: the original int(r.strip()) raised on any non-numeric token
        # (LLMs often wrap the list in prose). Extract the integers instead;
        # a count mismatch still triggers the fallback below.
        ratings = [int(r) for r in re.findall(r"\d+", response.text)]
        if len(ratings) != len(nodes):
            raise ValueError("Number of ratings does not match number of nodes")
        # sorted() with a key compares only ratings, never the node objects.
        ranked = sorted(zip(ratings, nodes), key=lambda pair: pair[0], reverse=True)
        return [node for _, node in ranked][:5]  # Return top 5 reranked nodes
    except Exception as e:
        print(f"Error in reranking: {e}, returning original order")
        return nodes[:5]
|
85 |
+
|
86 |
+
# Query engine over the sentence-window index: retrieve a generous top-20,
# then replace each node's text with its stored context window.
query_engine = index.as_query_engine(
    similarity_top_k=20,
    node_postprocessors=[MetadataReplacementPostProcessor("window")],
)
|
93 |
|
94 |
+
# Chat callback wired into gr.ChatInterface.
def chatbot(message, history):
    """Answer *message* using retrieved, LLM-reranked resume chunks.

    *history* is the list of (human, ai) message pairs Gradio passes in;
    it is folded into the query so follow-up questions keep their context.
    """
    turns = [f"Human: {user}\nAI: {ai}" for user, ai in history]
    history_text = "\n".join(turns)
    full_query = f"Given the following chat history:\n{history_text}\n\nHuman: {message}\nAI:"

    # Retrieve candidates, then keep only the best-ranked few.
    reranked_nodes = custom_rerank(query_engine.retrieve(full_query), full_query)

    # Concatenate the surviving chunks and synthesize the final answer.
    context = "\n".join(node.get_content() for node in reranked_nodes)
    prompt = f"Using the following context, answer the query:\n\nContext: {context}\n\nQuery: {full_query}"
    return llm.complete(prompt).text
|
112 |
+
|
113 |
+
# Assemble the chat UI; every user-visible string matches the original.
iface = gr.ChatInterface(
    chatbot,
    chatbot=gr.Chatbot(height=400),
    textbox=gr.Textbox(scale=5),
    title="Resume Chatbot",
    description="Ask questions about resumes in the database.",
    theme="soft",
    examples=[
        "Out of all the resumes tell me three of them who have experience in SQL?",
        "Give me key summary takeaways of the resumes who have experience in Project Management?",
        "Give me the names of 10 candidates who have more than two years of experience in general?",
    ],
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
)

# Launch the interface
iface.launch(share=True)
|
config.env
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
OPENAI_API_KEY=REDACTED  # SECURITY: a live OpenAI key was committed here — revoke it immediately and supply the key via the platform's secrets store, never in the repo.
|
requirements.txt
CHANGED
@@ -1 +1,4 @@
|
|
1 |
-
|
|
|
|
|
|
|
|
1 |
+
gradio
pdfplumber
openai
llama_index
python-dotenv
|