Namanj46 commited on
Commit
ef33567
1 Parent(s): 61005f4

Initial commit

Browse files
.ipynb_checkpoints/app-checkpoint.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import openai
import gradio as gr
import pdfplumber
from llama_index.core import Document, VectorStoreIndex, Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core.node_parser import SentenceWindowNodeParser

# Read the OpenAI API key from the environment (never hard-code secrets).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
openai.api_key = OPENAI_API_KEY

# Directory that holds the resume PDFs, relative to the working directory.
resume_path = 'resumes'
if not os.path.exists(resume_path):
    # Fail fast, naming the offending path — the original used an f-string
    # with no placeholder, so the path never appeared in the error message.
    raise ValueError(f"Directory '{resume_path}' not found")
19
+
20
# Function to load PDFs using pdfplumber
def load_pdfs_with_pdfplumber(directory):
    """Extract text from every ``*.pdf`` in *directory* as llama-index Documents.

    Pages whose extraction yields nothing (``extract_text()`` returns ``None``)
    contribute an empty string. A file that raises is skipped with a diagnostic
    so one corrupt PDF cannot abort the whole load.

    Returns a list of ``Document`` objects, one per readable PDF.
    """
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            try:
                with pdfplumber.open(os.path.join(directory, filename)) as pdf:
                    text = ""
                    for page in pdf.pages:
                        text += page.extract_text() or ""
                    documents.append(Document(text=text))
            except Exception as e:
                # Name the failing file — the original printed a placeholder
                # instead of the filename, making failures undiagnosable.
                print(f"Error processing {filename}: {e}")
    return documents
34
+
35
# Build the corpus from the resume directory.
documents = load_pdfs_with_pdfplumber(resume_path)
print(f"Number of documents: {len(documents)}")

# LLM used both for answer synthesis and for reranking.
llm = OpenAI(model="gpt-4o", temperature=0.9)

# Embedding model backing the vector index.
embed_model = OpenAIEmbedding(model="text-embedding-3-large")

# Sentence-window parsing: each node carries a 3-sentence context window
# in its metadata, restored later by MetadataReplacementPostProcessor.
sentence_node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

# Register the components globally so the index construction picks them up.
Settings.llm = llm
Settings.embed_model = embed_model
Settings.node_parser = sentence_node_parser

# Embed and index every resume document.
index = VectorStoreIndex.from_documents(documents)
59
+
60
# Custom reranking function
def custom_rerank(nodes, query):
    """Rerank retrieved *nodes* by asking the LLM to score each chunk 1-10.

    Returns the top 5 nodes by rating. Falls back to the first 5 nodes in
    their original order whenever the LLM response cannot be parsed into
    exactly one rating per node.
    """
    import re  # local import: only needed here, for rating extraction

    if not nodes:
        # Nothing to rank — avoid a pointless (and crashing) LLM round-trip.
        return []

    rerank_prompt = (
        "Given the following query and text chunks, rate each chunk's relevance "
        "to the query on a scale of 1-10, where 10 is most relevant.\n\n"
        f"Query: {query}\n\n"
    )
    for i, node in enumerate(nodes):
        rerank_prompt += f"Chunk {i+1}:\n{node.get_content()}\n\n"

    rerank_prompt += "Provide your ratings as a comma-separated list of numbers, e.g., '7,4,9,2,6'"

    response = llm.complete(rerank_prompt)
    try:
        # Extract integers even when the model wraps them in extra prose;
        # int(r.strip()) on raw comma-splits breaks on any surrounding text.
        ratings = [int(r) for r in re.findall(r"\d+", response.text)]
        if len(ratings) != len(nodes):
            raise ValueError("Number of ratings does not match number of nodes")
        sorted_nodes = [
            node
            for _, node in sorted(zip(ratings, nodes), key=lambda x: x[0], reverse=True)
        ]
        return sorted_nodes[:5]  # Return top 5 reranked nodes
    except Exception as e:
        print(f"Error in reranking: {e}, returning original order")
        return nodes[:5]
82
+
83
# Query engine: retrieve 20 candidate chunks, then replace each node's text
# with its stored sentence window before synthesis.
query_engine = index.as_query_engine(
    similarity_top_k=20,
    node_postprocessors=[MetadataReplacementPostProcessor("window")],
)
90
+
91
# Chatbot function
def chatbot(message, history):
    """Answer *message* over the resume index, reranking retrieved chunks.

    *history* is the Gradio ChatInterface list of (human, ai) pairs; it is
    folded into the query text so follow-up questions keep their context.
    Returns the LLM's answer as plain text.
    """
    turns = [f"Human: {user}\nAI: {ai}" for user, ai in history]
    history_text = "\n".join(turns)
    full_query = f"Given the following chat history:\n{history_text}\n\nHuman: {message}\nAI:"

    # Retrieve candidate nodes, then let the LLM rerank them.
    retrieved_nodes = query_engine.retrieve(full_query)
    reranked_nodes = custom_rerank(retrieved_nodes, full_query)

    # Assemble the context block and synthesize the final answer.
    context = "\n".join(node.get_content() for node in reranked_nodes)
    answer = llm.complete(
        f"Using the following context, answer the query:\n\nContext: {context}\n\nQuery: {full_query}"
    )
    return answer.text
109
+
110
# Gradio chat UI wired to the chatbot() handler above.
iface = gr.ChatInterface(
    chatbot,
    title="Resume Chatbot",
    description="Ask questions about resumes in the database.",
    theme="soft",
    examples=[
        "Out of all the resumes tell me three of them who have experience in SQL?",
        "Give me key summary takeaways of the resumes who have experience in Project Management?",
        "Give me the names of 10 candidates who have more than two years of experience in general?",
    ],
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
    chatbot=gr.Chatbot(height=400),
    textbox=gr.Textbox(scale=5),
)

# Launch the interface (public share link enabled).
iface.launch(share=True)
.ipynb_checkpoints/config-checkpoint.env ADDED
@@ -0,0 +1 @@
 
 
1
+ OPENAI_API_KEY=REPLACE_WITH_YOUR_OPENAI_API_KEY
.ipynb_checkpoints/requirements-checkpoint.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ huggingface_hub==0.22.2
app.py CHANGED
@@ -1,63 +1,132 @@
 
 
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
3
-
4
- """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
8
-
9
-
10
- def respond(
11
- message,
12
- history: list[tuple[str, str]],
13
- system_message,
14
- max_tokens,
15
- temperature,
16
- top_p,
17
- ):
18
- messages = [{"role": "system", "content": system_message}]
19
-
20
- for val in history:
21
- if val[0]:
22
- messages.append({"role": "user", "content": val[0]})
23
- if val[1]:
24
- messages.append({"role": "assistant", "content": val[1]})
25
-
26
- messages.append({"role": "user", "content": message})
27
-
28
- response = ""
29
-
30
- for message in client.chat_completion(
31
- messages,
32
- max_tokens=max_tokens,
33
- stream=True,
34
- temperature=temperature,
35
- top_p=top_p,
36
- ):
37
- token = message.choices[0].delta.content
38
-
39
- response += token
40
- yield response
41
-
42
- """
43
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
44
- """
45
- demo = gr.ChatInterface(
46
- respond,
47
- additional_inputs=[
48
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
49
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
50
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
51
- gr.Slider(
52
- minimum=0.1,
53
- maximum=1.0,
54
- value=0.95,
55
- step=0.05,
56
- label="Top-p (nucleus sampling)",
57
- ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  ],
59
  )
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
- if __name__ == "__main__":
63
- demo.launch()
 
1
import os

import openai
import gradio as gr
import pdfplumber
from llama_index.core import Document, VectorStoreIndex, Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core.node_parser import SentenceWindowNodeParser

from dotenv import load_dotenv
# Pull OPENAI_API_KEY (and any other secrets) from config.env into os.environ.
load_dotenv("config.env")

# Read the OpenAI API key from the environment (never hard-code secrets).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
openai.api_key = OPENAI_API_KEY

# Directory that holds the resume PDFs, relative to the working directory.
resume_path = 'resumes'
if not os.path.exists(resume_path):
    # Fail fast, naming the offending path — the original used an f-string
    # with no placeholder, so the path never appeared in the error message.
    raise ValueError(f"Directory '{resume_path}' not found")
22
+
23
# Function to load PDFs using pdfplumber
def load_pdfs_with_pdfplumber(directory):
    """Extract text from every ``*.pdf`` in *directory* as llama-index Documents.

    Pages whose extraction yields nothing (``extract_text()`` returns ``None``)
    contribute an empty string. A file that raises is skipped with a diagnostic
    so one corrupt PDF cannot abort the whole load.

    Returns a list of ``Document`` objects, one per readable PDF.
    """
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            try:
                with pdfplumber.open(os.path.join(directory, filename)) as pdf:
                    text = ""
                    for page in pdf.pages:
                        text += page.extract_text() or ""
                    documents.append(Document(text=text))
            except Exception as e:
                # Name the failing file — the original printed a placeholder
                # instead of the filename, making failures undiagnosable.
                print(f"Error processing {filename}: {e}")
    return documents
37
+
38
# Build the corpus from the resume directory.
documents = load_pdfs_with_pdfplumber(resume_path)
print(f"Number of documents: {len(documents)}")

# LLM used both for answer synthesis and for reranking.
llm = OpenAI(model="gpt-4o", temperature=0.9)

# Embedding model backing the vector index.
embed_model = OpenAIEmbedding(model="text-embedding-3-large")

# Sentence-window parsing: each node carries a 3-sentence context window
# in its metadata, restored later by MetadataReplacementPostProcessor.
sentence_node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

# Register the components globally so the index construction picks them up.
Settings.llm = llm
Settings.embed_model = embed_model
Settings.node_parser = sentence_node_parser

# Embed and index every resume document.
index = VectorStoreIndex.from_documents(documents)
62
+
63
# Custom reranking function
def custom_rerank(nodes, query):
    """Rerank retrieved *nodes* by asking the LLM to score each chunk 1-10.

    Returns the top 5 nodes by rating. Falls back to the first 5 nodes in
    their original order whenever the LLM response cannot be parsed into
    exactly one rating per node.
    """
    import re  # local import: only needed here, for rating extraction

    if not nodes:
        # Nothing to rank — avoid a pointless (and crashing) LLM round-trip.
        return []

    rerank_prompt = (
        "Given the following query and text chunks, rate each chunk's relevance "
        "to the query on a scale of 1-10, where 10 is most relevant.\n\n"
        f"Query: {query}\n\n"
    )
    for i, node in enumerate(nodes):
        rerank_prompt += f"Chunk {i+1}:\n{node.get_content()}\n\n"

    rerank_prompt += "Provide your ratings as a comma-separated list of numbers, e.g., '7,4,9,2,6'"

    response = llm.complete(rerank_prompt)
    try:
        # Extract integers even when the model wraps them in extra prose;
        # int(r.strip()) on raw comma-splits breaks on any surrounding text.
        ratings = [int(r) for r in re.findall(r"\d+", response.text)]
        if len(ratings) != len(nodes):
            raise ValueError("Number of ratings does not match number of nodes")
        sorted_nodes = [
            node
            for _, node in sorted(zip(ratings, nodes), key=lambda x: x[0], reverse=True)
        ]
        return sorted_nodes[:5]  # Return top 5 reranked nodes
    except Exception as e:
        print(f"Error in reranking: {e}, returning original order")
        return nodes[:5]
85
+
86
# Query engine: retrieve 20 candidate chunks, then replace each node's text
# with its stored sentence window before synthesis.
query_engine = index.as_query_engine(
    similarity_top_k=20,
    node_postprocessors=[MetadataReplacementPostProcessor("window")],
)
93
 
94
# Chatbot function
def chatbot(message, history):
    """Answer *message* over the resume index, reranking retrieved chunks.

    *history* is the Gradio ChatInterface list of (human, ai) pairs; it is
    folded into the query text so follow-up questions keep their context.
    Returns the LLM's answer as plain text.
    """
    turns = [f"Human: {user}\nAI: {ai}" for user, ai in history]
    history_text = "\n".join(turns)
    full_query = f"Given the following chat history:\n{history_text}\n\nHuman: {message}\nAI:"

    # Retrieve candidate nodes, then let the LLM rerank them.
    retrieved_nodes = query_engine.retrieve(full_query)
    reranked_nodes = custom_rerank(retrieved_nodes, full_query)

    # Assemble the context block and synthesize the final answer.
    context = "\n".join(node.get_content() for node in reranked_nodes)
    answer = llm.complete(
        f"Using the following context, answer the query:\n\nContext: {context}\n\nQuery: {full_query}"
    )
    return answer.text
112
+
113
# Gradio chat UI wired to the chatbot() handler above.
iface = gr.ChatInterface(
    chatbot,
    title="Resume Chatbot",
    description="Ask questions about resumes in the database.",
    theme="soft",
    examples=[
        "Out of all the resumes tell me three of them who have experience in SQL?",
        "Give me key summary takeaways of the resumes who have experience in Project Management?",
        "Give me the names of 10 candidates who have more than two years of experience in general?",
    ],
    retry_btn=None,
    undo_btn="Delete Previous",
    clear_btn="Clear",
    chatbot=gr.Chatbot(height=400),
    textbox=gr.Textbox(scale=5),
)

# Launch the interface (public share link enabled).
iface.launch(share=True)
config.env ADDED
@@ -0,0 +1 @@
 
 
1
+ OPENAI_API_KEY=REPLACE_WITH_YOUR_OPENAI_API_KEY
requirements.txt CHANGED
@@ -1 +1,4 @@
1
- huggingface_hub==0.22.2
 
 
 
 
1
+ gradio
2
+ pdfplumber
3
+ openai
4
+ llama_index
5
+ python-dotenv