forestav committed on
Commit
af91fc8
Β·
1 Parent(s): 05e61be

update gradio ui and create database

Browse files
Files changed (3) hide show
  1. feedback.db +0 -0
  2. gradio.py +0 -276
  3. gradioapp.py +261 -0
feedback.db ADDED
Binary file (61.4 kB). View file
 
gradio.py DELETED
@@ -1,276 +0,0 @@
1
- import gradio as gr
2
- import PyPDF2
3
- import docx2txt
4
- import re
5
- from typing import Optional
6
- from datetime import datetime
7
-
8
- # --- Import your custom modules
9
- from pinecone_handler import PineconeHandler
10
- from time_handling import read_timestamp
11
- from settings import DATE_FORMAT
12
-
13
# ------------------------------------------------------------------
# Module-level session store: search_jobs() fills these slots and the
# like/dislike callbacks read them back by index.
# ------------------------------------------------------------------
MAX_RESULTS = 10  # maximum number of job ads displayed at once
# One slot per displayed ad: (ad_id, ad_metadata, full_resume_text) or None.
JOBS_CACHE = [None] * MAX_RESULTS
18
-
19
-
20
- # ------------------------------------------------------------------
21
- # Helper functions (same as your original ones)
22
- # ------------------------------------------------------------------
23
def extract_text_from_pdf(pdf_file) -> str:
    """Return the concatenated text of every page in *pdf_file*.

    BUG FIX: ``page.extract_text()`` may return None (e.g. for
    image-only pages); the original ``text += page.extract_text() + "\\n"``
    then raised TypeError. Treat None as empty text instead.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    text = ""
    for page in pdf_reader.pages:
        text += (page.extract_text() or "") + "\n"
    return text
29
-
30
def extract_text_from_docx(docx_file) -> str:
    """Extract plain text from a .docx/.doc document via docx2txt."""
    return docx2txt.process(docx_file)
33
-
34
def extract_resume_text(uploaded_file) -> Optional[str]:
    """Dispatch on file extension and return the resume's plain text.

    Returns None when no file was supplied. Failures are reported as
    "ERROR: ..." strings rather than raised, so the caller always gets
    a str (or None) back.
    """
    if uploaded_file is None:
        return None

    suffix = uploaded_file.name.rsplit('.', 1)[-1].lower()
    try:
        if suffix == 'pdf':
            return extract_text_from_pdf(uploaded_file)
        if suffix in ('docx', 'doc'):
            # docx2txt expects a path, not a file object.
            return extract_text_from_docx(uploaded_file.name)
        if suffix == 'txt':
            return uploaded_file.read().decode("utf-8", errors="replace")
        return f"ERROR: Unsupported file format: {suffix}"
    except Exception as e:
        return f"ERROR: {str(e)}"
50
-
51
def clean_resume_text(text: str) -> str:
    """Collapse every whitespace run to a single space and trim the ends."""
    return re.sub(r'\s+', ' ', text).strip() if text else ""
57
-
58
def is_description_truncated(description: str) -> bool:
    """Heuristically detect whether an ad description was cut off.

    Signals: length near the 1000-char metadata cap, a trailing
    ellipsis, or text ending mid-word without closing punctuation.
    """
    if len(description) >= 995:  # close to the 1000-char limit
        return True
    if description.rstrip().endswith(('...', '…')):
        return True
    ends_in_word = re.search(r'\w+$', description)
    ends_in_punct = re.search(r'[.!?]$', description)
    return bool(ends_in_word and not ends_in_punct)
65
-
66
def format_job_description(description: str, truncated: bool = False) -> str:
    """Re-flow a raw job-ad description into readable Markdown.

    Inserts a blank line before known section headers, turns dash/bullet
    markers into Markdown bullets, breaks sentences onto new lines, and
    appends "..." when the text is known to be truncated.
    """
    if not description:
        return ""

    sections = [
        "About us", "About you", "About the role", "About the position",
        "Requirements", "Qualifications", "Skills", "Responsibilities",
        "What you'll do", "What we offer", "Benefits", "Your profile",
        "Required skills", "What you need", "Who you are"
    ]

    formatted_text = description
    for section in sections:
        pattern = re.compile(f'({section}:?)', re.IGNORECASE)
        # BUG FIX: the replacement was r'\n\n\\1', which re.sub renders as a
        # literal backslash-1, deleting the matched header from the output.
        # r'\n\n\1' keeps the captured section header after the blank line.
        formatted_text = pattern.sub(r'\n\n\1', formatted_text)

    formatted_text = re.sub(r'[β€’-]\s*', '\nβ€’ ', formatted_text)
    formatted_text = re.sub(r'(?<=\w)\.(?=\s*[A-Z])', '.\n', formatted_text)
    formatted_text = re.sub(r'\n{3,}', '\n\n', formatted_text)

    if truncated:
        formatted_text = formatted_text.rstrip() + "..."

    return formatted_text.strip()
90
-
91
-
92
- # ------------------------------------------------------------------
93
- # Callback for Like/Dislike
94
- # ------------------------------------------------------------------
95
def user_interaction(index_in_cache, action):
    """Handle a Like/Dislike click for the job in slot *index_in_cache*.

    Resolves (ad_id, metadata, resume text) from JOBS_CACHE, logs the
    interaction, and returns a short status message for the UI.
    *action* is the literal string 'like' or 'dislike'.
    """
    if index_in_cache < 0 or index_in_cache >= MAX_RESULTS:
        return "Invalid job index."

    entry = JOBS_CACHE[index_in_cache]
    if not entry:
        return "No job data at this slot."

    ad_id, metadata, full_resume_text = entry

    # Placeholder persistence: a real implementation would write this
    # interaction to a database or call an API.
    print(f"[USER_INTERACTION] Action={action}, AdID={ad_id}, CV length={len(full_resume_text)} chars.")

    return f"You {action}d job {ad_id}."
119
-
120
-
121
- # ------------------------------------------------------------------
122
- # Callback to search jobs
123
- # ------------------------------------------------------------------
124
def search_jobs(resume_file, num_results, city_filter):
    """Run a resume-driven job search and render the hits as Markdown.

    Steps: extract and normalize the resume text, query Pinecone for
    similar ads, cache each hit in JOBS_CACHE for the like/dislike
    callbacks, and return a single Markdown string for the results panel.
    """
    # Drop any hits left over from a previous search.
    for slot in range(MAX_RESULTS):
        JOBS_CACHE[slot] = None

    if resume_file is None:
        return "Please upload a resume first."

    raw_text = extract_resume_text(resume_file)
    if raw_text is None or raw_text.startswith("ERROR"):
        return f"Error processing file: {raw_text}"

    clean_text = clean_resume_text(raw_text)
    if not clean_text:
        return "No text extracted from resume or file is invalid."

    # Best-effort note about how fresh the ad database is.
    try:
        last_update = read_timestamp()
        last_update_dt = datetime.strptime(last_update, DATE_FORMAT)
        db_info = f"**Database last update:** {last_update_dt.strftime('%B %d, %Y at %I:%M %p')} (Stockholm Time)\n\n"
    except Exception as e:
        db_info = f"Error reading timestamp: {str(e)}\n\n"

    try:
        handler = PineconeHandler()
    except Exception as e:
        return f"{db_info}Error connecting to Pinecone: {str(e)}"

    try:
        results = handler.search_similar_ads(
            clean_text, top_k=num_results, city=city_filter.strip()
        )
    except Exception as e:
        return f"{db_info}Error searching jobs: {str(e)}"

    if not results:
        return f"{db_info}No matching jobs found."

    lines = [db_info + f"**Found {len(results)} matching jobs:**\n"]

    for slot, match in enumerate(results[:MAX_RESULTS]):
        metadata = match.metadata
        ad_id = str(metadata.get('job_id', f"Unknown_{slot}"))
        # Cache the hit so user_interaction() can resolve it later.
        JOBS_CACHE[slot] = (ad_id, metadata, clean_text)

        desc = metadata.get('description', '')
        truncated = is_description_truncated(desc)
        snippet = desc[:2000] if truncated else desc

        lines.append(f"### {slot+1}. {metadata.get('headline', 'Untitled')}")
        lines.append(f"**Ad ID**: `{ad_id}`")
        lines.append(f"**Match Score (Cosine)**: {match.score:.2f}")
        lines.append(f"**Location**: {metadata.get('city', 'Unknown City')}")
        lines.append(f"**Occupation**: {metadata.get('occupation', 'Unknown Occupation')}")
        lines.append(f"**Published**: {metadata.get('published', 'Unknown Date')}")
        lines.append(format_job_description(snippet, truncated=truncated) or "*No description*")

        if truncated:
            lines.append(
                "> **Note**: Description truncated. See original link for full details."
            )
            if 'webpage_url' in metadata:
                lines.append(f"[View Original]({metadata['webpage_url']})")

        lines.append("---")

    return "\n".join(lines)
210
-
211
-
212
- # ------------------------------------------------------------------
213
- # Build Gradio interface
214
- # ------------------------------------------------------------------
215
def build_interface():
    """Assemble the Gradio Blocks UI: inputs, results panel, feedback rows."""
    with gr.Blocks() as demo:
        gr.Markdown("# AI-Powered Job Search (Gradio with Like/Dislike)")

        with gr.Row():
            resume_input = gr.File(label="Upload your resume (PDF, DOCX, DOC, or TXT)")
            num_results_slider = gr.Slider(
                minimum=1, maximum=MAX_RESULTS, value=5,
                step=1, label="Number of results"
            )
            city_input = gr.Textbox(
                label="Filter by city (optional)",
                placeholder="Enter a city to filter job results by location"
            )

        search_button = gr.Button("Search Jobs")
        results_markdown = gr.Markdown()

        # One Like/Dislike row per potential result. Each button sends its
        # slot index plus the literal action string to user_interaction().
        output_messages = []
        for slot in range(MAX_RESULTS):
            with gr.Row(visible=True):
                like_button = gr.Button(f"Like #{slot+1}", variant="secondary", visible=True)
                dislike_button = gr.Button(f"Dislike #{slot+1}", variant="secondary", visible=True)

                feedback_md = gr.Markdown(visible=True)
                output_messages.append(feedback_md)

                like_button.click(
                    fn=user_interaction,
                    inputs=[gr.State(slot), gr.State("like")],
                    outputs=[feedback_md]
                )
                dislike_button.click(
                    fn=user_interaction,
                    inputs=[gr.State(slot), gr.State("dislike")],
                    outputs=[feedback_md]
                )

        # Search click renders the full Markdown result list.
        search_button.click(
            fn=search_jobs,
            inputs=[resume_input, num_results_slider, city_input],
            outputs=[results_markdown]
        )

    return demo
272
-
273
-
274
if __name__ == "__main__":
    # Build the UI and start the local Gradio server.
    app = build_interface()
    app.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
gradioapp.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import PyPDF2
3
+ import docx2txt
4
+ from typing import Optional, List, Dict
5
+ import re
6
+ from pinecone_handler import PineconeHandler
7
+ from datetime import datetime
8
+ import sqlite3
9
+ import threading
10
+
11
class Database:
    """Thin SQLite wrapper that stores user relevance feedback.

    sqlite3 connections cannot be shared across threads by default, and
    Gradio callbacks may run on different threads, so each thread lazily
    gets its own connection via threading.local().
    """

    def __init__(self, db_name="feedback.db"):
        self.db_name = db_name
        self.thread_local = threading.local()
        self._create_tables()

    def get_connection(self):
        """Return this thread's connection, creating it on first use."""
        if not hasattr(self.thread_local, "connection"):
            self.thread_local.connection = sqlite3.connect(self.db_name)
        return self.thread_local.connection

    def _create_tables(self):
        """Create the feedback table if it does not exist yet."""
        # Short-lived startup connection; try/finally guarantees it is
        # closed even when table creation fails (the original leaked it
        # on error).
        conn = sqlite3.connect(self.db_name)
        try:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS feedback (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    job_id TEXT,
                    resume_text TEXT,
                    job_headline TEXT,
                    job_occupation TEXT,
                    job_description TEXT,
                    is_relevant BOOLEAN,
                    timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
                )
            ''')
            conn.commit()
        finally:
            conn.close()

    def save_feedback(self, job_id: str, resume_text: str, headline: str,
                      occupation: str, description: str, is_relevant: bool):
        """Insert one feedback row; rolls back and re-raises on failure."""
        conn = self.get_connection()
        try:
            conn.execute('''
                INSERT INTO feedback
                (job_id, resume_text, job_headline, job_occupation, job_description, is_relevant)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (job_id, resume_text, headline, occupation, description, is_relevant))
            conn.commit()
        except Exception:
            conn.rollback()
            # Bare raise preserves the original traceback ("raise e" restarts it).
            raise
54
+
55
def extract_text(file) -> Optional[str]:
    """Pull plain text out of an uploaded resume (PDF, DOCX/DOC, or TXT).

    Returns None when no file was given. Unsupported formats and
    extraction failures are reported as explanatory strings instead of
    raising, so callers always receive a str (or None).
    """
    if not file:
        return None

    try:
        suffix = file.name.split('.')[-1].lower()

        if suffix == 'pdf':
            reader = PyPDF2.PdfReader(file)
            page_texts = (page.extract_text() for page in reader.pages)
            return "\n".join(page_texts)

        if suffix in ('docx', 'doc'):
            return docx2txt.process(file)

        if suffix == 'txt':
            return str(file.read(), "utf-8")

        return f"Unsupported file format: {suffix}"
    except Exception as e:
        return f"Error processing file: {str(e)}"
77
+
78
class JobMatcher:
    """Coordinates resume parsing, Pinecone search, and feedback storage."""

    def __init__(self):
        self.handler = PineconeHandler()
        self.db = Database()
        self.current_results = []        # last search hits: [{'id', 'score', 'metadata'}]
        self.current_resume_text = None  # raw resume text of the last search

    def search_jobs(self, file, num_results: int, city: str = "") -> List[Dict]:
        """Search for matching jobs; returns hit dicts or [{'error': msg}]."""
        if not file:
            return [{"error": "Please upload a resume file."}]

        try:
            resume_text = extract_text(file)
            if not resume_text:
                return [{"error": "Could not extract text from resume."}]
            # BUG FIX: extract_text() reports failures as plain strings
            # ("Unsupported file format: ...", "Error processing file: ...").
            # Those are truthy, so previously they slipped past the check
            # above and were sent to Pinecone as if they were the resume.
            if resume_text.startswith(("Unsupported file format", "Error processing file")):
                return [{"error": resume_text}]

            self.current_resume_text = resume_text
            resume_text = re.sub(r'\s+', ' ', resume_text).strip()

            # Query Pinecone for ads similar to the normalized resume text.
            results = self.handler.search_similar_ads(resume_text, top_k=num_results, city=city.strip())

            if not results:
                return [{"error": "No matching jobs found. Try adjusting your search criteria."}]

            # Keep Pinecone's own ID with each hit so feedback can reference it.
            self.current_results = [
                {
                    'id': result.id,
                    'score': result.score,
                    'metadata': result.metadata
                }
                for result in results
            ]

            return self.current_results

        except Exception as e:
            return [{"error": f"Error: {str(e)}"}]

    def submit_feedback(self, pinecone_id: str, is_relevant: bool) -> str:
        """Persist relevance feedback for one of the current results."""
        try:
            job = next((job for job in self.current_results if job['id'] == pinecone_id), None)
            if not job:
                return "Error: Job not found"

            metadata = job['metadata']
            self.db.save_feedback(
                job_id=pinecone_id,
                resume_text=self.current_resume_text,
                headline=metadata['headline'],
                occupation=metadata['occupation'],
                description=metadata['description'],
                is_relevant=is_relevant
            )
            return f"βœ“ Feedback saved for '{metadata['headline']}'"
        except Exception as e:
            return f"Error saving feedback: {str(e)}"
141
+
142
def create_interface():
    """Build the Gradio Blocks UI for search plus per-job relevance feedback."""
    matcher = JobMatcher()

    with gr.Blocks() as interface:
        gr.Markdown("# AI-Powered Job Search")

        with gr.Row():
            file_input = gr.File(label="Upload Resume (PDF, DOCX, or TXT)")
            num_results = gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Number of Results")
            city_input = gr.Textbox(label="Filter by City (Optional)")

        search_btn = gr.Button("Search Jobs")
        status = gr.Textbox(label="Status", interactive=False)

        # Pre-create up to 20 hidden result slots; a search reveals as many
        # as it has hits. BUG FIX: keep direct references to the buttons
        # here instead of reaching back in via container.children[1].children[...],
        # which depends on Gradio's internal layout tree and breaks across
        # versions.
        job_containers = []
        for i in range(20):
            with gr.Column(visible=False) as container:
                job_content = gr.Markdown("", elem_id=f"job_content_{i}")
                with gr.Row():
                    relevant_btn = gr.Button("πŸ‘ Relevant", elem_id=f"relevant_{i}")
                    not_relevant_btn = gr.Button("πŸ‘Ž Not Relevant", elem_id=f"not_relevant_{i}")
                feedback_status = gr.Markdown("")
            job_containers.append({
                'container': container,
                'content': job_content,
                'feedback_status': feedback_status,
                'relevant_btn': relevant_btn,
                'not_relevant_btn': not_relevant_btn,
                'pinecone_id': None  # filled in per search
            })

        def update_job_displays(file, num_results, city):
            """Refresh all 20 slots from a new search; returns Gradio updates."""
            results = matcher.search_jobs(file, num_results, city)

            if "error" in results[0]:
                # BUG FIX: the error branch must still produce one value per
                # registered output (3 per slot + status = 61), not 21 —
                # otherwise Gradio raises an output-count mismatch.
                hidden = []
                for _ in range(20):
                    hidden.extend([gr.update(visible=False), "", ""])
                return hidden + [results[0]["error"]]

            updates = []
            for i in range(20):
                if i < len(results):
                    job = results[i]
                    metadata = job['metadata']

                    # Remember the Pinecone ID so feedback can reference it.
                    job_containers[i]['pinecone_id'] = job['id']

                    content = f"""
### {metadata['headline']}
**Match Score:** {job['score']:.2f}
**Location:** {metadata['city']}
**Occupation:** {metadata['occupation']}
**Published:** {metadata['published']}

{metadata['description'][:500]}...

**Contact:** {metadata.get('email', 'Not provided')}
**More Info:** {metadata.get('webpage_url', 'Not available')}

*Job ID: {job['id']}*
"""
                    updates.extend([
                        gr.update(visible=True),  # container visibility
                        content,                  # job content
                        ""                        # reset feedback status
                    ])
                else:
                    updates.extend([
                        gr.update(visible=False),
                        "",
                        ""
                    ])

            updates.append("Jobs found! Rate them as relevant or not relevant.")
            return updates

        def handle_feedback(container_index: int, is_relevant: bool):
            """Route a feedback click to matcher.submit_feedback via the stored ID."""
            pinecone_id = job_containers[container_index]['pinecone_id']
            if pinecone_id:
                return matcher.submit_feedback(pinecone_id, is_relevant)
            return "Error: Job ID not found"

        # Search outputs: (container, content, feedback_status) per slot + status.
        all_outputs = []
        for container in job_containers:
            all_outputs.extend([
                container['container'],
                container['content'],
                container['feedback_status']
            ])
        all_outputs.append(status)

        search_btn.click(
            fn=update_job_displays,
            inputs=[file_input, num_results, city_input],
            outputs=all_outputs
        )

        # Wire feedback buttons via the references stored at creation time.
        # idx=i binds the loop value eagerly (late-binding closure pitfall).
        for i, container in enumerate(job_containers):
            container['relevant_btn'].click(
                fn=lambda idx=i: handle_feedback(idx, True),
                outputs=[container['feedback_status']]
            )
            container['not_relevant_btn'].click(
                fn=lambda idx=i: handle_feedback(idx, False),
                outputs=[container['feedback_status']]
            )

    return interface
258
+
259
if __name__ == "__main__":
    # Entry point: build the UI and start the local Gradio server.
    create_interface().launch()