siddhartharya committed on
Commit
36ec703
·
verified ·
1 Parent(s): 05de921

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +828 -113
app.py CHANGED
@@ -1,158 +1,873 @@
1
- import os
2
- import time
3
- import threading
4
- import requests
5
  from bs4 import BeautifulSoup
6
  from sentence_transformers import SentenceTransformer
7
  import faiss
8
  import numpy as np
9
- import gradio as gr
10
- from concurrent.futures import ThreadPoolExecutor
 
 
11
  import logging
 
 
 
 
 
 
 
 
12
 
13
- # Suppress warnings from urllib3
14
  import urllib3
15
  urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
16
 
17
- # Logging setup
18
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
19
  logger = logging.getLogger(__name__)
 
20
 
21
- # Environment variable keys for API access
22
- GROQ_API_KEY_BASIC = os.getenv('GROQ_API_KEY_BASIC')
23
- GROQ_API_KEY_ADVANCED = os.getenv('GROQ_API_KEY_ADVANCED')
24
 
25
- # LLM Models
26
- MODEL_BASIC = 'llama-3.1-8b-instant'
27
- MODEL_ADVANCED = 'llama-3.1-70b-versatile'
28
 
29
- # Verify API keys
30
- if not GROQ_API_KEY_BASIC or not GROQ_API_KEY_ADVANCED:
31
- logger.error("Both GROQ_API_KEY_BASIC and GROQ_API_KEY_ADVANCED must be set.")
32
- exit()
33
 
34
- # Embedding model and FAISS index initialization
 
35
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
36
  faiss_index = None
37
  bookmarks = []
 
 
 
 
38
 
39
- # Define categories
40
  CATEGORIES = [
41
- "Social Media", "News and Media", "Education and Learning", "Entertainment",
42
- "Shopping and E-commerce", "Finance and Banking", "Technology", "Health and Fitness",
43
- "Travel and Tourism", "Food and Recipes", "Sports", "Arts and Culture",
44
- "Government and Politics", "Business and Economy", "Science and Research",
45
- "Personal Blogs and Journals", "Job Search and Careers", "Music and Audio",
46
- "Videos and Movies", "Reference and Knowledge Bases", "Dead Link", "Uncategorized"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  ]
48
 
49
- # Task routing logic
50
- def select_model_for_task(content_length):
51
- """Choose LLM model based on task complexity."""
52
- if content_length < 500: # Simple tasks
53
- return GROQ_API_KEY_BASIC, MODEL_BASIC
54
- else: # Complex tasks
55
- return GROQ_API_KEY_ADVANCED, MODEL_ADVANCED
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
- # Fetch URL info function
58
  def fetch_url_info(bookmark):
 
 
 
 
 
 
 
 
 
59
  try:
60
- response = requests.get(bookmark['url'], timeout=10, verify=False)
61
- bookmark['html_content'] = response.text
 
 
 
 
 
62
  bookmark['status_code'] = response.status_code
63
- except Exception as e:
64
- logger.error(f"Failed to fetch URL info for {bookmark['url']}: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  bookmark['html_content'] = ''
 
 
 
 
 
66
  bookmark['status_code'] = 'Error'
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
- # Generate summary and assign category
69
- def generate_summary_and_assign_category(bookmark):
70
- content_length = len(bookmark.get('html_content', ''))
71
- api_key, model_name = select_model_for_task(content_length)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
- # Prepare the prompt
74
- prompt = f"""
75
- You are an assistant. Summarize the following webpage content:
76
- {bookmark.get('html_content', '')}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
- Assign one category from this list: {', '.join(CATEGORIES)}.
 
 
 
 
79
 
80
- Respond in the format:
81
- Summary: [Your summary]
82
- Category: [One category]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  """
 
 
 
 
 
 
 
 
84
 
85
  try:
86
- response = requests.post(
87
- f"https://api.openai.com/v1/chat/completions",
88
- headers={"Authorization": f"Bearer {api_key}"},
89
- json={
90
- "model": model_name,
91
- "messages": [{"role": "user", "content": prompt}],
92
- "max_tokens": 150,
93
- "temperature": 0.7,
94
- },
95
- )
96
- result = response.json()
97
- content = result['choices'][0]['message']['content']
98
-
99
- # Extract summary and category
100
- summary_start = content.find("Summary:")
101
- category_start = content.find("Category:")
102
- bookmark['summary'] = content[summary_start + 9:category_start].strip()
103
- bookmark['category'] = content[category_start + 9:].strip()
104
  except Exception as e:
105
- logger.error(f"Error processing LLM response for {bookmark['url']}: {e}")
106
- bookmark['summary'] = 'No summary available.'
107
- bookmark['category'] = 'Uncategorized'
108
 
109
- # Vectorize summaries and build FAISS index
110
- def vectorize_and_index(bookmarks):
111
- global faiss_index
112
- summaries = [b['summary'] for b in bookmarks]
113
- embeddings = embedding_model.encode(summaries)
114
- dimension = embeddings.shape[1]
115
- index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))
116
- ids = np.arange(len(bookmarks))
117
- index.add_with_ids(embeddings, ids)
118
- faiss_index = index
119
-
120
- # Gradio interface setup
121
- def process_bookmarks(file):
122
- global bookmarks
123
- file_content = file.read().decode('utf-8')
124
- soup = BeautifulSoup(file_content, 'html.parser')
125
-
126
- # Parse bookmarks
127
- bookmarks = [
128
- {'url': link.get('href'), 'title': link.text, 'html_content': ''}
129
- for link in soup.find_all('a') if link.get('href')
130
- ]
131
-
132
- # Fetch URLs concurrently
133
- with ThreadPoolExecutor() as executor:
134
  executor.map(fetch_url_info, bookmarks)
135
 
136
- # Process bookmarks with LLM
137
- with ThreadPoolExecutor() as executor:
 
138
  executor.map(generate_summary_and_assign_category, bookmarks)
139
 
140
- # Build FAISS index
141
- vectorize_and_index(bookmarks)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
- return bookmarks
 
 
144
 
145
- # Build Gradio app
146
- with gr.Blocks() as demo:
147
- gr.Markdown("# Smart Bookmark Manager")
148
- file_input = gr.File(label="Upload Bookmark File", type="binary")
149
- submit_button = gr.Button("Process")
150
- output = gr.Textbox(label="Output")
151
 
152
- def handle_submit(file):
153
- processed = process_bookmarks(file)
154
- return "\n".join([f"{b['title']} - {b['category']}" for b in processed])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
- submit_button.click(handle_submit, inputs=file_input, outputs=output)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
- demo.launch()
 
 
1
+ # app.py
2
+
3
+ import gradio as gr
 
4
  from bs4 import BeautifulSoup
5
  from sentence_transformers import SentenceTransformer
6
  import faiss
7
  import numpy as np
8
+ import requests
9
+ import time
10
+ import re
11
+ import base64
12
  import logging
13
+ import os
14
+ import sys
15
+ import concurrent.futures
16
+ from concurrent.futures import ThreadPoolExecutor
17
+ import threading
18
+
19
+ # Import OpenAI library
20
+ import openai
21
 
22
+ # Suppress only the single warning from urllib3 needed.
23
  import urllib3
24
  urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
25
 
26
# Set up logging to output to the console.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Console handler that writes INFO and above to stdout.
console_handler = logging.StreamHandler(sys.stdout)
console_handler.setLevel(logging.INFO)

# Timestamp, level, logger name and message on every line.
formatter = logging.Formatter('%(asctime)s %(levelname)s %(name)s %(message)s')
console_handler.setFormatter(formatter)

# logging.getLogger returns the same logger object every time this module is
# executed (e.g. on reload), so only attach a handler when none is present;
# otherwise every message would be printed once per (re)import.
if not logger.handlers:
    logger.addHandler(console_handler)
 
 
40
 
41
+ # Initialize variables and models
42
+ logger.info("Initializing variables and models")
43
  embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
44
  faiss_index = None
45
  bookmarks = []
46
+ fetch_cache = {}
47
+
48
+ # Lock for thread-safe operations
49
+ lock = threading.Lock()
50
 
51
+ # Define the categories
52
  CATEGORIES = [
53
+ "Social Media",
54
+ "News and Media",
55
+ "Education and Learning",
56
+ "Entertainment",
57
+ "Shopping and E-commerce",
58
+ "Finance and Banking",
59
+ "Technology",
60
+ "Health and Fitness",
61
+ "Travel and Tourism",
62
+ "Food and Recipes",
63
+ "Sports",
64
+ "Arts and Culture",
65
+ "Government and Politics",
66
+ "Business and Economy",
67
+ "Science and Research",
68
+ "Personal Blogs and Journals",
69
+ "Job Search and Careers",
70
+ "Music and Audio",
71
+ "Videos and Movies",
72
+ "Reference and Knowledge Bases",
73
+ "Dead Link",
74
+ "Uncategorized",
75
  ]
76
 
77
+ # Set up Groq Cloud API key and base URL
78
+ GROQ_API_KEY = os.getenv('GROQ_API_KEY')
79
+
80
+ if not GROQ_API_KEY:
81
+ logger.error("GROQ_API_KEY environment variable not set.")
82
+
83
+ openai.api_key = GROQ_API_KEY
84
+ openai.api_base = "https://api.groq.com/openai/v1"
85
+
86
+ # Initialize global variables for rate limiting
87
+ api_lock = threading.Lock()
88
+ last_api_call_time = 0
89
+
90
def extract_main_content(soup, max_words=1500):
    """
    Extract the main content from a webpage while filtering out boilerplate content.

    Args:
        soup: Parsed document (BeautifulSoup-like object) or None. The object
            is modified in place: boilerplate elements are removed from it.
        max_words: Maximum number of whitespace-separated words to keep.

    Returns:
        The extracted text with all whitespace runs collapsed to single
        spaces and no leading/trailing whitespace. Empty string when `soup`
        is falsy.
    """
    if not soup:
        return ""

    # Remove elements that do not carry the page's main text.
    for element in soup(['script', 'style', 'header', 'footer', 'nav', 'aside', 'form', 'noscript']):
        element.decompose()

    # Prefer the text of <p> tags.
    paragraphs = soup.find_all('p')
    content = ' '.join(p.get_text(strip=True, separator=' ') for p in paragraphs)

    # Fall back to the whole document when there are no paragraphs, or when
    # every paragraph is empty (common on script-rendered pages).
    if not content.strip():
        content = soup.get_text(separator=' ', strip=True)

    # split() with no argument collapses any whitespace run and trims the ends,
    # so normalisation and truncation happen in one step.
    words = content.split()
    return ' '.join(words[:max_words])
118
+
119
def get_page_metadata(soup):
    """
    Extract metadata from the webpage including title, description, and keywords.

    Args:
        soup: Parsed document (BeautifulSoup-like object) or None.

    Returns:
        A dict with the keys 'title', 'description' and 'keywords'. Each value
        is a stripped string, empty when the page does not provide it.
    """
    metadata = {
        'title': '',
        'description': '',
        'keywords': ''
    }

    if not soup:
        return metadata

    def meta_content(attrs):
        # Stripped `content` of the first <meta> matching attrs, or ''.
        tag = soup.find('meta', attrs=attrs)
        if tag is None:
            return ''
        return (tag.get('content') or '').strip()

    # Get title
    title_tag = soup.find('title')
    if title_tag is not None and title_tag.string:
        metadata['title'] = title_tag.string.strip()

    # Get meta description: take the first source that actually has text, so
    # a present-but-empty <meta name="description"> does not hide the
    # Open Graph / Twitter descriptions.
    description_sources = (
        {'name': 'description'},
        {'property': 'og:description'},
        {'name': 'twitter:description'},
    )
    for attrs in description_sources:
        description = meta_content(attrs)
        if description:
            metadata['description'] = description
            break

    # Get meta keywords
    metadata['keywords'] = meta_content({'name': 'keywords'})

    # Get OG title if main title is empty
    if not metadata['title']:
        metadata['title'] = meta_content({'property': 'og:title'})

    return metadata
158
def _build_summary_prompt(bookmark):
    """
    Build the LLM prompt for one bookmark from its fetched HTML.

    When the page text is too short (< 50 words) or looks like a block/captcha
    page, the prompt asks the model to rely on what it knows about the URL
    instead of the fetched content.
    """
    html_content = bookmark.get('html_content', '')
    soup = BeautifulSoup(html_content, 'html.parser')
    # Metadata must be read first: extract_main_content strips elements from soup.
    metadata = get_page_metadata(soup)
    main_content = extract_main_content(soup)

    labelled_parts = (
        ('Title', metadata['title']),
        ('Description', metadata['description']),
        ('Keywords', metadata['keywords']),
        ('Main Content', main_content),
    )
    content_text = '\n'.join(f"{label}: {value}" for label, value in labelled_parts if value)

    # Detect insufficient or erroneous content
    error_keywords = ['Access Denied', 'Security Check', 'Cloudflare', 'captcha', 'unusual traffic']
    content_lower = content_text.lower()
    if not content_text or len(content_text.split()) < 50:
        use_prior_knowledge = True
        logger.info(f"Content for {bookmark.get('url')} is insufficient. Instructing LLM to use prior knowledge.")
    elif any(keyword.lower() in content_lower for keyword in error_keywords):
        use_prior_knowledge = True
        logger.info(f"Content for {bookmark.get('url')} contains error messages. Instructing LLM to use prior knowledge.")
    else:
        use_prior_knowledge = False

    categories = ', '.join(f'"{cat}"' for cat in CATEGORIES)
    if use_prior_knowledge:
        lines = [
            "You are a knowledgeable assistant with up-to-date information as of 2023.",
            f"URL: {bookmark.get('url')}",
            "Provide:",
            "1. A concise summary (max two sentences) about this website.",
        ]
    else:
        lines = [
            "You are an assistant that creates concise webpage summaries and assigns categories.",
            "Content:",
            content_text,
            "Provide:",
            "1. A concise summary (max two sentences) focusing on the main topic.",
        ]
    lines += [
        "2. Assign the most appropriate category from the list below.",
        "Categories:",
        categories,
        "Format:",
        "Summary: [Your summary]",
        "Category: [One category]",
    ]
    # The prompt starts and ends with a newline.
    return '\n'.join([''] + lines + [''])


def _parse_summary_response(content):
    """
    Pull (summary, category) out of the model's "Summary: ... / Category: ..." reply.

    Falls back to 'No summary available.' / 'Uncategorized' for anything that
    is missing or not one of CATEGORIES.
    """
    summary_match = re.search(r"Summary:\s*(.*)", content)
    category_match = re.search(r"Category:\s*(.*)", content)

    summary = summary_match.group(1).strip() if summary_match else ''
    if not summary:
        summary = 'No summary available.'

    category = 'Uncategorized'
    if category_match:
        # Models often decorate the answer ("Technology".  **Technology**),
        # so trim that punctuation and compare case-insensitively.
        candidate = category_match.group(1).strip().strip('"\'.* ').lower()
        by_lower_name = {name.lower(): name for name in CATEGORIES}
        category = by_lower_name.get(candidate, 'Uncategorized')

    return summary, category


def _apply_category_overrides(bookmark):
    """Override the model's category for a few well-known sites (keyword heuristic)."""
    from urllib.parse import urlparse

    summary_lower = bookmark['summary'].lower()
    url_lower = bookmark['url'].lower()

    # Compare the host name rather than searching the whole URL: a plain
    # substring test for 'x.com' also matches netflix.com, dropbox.com, etc.
    try:
        host = urlparse(url_lower).hostname or ''
    except ValueError:
        host = ''
    is_x_site = host == 'x.com' or host.endswith('.x.com')

    if 'social media' in summary_lower or 'twitter' in summary_lower or is_x_site:
        bookmark['category'] = 'Social Media'
    elif 'wikipedia' in url_lower:
        bookmark['category'] = 'Reference and Knowledge Bases'


def _retry_after_seconds(error, default=5):
    """Seconds to wait as requested by a rate-limit error's Retry-After header."""
    headers = getattr(error, 'headers', None) or {}
    try:
        return max(0, int(float(headers.get("Retry-After", default))))
    except (TypeError, ValueError):
        # Header missing a numeric value (e.g. an HTTP date): use the default.
        return default


def generate_summary_and_assign_category(bookmark):
    """
    Generate a concise summary and assign a category using a single LLM call.

    Args:
        bookmark: Bookmark dict with at least 'url'; 'html_content' is used
            when present. The dict is updated in place.

    Returns:
        None. On return `bookmark['summary']` and `bookmark['category']` are
        always set: to the model's answer on success, or to
        'No summary available.' / 'Uncategorized' on any error or when all
        rate-limit retries are used up.
    """
    global last_api_call_time

    logger.info(f"Generating summary and assigning category for bookmark: {bookmark.get('url')}")

    max_retries = 3
    max_tokens = 150
    tokens_per_second = 40000 / 60  # 40k tokens-per-minute budget
    prompt = None  # built once, reused across retries

    for attempt in range(1, max_retries + 1):
        try:
            # Rate limiting: keep API calls at least 2 seconds apart. The lock is
            # held while sleeping so concurrent callers queue up behind each other.
            with api_lock:
                elapsed = time.time() - last_api_call_time
                if elapsed < 2:
                    sleep_duration = 2 - elapsed
                    logger.info(f"Sleeping for {sleep_duration:.2f} seconds to respect rate limits.")
                    time.sleep(sleep_duration)
                last_api_call_time = time.time()

            if prompt is None:
                prompt = _build_summary_prompt(bookmark)

            # Rough token estimate (~4 characters per token) to pace the next call.
            total_tokens = len(prompt) / 4 + max_tokens
            sleep_time = max(total_tokens / tokens_per_second, 2)

            response = openai.ChatCompletion.create(
                model='llama-3.1-70b-versatile',
                messages=[
                    {"role": "user", "content": prompt}
                ],
                max_tokens=int(max_tokens),
                temperature=0.5,
            )

            content = response['choices'][0]['message']['content'].strip()
            if not content:
                raise ValueError("Empty response received from the model.")

            bookmark['summary'], bookmark['category'] = _parse_summary_response(content)

            # Simple keyword-based validation
            _apply_category_overrides(bookmark)

            logger.info("Successfully generated summary and assigned category")
            time.sleep(sleep_time)
            return

        except openai.error.RateLimitError as e:
            wait_time = _retry_after_seconds(e)
            logger.warning(f"Rate limit reached. Waiting for {wait_time} seconds before retrying... (Attempt {attempt}/{max_retries})")
            if attempt < max_retries:
                time.sleep(wait_time)
        except Exception as e:
            logger.error(f"Error generating summary and assigning category: {e}", exc_info=True)
            bookmark['summary'] = 'No summary available.'
            bookmark['category'] = 'Uncategorized'
            return

    # Every attempt was rate limited. Later steps index bookmark['summary'] and
    # bookmark['category'] directly, so the keys must exist.
    logger.error(f"Rate limit retries exhausted for {bookmark.get('url')}. Using fallback summary and category.")
    bookmark['summary'] = 'No summary available.'
    bookmark['category'] = 'Uncategorized'
301
+
302
def parse_bookmarks(file_content):
    """
    Parse bookmarks from HTML file.

    Args:
        file_content: Text of an exported bookmarks HTML file.

    Returns:
        A list of dicts with the keys 'url' and 'title', one per <a> element
        that has an http(s) href. A link without visible text gets its URL as
        title instead of being dropped.

    Raises:
        Exception: Any parsing error is logged and re-raised.
    """
    logger.info("Parsing bookmarks")
    try:
        soup = BeautifulSoup(file_content, 'html.parser')
        extracted_bookmarks = []
        for link in soup.find_all('a'):
            url = (link.get('href') or '').strip()
            if not url:
                continue
            # URL schemes are case-insensitive, so compare in lower case.
            if not url.lower().startswith(('http://', 'https://')):
                logger.info(f"Skipping non-http/https URL: {url}")
                continue
            title = link.text.strip() or url
            extracted_bookmarks.append({'url': url, 'title': title})
        logger.info(f"Extracted {len(extracted_bookmarks)} bookmarks")
        return extracted_bookmarks
    except Exception as e:
        logger.error("Error parsing bookmarks: %s", e, exc_info=True)
        raise
323
 
 
324
def fetch_url_info(bookmark, timeout=5):
    """
    Fetch information about a URL.

    Results are cached per URL in `fetch_cache`, so a URL is requested at most
    once per process.

    Args:
        bookmark: Bookmark dict with a 'url' key. Updated in place with
            'etag', 'status_code', 'dead_link', 'description',
            'html_content' and 'slow_link'.
        timeout: Request timeout in seconds.

    Returns:
        None.
    """
    url = bookmark['url']

    # Read the cache under the same lock that guards writes to it.
    with lock:
        cached = fetch_cache.get(url)
    if cached is not None:
        bookmark.update(cached)
        return

    # Start from the "request failed" result; it is only replaced once a
    # response has been read completely.
    info = {
        'etag': 'N/A',
        'status_code': 'Error',
        'dead_link': True,
        'description': '',
        'html_content': '',
        'slow_link': False,
    }

    try:
        logger.info(f"Fetching URL info for: {url}")
        headers = {
            'User-Agent': 'Mozilla/5.0',
            'Accept-Language': 'en-US,en;q=0.9',
        }
        # NOTE: verify=False disables TLS certificate checks, as in the original.
        response = requests.get(url, headers=headers, timeout=timeout, verify=False, allow_redirects=True)
        status = response.status_code

        content = response.text
        logger.info(f"Fetched content length for {url}: {len(content)} characters")

        # Server errors and "not found"/"gone" mean the page no longer exists;
        # other 4xx codes (e.g. 403 for bots) are not treated as dead.
        is_dead = status >= 500 or status in (404, 410)
        if is_dead:
            logger.warning(f"Dead link detected: {url} with status {status}")
        else:
            logger.info(f"Fetched information for {url}")

        info.update(
            etag=response.headers.get('ETag', 'N/A'),
            status_code=status,
            dead_link=is_dead,
            html_content='' if is_dead else content,
        )

    except requests.exceptions.Timeout:
        # A timeout marks the link as slow, not as dead.
        info.update(status_code='Timeout', dead_link=False, slow_link=True)
        logger.warning(f"Timeout while fetching {url}. Marking as 'Slow'.")
    except Exception as e:
        logger.error(f"Error fetching URL info for {url}: {e}", exc_info=True)

    bookmark.update(info)
    with lock:
        fetch_cache[url] = dict(info)
383
 
384
def vectorize_and_index(bookmarks_list):
    """
    Create vector embeddings for bookmarks and build FAISS index with ID mapping.

    Args:
        bookmarks_list: Bookmark dicts, each with an integer 'id'. The
            'summary' text is embedded; a missing or empty summary is replaced
            by 'No summary available.'.

    Returns:
        The new FAISS index. It is also stored in the module-level `faiss_index`.

    Raises:
        ValueError: If `bookmarks_list` is empty.
        Exception: Any other failure is logged and re-raised.
    """
    global faiss_index
    logger.info("Vectorizing summaries and building FAISS index")
    try:
        if not bookmarks_list:
            raise ValueError("Cannot build a FAISS index from an empty bookmark list.")

        summaries = [
            bookmark.get('summary') or 'No summary available.'
            for bookmark in bookmarks_list
        ]
        # FAISS expects a contiguous float32 matrix.
        embeddings = np.ascontiguousarray(embedding_model.encode(summaries), dtype=np.float32)
        dimension = embeddings.shape[1]
        index = faiss.IndexIDMap(faiss.IndexFlatL2(dimension))
        ids = np.array([bookmark['id'] for bookmark in bookmarks_list], dtype=np.int64)
        index.add_with_ids(embeddings, ids)
        faiss_index = index
        logger.info("FAISS index built successfully with IDs")
        return index
    except Exception as e:
        logger.error(f"Error in vectorizing and indexing: {e}", exc_info=True)
        raise
403
 
404
def display_bookmarks():
    """
    Generate HTML display for bookmarks.

    Returns:
        One HTML string with a card per entry in the module-level `bookmarks`
        list (empty string when there are none). Each card shows the 1-based
        position, title, status, category, URL, ETag and summary.
    """
    from html import escape

    logger.info("Generating HTML display for bookmarks")
    text_style = "color: white;"
    cards = []
    for index, bookmark in enumerate(bookmarks, start=1):
        if bookmark.get('dead_link'):
            status = "❌ Dead Link"
            card_style = "border: 2px solid red;"
        elif bookmark.get('slow_link'):
            status = "⏳ Slow Response"
            card_style = "border: 2px solid orange;"
        else:
            status = "✅ Active"
            card_style = "border: 2px solid green;"

        # Escape everything that came from the bookmark file, the remote server
        # or the LLM. The ETag is a response header and therefore untrusted too.
        title = escape(str(bookmark['title']))
        url = escape(str(bookmark['url']))
        etag = escape(str(bookmark.get('etag') or 'N/A'))
        summary = escape(str(bookmark.get('summary') or ''))
        category = escape(str(bookmark.get('category') or 'Uncategorized'))

        cards.append(f'''
<div class="card" style="{card_style} padding: 10px; margin: 10px; border-radius: 5px; background-color: #1e1e1e;">
<div class="card-content">
<h3 style="{text_style}">{index}. {title} {status}</h3>
<p style="{text_style}"><strong>Category:</strong> {category}</p>
<p style="{text_style}"><strong>URL:</strong> <a href="{url}" target="_blank" style="{text_style}">{url}</a></p>
<p style="{text_style}"><strong>ETag:</strong> {etag}</p>
<p style="{text_style}"><strong>Summary:</strong> {summary}</p>
</div>
</div>
''')
    logger.info("HTML display generated")
    # Joining once avoids repeated string concatenation in the loop.
    return ''.join(cards)
452
+
453
def process_uploaded_file(file, state_bookmarks):
    """
    Process the uploaded bookmarks file.

    Parses the file, fetches every URL, asks the LLM for a summary and
    category per bookmark and rebuilds the FAISS index.

    Args:
        file: Raw bytes of the uploaded bookmarks HTML file, or None.
        state_bookmarks: Current Gradio state value; returned unchanged on failure.

    Returns:
        A 5-tuple (message, bookmark_html, state_bookmarks, bookmark_html,
        choices_update). On failure the first HTML slot is '' and the
        remaining values describe the bookmarks that were loaded before the
        call, which are left untouched when the file cannot be used.
    """
    global bookmarks, faiss_index
    logger.info("Processing uploaded file")

    def choice_labels():
        return [
            f"{i+1}. {bookmark['title']} (Category: {bookmark.get('category', 'Uncategorized')})"
            for i, bookmark in enumerate(bookmarks)
        ]

    def failure(text):
        # Keep the selectable choices in step with what display_bookmarks() shows.
        return text, '', state_bookmarks, display_bookmarks(), gr.update(choices=choice_labels())

    def run_for_all(task, workers):
        # Run task(bookmark) for every bookmark and log worker exceptions;
        # executor.map would drop them silently because its results are never read.
        with ThreadPoolExecutor(max_workers=workers) as executor:
            futures = [executor.submit(task, bookmark) for bookmark in bookmarks]
        for future in futures:
            error = future.exception()
            if error is not None:
                logger.error(f"Error while processing a bookmark: {error}", exc_info=error)

    if file is None:
        logger.warning("No file uploaded")
        return failure("Please upload a bookmarks HTML file.")

    try:
        file_content = file.decode('utf-8')
    except UnicodeDecodeError as e:
        logger.error(f"Error decoding the file: {e}", exc_info=True)
        return failure("Error decoding the file. Please ensure it's a valid HTML file.")

    try:
        parsed = parse_bookmarks(file_content)
    except Exception as e:
        logger.error(f"Error parsing bookmarks: {e}", exc_info=True)
        return failure("Error parsing the bookmarks HTML file.")

    if not parsed:
        logger.warning("No bookmarks found in the uploaded file")
        return failure("No bookmarks found in the uploaded file.")

    # Assign unique IDs to bookmarks
    for idx, bookmark in enumerate(parsed):
        bookmark['id'] = idx

    # Replace the global list only now that the upload is known to be usable,
    # and drop the old index: its IDs refer to the previous bookmark list.
    bookmarks = parsed
    faiss_index = None

    # Fetch bookmark info concurrently
    logger.info("Fetching URL info concurrently")
    run_for_all(fetch_url_info, 10)

    # LLM calls run on a single worker so they stay serialized for rate limiting.
    logger.info("Processing bookmarks with LLM concurrently")
    run_for_all(generate_summary_and_assign_category, 1)

    # Later steps index these keys directly; make sure they exist even if a
    # worker failed.
    for bookmark in bookmarks:
        bookmark.setdefault('summary', 'No summary available.')
        bookmark.setdefault('category', 'Uncategorized')

    try:
        faiss_index = vectorize_and_index(bookmarks)
    except Exception as e:
        logger.error(f"Error building FAISS index: {e}", exc_info=True)
        return failure("Error building search index.")

    message = f"✅ Successfully processed {len(bookmarks)} bookmarks."
    logger.info(message)

    # Generate displays and updates
    bookmark_html = display_bookmarks()
    choices = choice_labels()

    # Update state
    state_bookmarks = bookmarks.copy()

    return message, bookmark_html, state_bookmarks, bookmark_html, gr.update(choices=choices)
512
+
513
def delete_selected_bookmarks(selected_indices, state_bookmarks):
    """
    Delete selected bookmarks and remove their vectors from the FAISS index.

    Args:
        selected_indices: Selected choice labels of the form "N. title ...",
            where N is the 1-based position in the bookmark list.
        state_bookmarks: Gradio state value (not used by this function).

    Returns:
        A 3-tuple (message, choices_update, bookmarks_html).
    """
    global bookmarks, faiss_index

    def choice_labels():
        return [
            f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
            for i, bookmark in enumerate(bookmarks)
        ]

    if not selected_indices:
        # Nothing was deleted, so keep offering the existing bookmarks instead
        # of emptying the selection list.
        return "⚠️ No bookmarks selected.", gr.update(choices=choice_labels()), display_bookmarks()

    # A set guards against the same row being selected twice, which would
    # otherwise pop a second, unrelated bookmark.
    positions = set()
    for label in selected_indices:
        idx = int(label.split('.')[0]) - 1
        if 0 <= idx < len(bookmarks):
            positions.add(idx)
            logger.info(f"Deleting bookmark at index {idx + 1}")

    ids_to_delete = [bookmarks[idx]['id'] for idx in positions]

    # Remove vectors from FAISS index
    if faiss_index is not None and ids_to_delete:
        faiss_index.remove_ids(np.array(ids_to_delete, dtype=np.int64))

    # Remove bookmarks from the list (reverse order to avoid index shifting)
    for idx in sorted(positions, reverse=True):
        bookmarks.pop(idx)

    message = "🗑️ Selected bookmarks deleted successfully."
    logger.info(message)

    # Positions were renumbered, so clear the old selection along with the
    # labels; a stale "N. ..." label would now point at a different bookmark.
    return message, gr.update(choices=choice_labels(), value=[]), display_bookmarks()
548
+
549
def edit_selected_bookmarks_category(selected_indices, new_category, state_bookmarks):
    """
    Edit category of selected bookmarks.

    Args:
        selected_indices: Selected choice labels of the form "N. title ...",
            where N is the 1-based position in the bookmark list.
        new_category: Category to assign to every selected bookmark.
        state_bookmarks: Current Gradio state value.

    Returns:
        A 4-tuple (message, choices_update, bookmarks_html, state_bookmarks).
        The state is a fresh copy of the bookmark list after a successful
        edit and the unchanged input otherwise.
    """
    def choice_labels():
        return [
            f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
            for i, bookmark in enumerate(bookmarks)
        ]

    # On a warning nothing changed, so keep offering the existing bookmarks
    # instead of emptying the selection list.
    if not selected_indices:
        return "⚠️ No bookmarks selected.", gr.update(choices=choice_labels()), display_bookmarks(), state_bookmarks
    if not new_category:
        return "⚠️ No new category selected.", gr.update(choices=choice_labels()), display_bookmarks(), state_bookmarks

    for label in selected_indices:
        idx = int(label.split('.')[0]) - 1
        if 0 <= idx < len(bookmarks):
            bookmarks[idx]['category'] = new_category
            logger.info(f"Updated category for bookmark {idx + 1} to {new_category}")

    message = "✏️ Category updated for selected bookmarks."
    logger.info(message)

    # Update state
    state_bookmarks = bookmarks.copy()

    return message, gr.update(choices=choice_labels()), display_bookmarks(), state_bookmarks
575
+
576
def export_bookmarks():
    """
    Write the current bookmarks to a Netscape-style bookmarks HTML file.

    Returns:
        The path of the written file ("exported_bookmarks.html"), or None when
        there are no bookmarks or the export fails.
    """
    if not bookmarks:
        logger.warning("No bookmarks to export")
        return None

    try:
        logger.info("Exporting bookmarks to HTML")
        document = BeautifulSoup("<!DOCTYPE NETSCAPE-Bookmark-file-1><Title>Bookmarks</Title><H1>Bookmarks</H1>", 'html.parser')

        # One <DT><A href=...>title</A></DT> entry per bookmark inside a <DL>.
        listing = document.new_tag('DL')
        for entry in bookmarks:
            item = document.new_tag('DT')
            anchor = document.new_tag('A', href=entry['url'])
            anchor.string = entry['title']
            item.append(anchor)
            listing.append(item)
        document.append(listing)

        rendered = str(document)
        output_file = "exported_bookmarks.html"
        with open(output_file, 'w', encoding='utf-8') as handle:
            handle.write(rendered)
    except Exception as e:
        logger.error(f"Error exporting bookmarks: {e}", exc_info=True)
        return None

    logger.info("Bookmarks exported successfully")
    return output_file
604
+
605
def chatbot_response(user_query, chat_history):
    """
    Generate chatbot response using the FAISS index and embeddings.

    Args:
        user_query: The user's question.
        chat_history: List of {"role", "content"} message dicts, or None. The
            list is extended in place.

    Returns:
        The chat history with the user's message (when bookmarks are loaded)
        and the assistant's reply, warning or error message appended.
    """
    global last_api_call_time

    if chat_history is None:
        chat_history = []

    if not bookmarks or faiss_index is None:
        logger.warning("No bookmarks available for chatbot")
        chat_history.append({"role": "assistant", "content": "⚠️ No bookmarks available. Please upload and process your bookmarks first."})
        return chat_history

    logger.info(f"Chatbot received query: {user_query}")

    max_retries = 3
    max_tokens = 300
    tokens_per_second = 40000 / 60  # 40k tokens-per-minute budget

    try:
        # Appended exactly once, also when the API call below is retried.
        chat_history.append({"role": "user", "content": user_query})

        # Find the bookmarks whose summaries are closest to the query.
        query_vector = embedding_model.encode([user_query]).astype('float32')
        k = 5
        distances, ids = faiss_index.search(query_vector, k)

        # FAISS pads missing results with -1; those never match a bookmark id.
        id_to_bookmark = {bookmark['id']: bookmark for bookmark in bookmarks}
        matching_bookmarks = [
            id_to_bookmark[int(found)] for found in ids.flatten() if int(found) in id_to_bookmark
        ]

        if not matching_bookmarks:
            answer = "No relevant bookmarks found for your query."
            chat_history.append({"role": "assistant", "content": answer})
            return chat_history

        bookmarks_info = "\n".join([
            f"Title: {bookmark['title']}\nURL: {bookmark['url']}\nSummary: {bookmark.get('summary', '')}"
            for bookmark in matching_bookmarks
        ])

        # The prompt starts and ends with a newline.
        prompt = "\n".join([
            "",
            f"A user asked: \"{user_query}\"",
            "Based on the bookmarks below, provide a helpful answer to the user's query, referencing the relevant bookmarks.",
            "Bookmarks:",
            bookmarks_info,
            "Provide a concise and helpful response.",
            "",
        ])

        # Rough token estimate (~4 characters per token) to pace the next call.
        total_tokens = len(prompt) / 4 + max_tokens
        sleep_time = max(total_tokens / tokens_per_second, 2)

        response = None
        for attempt in range(1, max_retries + 1):
            # Rate limiting: keep API calls at least 2 seconds apart.
            with api_lock:
                elapsed = time.time() - last_api_call_time
                if elapsed < 2:
                    sleep_duration = 2 - elapsed
                    logger.info(f"Sleeping for {sleep_duration:.2f} seconds to respect rate limits.")
                    time.sleep(sleep_duration)
                last_api_call_time = time.time()

            try:
                response = openai.ChatCompletion.create(
                    model='llama-3.1-70b-versatile',
                    messages=[
                        {"role": "user", "content": prompt}
                    ],
                    max_tokens=int(max_tokens),
                    temperature=0.7,
                )
                break
            except openai.error.RateLimitError as e:
                # Retry a bounded number of times; after the last attempt the
                # error is reported to the user by the handler below.
                if attempt == max_retries:
                    raise
                headers = getattr(e, 'headers', None) or {}
                try:
                    wait_time = max(0, int(float(headers.get("Retry-After", 5))))
                except (TypeError, ValueError):
                    wait_time = 5
                logger.warning(f"Rate limit reached. Waiting for {wait_time} seconds before retrying...")
                time.sleep(wait_time)

        answer = response['choices'][0]['message']['content'].strip()
        logger.info("Chatbot response generated")
        time.sleep(sleep_time)

        chat_history.append({"role": "assistant", "content": answer})
        return chat_history

    except Exception as e:
        error_message = f"⚠️ Error processing your query: {str(e)}"
        logger.error(error_message, exc_info=True)
        chat_history.append({"role": "assistant", "content": error_message})
        return chat_history
693
def build_app():
    """
    Build and launch the Gradio app.

    Lays out three tabs (upload/process, chat, manage), wires every button to
    its module-level callback, and then starts the server with
    ``demo.launch(debug=True)``.

    Returns:
        None. Any exception raised while building or launching is logged with
        its traceback and echoed to stdout; it is not re-raised.
    """
    try:
        logger.info("Building Gradio app")
        with gr.Blocks(css="app.css") as demo:
            # Initialize state
            state_bookmarks = gr.State([])

            # General Overview
            gr.Markdown("""
            # 📚 SmartMarks - AI Browser Bookmarks Manager

            Welcome to **SmartMarks**, your intelligent assistant for managing browser bookmarks. SmartMarks leverages AI to help you organize, search, and interact with your bookmarks seamlessly.

            ---

            ## 🚀 **How to Use SmartMarks**

            SmartMarks is divided into three main sections:

            1. **📂 Upload and Process Bookmarks:** Import your existing bookmarks and let SmartMarks analyze and categorize them for you.
            2. **💬 Chat with Bookmarks:** Interact with your bookmarks using natural language queries to find relevant links effortlessly.
            3. **🛠️ Manage Bookmarks:** View, edit, delete, and export your bookmarks with ease.

            Navigate through the tabs to explore each feature in detail.
            """)

            # Upload and Process Bookmarks Tab
            with gr.Tab("Upload and Process Bookmarks"):
                gr.Markdown("""
                ## 📂 **Upload and Process Bookmarks**

                ### 📝 **Steps to Upload and Process:**

                1. **Upload Bookmarks File:**
                   - Click on the **"📁 Upload Bookmarks HTML File"** button.
                   - Select your browser's exported bookmarks HTML file from your device.

                2. **Process Bookmarks:**
                   - After uploading, click on the **"⚙️ Process Bookmarks"** button.
                   - SmartMarks will parse your bookmarks, fetch additional information, generate summaries, and categorize each link based on predefined categories.

                3. **View Processed Bookmarks:**
                   - Once processing is complete, your bookmarks will be displayed in an organized and visually appealing format below.
                """)

                upload = gr.File(label="📁 Upload Bookmarks HTML File", type='binary')
                process_button = gr.Button("⚙️ Process Bookmarks")
                output_text = gr.Textbox(label="✅ Output", interactive=False)
                bookmark_display = gr.HTML(label="📄 Processed Bookmarks")

            # Chat with Bookmarks Tab
            with gr.Tab("Chat with Bookmarks"):
                gr.Markdown("""
                ## 💬 **Chat with Bookmarks**

                ### 🤖 **How to Interact:**

                1. **Enter Your Query:**
                   - In the **"✍️ Ask about your bookmarks"** textbox, type your question or keyword related to your bookmarks.

                2. **Submit Your Query:**
                   - Click the **"📨 Send"** button to submit your query.

                3. **Receive AI-Driven Responses:**
                   - SmartMarks will analyze your query and provide relevant bookmarks that match your request.

                4. **View Chat History:**
                   - All your queries and the corresponding AI responses are displayed in the chat history.
                """)

                chatbot = gr.Chatbot(label="💬 Chat with SmartMarks", type='messages')
                user_input = gr.Textbox(
                    label="✍️ Ask about your bookmarks",
                    placeholder="e.g., Do I have any bookmarks about AI?"
                )
                chat_button = gr.Button("📨 Send")

                chat_button.click(
                    chatbot_response,
                    inputs=[user_input, chatbot],
                    outputs=chatbot
                )

            # Manage Bookmarks Tab
            with gr.Tab("Manage Bookmarks"):
                gr.Markdown("""
                ## 🛠️ **Manage Bookmarks**

                ### 🗂️ **Features:**

                1. **View Bookmarks:**
                   - All your processed bookmarks are displayed here with their respective categories and summaries.

                2. **Select Bookmarks:**
                   - Use the checkboxes next to each bookmark to select one, multiple, or all bookmarks you wish to manage.

                3. **Delete Selected Bookmarks:**
                   - After selecting the desired bookmarks, click the **"🗑️ Delete Selected"** button to remove them from your list.

                4. **Edit Categories:**
                   - Select the bookmarks you want to re-categorize.
                   - Choose a new category from the dropdown menu labeled **"🆕 New Category"**.
                   - Click the **"✏️ Edit Category"** button to update their categories.

                5. **Export Bookmarks:**
                   - Click the **"💾 Export"** button to download your updated bookmarks as an HTML file.

                6. **Refresh Bookmarks:**
                   - Click the **"🔄 Refresh Bookmarks"** button to ensure the latest state is reflected in the display.
                """)

                manage_output = gr.Textbox(label="🔄 Status", interactive=False)

                # The selector lives in this tab; its choices start empty and
                # are filled in by the event handlers wired below.
                bookmark_selector = gr.CheckboxGroup(
                    label="✅ Select Bookmarks",
                    choices=[]
                )

                new_category = gr.Dropdown(
                    label="🆕 New Category",
                    choices=CATEGORIES,
                    value="Uncategorized"
                )
                bookmark_display_manage = gr.HTML(label="📄 Bookmarks")

                with gr.Row():
                    delete_button = gr.Button("🗑️ Delete Selected")
                    edit_category_button = gr.Button("✏️ Edit Category")
                    export_button = gr.Button("💾 Export")
                    refresh_button = gr.Button("🔄 Refresh Bookmarks")

                download_link = gr.File(label="📥 Download Exported Bookmarks")

            def _refresh_manage_tab(current_bookmarks):
                """Rebuild the selector's choices and the Manage-tab HTML from state."""
                labels = [
                    f"{i+1}. {bookmark['title']} (Category: {bookmark['category']})"
                    for i, bookmark in enumerate(current_bookmarks)
                ]
                # Returning a bare list would set the CheckboxGroup's *value*
                # (ticking every box) rather than its choices, so send an
                # explicit update and clear any stale selection.
                return gr.update(choices=labels, value=[]), display_bookmarks()

            # Connect all the button actions
            process_button.click(
                process_uploaded_file,
                inputs=[upload, state_bookmarks],
                # The fourth slot previously repeated `bookmark_display`, which
                # left the Manage tab's HTML empty after processing.
                # NOTE(review): assumes the handler's 2nd and 4th return values
                # are both rendered bookmark HTML - confirm against
                # process_uploaded_file.
                outputs=[
                    output_text,
                    bookmark_display,
                    state_bookmarks,
                    bookmark_display_manage,
                    bookmark_selector,
                ]
            )

            # NOTE(review): unlike the edit handler below, this one does not
            # write back to `state_bookmarks`; verify that
            # delete_selected_bookmarks keeps the state in sync some other way.
            delete_button.click(
                delete_selected_bookmarks,
                inputs=[bookmark_selector, state_bookmarks],
                outputs=[manage_output, bookmark_selector, bookmark_display_manage]
            )

            edit_category_button.click(
                edit_selected_bookmarks_category,
                inputs=[bookmark_selector, new_category, state_bookmarks],
                outputs=[manage_output, bookmark_selector, bookmark_display_manage, state_bookmarks]
            )

            export_button.click(
                export_bookmarks,
                outputs=download_link
            )

            refresh_button.click(
                _refresh_manage_tab,
                inputs=[state_bookmarks],
                outputs=[bookmark_selector, bookmark_display_manage]
            )

        logger.info("Launching Gradio app")
        demo.launch(debug=True)
    except Exception as e:
        # Top-level boundary: report the failure instead of crashing the process.
        logger.error(f"Error building the app: {e}", exc_info=True)
        print(f"Error building the app: {e}")
871
 
872
# Script entry point: build and launch the UI only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    build_app()