suvadityamuk committed
Commit 1383c28 · Parent: 2a0dafd

Signed-off-by: Suvaditya Mukherjee <[email protected]>

Files changed (3)
  1. app.py +7 -1
  2. requirements.txt +3 -1
  3. utils.py +89 -1
app.py CHANGED
@@ -9,7 +9,7 @@ import spaces
 import pymupdf
 import gradio as gr
 from qdrant_client import QdrantClient
-from utils import download_pdf_from_gdrive, merge_strings_with_prefix
+from utils import download_pdf_from_gdrive, merge_strings_with_prefix, scrape_website
 from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig
 
 def rag_query(query: str):
@@ -98,12 +98,18 @@ def update_chat_history(chat_history, tool_query, query_results):
 if __name__ == "__main__":
     RESUME_PATH = os.path.join(os.getcwd(), "Resume.pdf")
     RESUME_URL = "https://drive.google.com/file/d/1YMF9NNTG5gubwJ7ipI5JfxAJKhlD9h2v/"
+    WEBSITE_URL = "https://www.suvadityamuk.com"
 
     # Download file
     download_pdf_from_gdrive(RESUME_URL, RESUME_PATH)
 
     doc = pymupdf.open(RESUME_PATH)
     fulltext = doc[0].get_text().split("\n")
+
+    # Scrape website
+    website_text = scrape_website(WEBSITE_URL)
+    fulltext = fulltext + website_text
+
     fulltext = merge_strings_with_prefix(fulltext)
 
     # Embed the sentences
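Note that scrape_website (added below in utils.py) returns one combined string, while fulltext at this point is a list of lines from the resume PDF, so the concatenation above mixes the two types. A minimal sketch of keeping both sides as lists of lines before merge_strings_with_prefix runs; splitting the scraped text on newlines is an assumption, not something this commit does:

# Sketch only, not part of the commit: split the scraped text into lines so it
# matches fulltext, which is already a list of lines from the resume PDF.
website_text = scrape_website(WEBSITE_URL)
fulltext = fulltext + website_text.split("\n")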
requirements.txt CHANGED
@@ -13,4 +13,6 @@ optimum
 wandb
 psutil
 optimum-quanto
-pynvml
+pynvml
+beautifulsoup4
+requests
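The two new packages back the scraper added in utils.py: requests fetches each page and beautifulsoup4 parses the HTML. A self-contained sketch of the parsing step the scraper relies on; the HTML string here is illustrative only:

from bs4 import BeautifulSoup

# Illustrative HTML; the real scraper parses response.text from requests.
html = "<html><body><nav>menu</nav><p>Hello <b>world</b></p></body></html>"
soup = BeautifulSoup(html, "html.parser")
soup.nav.decompose()  # drop navigation, as extract_text_content does
print(soup.get_text(separator=" ", strip=True))  # -> Hello world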
utils.py CHANGED
@@ -1,6 +1,10 @@
 import gdown
 import os
-from urllib.parse import urlparse, parse_qs
+from urllib.parse import urlparse, parse_qs, urljoin
+import requests
+from bs4 import BeautifulSoup
+import time
+from collections import deque
 
 def download_pdf_from_gdrive(url, output_path=None):
     """
@@ -83,3 +87,87 @@ def merge_strings_with_prefix(strings):
 
     return result
 
+def scrape_website(start_url, delay=1):
+    """
+    Scrapes all pages of a website and returns their content as a single string.
+
+    Args:
+        start_url (str): The starting URL of the website
+        delay (int): Delay between requests in seconds to be polite
+
+    Returns:
+        str: Combined content from all pages
+    """
+    # Initialize sets for tracking
+    visited_urls = set()
+    domain = urlparse(start_url).netloc
+    queue = deque([start_url])
+    all_content = []
+
+    def is_valid_url(url):
+        """Check if URL belongs to the same domain and is a webpage"""
+        parsed = urlparse(url)
+        return (
+            parsed.netloc == domain and
+            parsed.path.split('.')[-1] not in ['pdf', 'jpg', 'png', 'gif', 'jpeg'] and
+            '#' not in url
+        )
+
+    def extract_text_content(soup):
+        """Extract meaningful text content from a BeautifulSoup object"""
+        # Remove script and style elements
+        for script in soup(["script", "style", "header", "footer", "nav"]):
+            script.decompose()
+
+        # Get text content
+        text = soup.get_text(separator=' ', strip=True)
+
+        # Clean up whitespace
+        lines = (line.strip() for line in text.splitlines())
+        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+        text = ' '.join(chunk for chunk in chunks if chunk)
+
+        return text
+
+    def get_links(soup, base_url):
+        """Extract all valid links from a page"""
+        links = []
+        for a_tag in soup.find_all('a', href=True):
+            url = urljoin(base_url, a_tag['href'])
+            if is_valid_url(url):
+                links.append(url)
+        return links
+
+    # Main scraping loop
+    while queue:
+        url = queue.popleft()
+        if url in visited_urls:
+            continue
+
+        try:
+            print(f"Scraping: {url}")
+            response = requests.get(url, timeout=10)
+            response.raise_for_status()
+
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Extract content
+            content = extract_text_content(soup)
+            all_content.append(f"URL: {url}\n{content}\n")
+
+            # Add new links to queue
+            links = get_links(soup, url)
+            for link in links:
+                if link not in visited_urls:
+                    queue.append(link)
+
+            visited_urls.add(url)
+            time.sleep(delay)  # Be polite
+
+        except Exception as e:
+            print(f"Error scraping {url}: {str(e)}")
+            continue
+
+    # Combine all content into a single string
+    combined_content = "\n\n".join(all_content)
+    return combined_content
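scrape_website is a breadth-first crawler confined to the start URL's domain: each page is fetched with requests, cleaned by extract_text_content (script, style, header, footer and nav tags removed), and any in-domain links found by get_links are queued until the frontier is empty, pausing delay seconds between requests. A minimal usage sketch, run from the repo root so utils.py is importable:

from utils import scrape_website

# Usage sketch: crawl the site wired into app.py and inspect the result.
# Each page contributes a block beginning with "URL: <page url>".
corpus = scrape_website("https://www.suvadityamuk.com", delay=1)
print(f"Scraped {len(corpus)} characters")
print(corpus[:300])

Because is_valid_url rejects any link containing '#', in-page anchors are skipped, and the extension check keeps binary assets such as PDFs and images out of the crawl.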