Commit 1383c28 · 1 Parent(s): 2a0dafd
chore
Signed-off-by: Suvaditya Mukherjee <[email protected]>
- app.py +7 -1
- requirements.txt +3 -1
- utils.py +89 -1
app.py
CHANGED
@@ -9,7 +9,7 @@ import spaces
 import pymupdf
 import gradio as gr
 from qdrant_client import QdrantClient
-from utils import download_pdf_from_gdrive, merge_strings_with_prefix
+from utils import download_pdf_from_gdrive, merge_strings_with_prefix, scrape_website
 from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig
 
 def rag_query(query: str):
@@ -98,12 +98,18 @@ def update_chat_history(chat_history, tool_query, query_results):
 if __name__ == "__main__":
     RESUME_PATH = os.path.join(os.getcwd(), "Resume.pdf")
     RESUME_URL = "https://drive.google.com/file/d/1YMF9NNTG5gubwJ7ipI5JfxAJKhlD9h2v/"
+    WEBSITE_URL = "https://www.suvadityamuk.com"
 
     # Download file
     download_pdf_from_gdrive(RESUME_URL, RESUME_PATH)
 
     doc = pymupdf.open(RESUME_PATH)
     fulltext = doc[0].get_text().split("\n")
+
+    # Scrape website
+    website_text = scrape_website(WEBSITE_URL)
+    fulltext = fulltext + website_text
+
     fulltext = merge_strings_with_prefix(fulltext)
 
     # Embed the sentences
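Note: as defined in utils.py below, scrape_website returns a single string, while fulltext here is a list of lines produced by split("\n"), so concatenating the two directly raises a TypeError. A minimal adjustment (a sketch, assuming merge_strings_with_prefix expects a list of lines; not part of this commit) would be:

    # Split the scraped text into lines before extending the list of resume lines
    website_text = scrape_website(WEBSITE_URL)
    fulltext = fulltext + website_text.split("\n")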
requirements.txt
CHANGED
@@ -13,4 +13,6 @@ optimum
 wandb
 psutil
 optimum-quanto
-pynvml
+pynvml
+beautifulsoup4
+requests
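The two new entries back the scraper added in utils.py: requests fetches each page and beautifulsoup4 parses the HTML. Assuming a standard pip workflow for the Space's environment, they can be installed locally with:

    pip install beautifulsoup4 requests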
utils.py
CHANGED
@@ -1,6 +1,10 @@
 import gdown
 import os
-from urllib.parse import urlparse, parse_qs
+from urllib.parse import urlparse, parse_qs, urljoin
+import requests
+from bs4 import BeautifulSoup
+import time
+from collections import deque
 
 def download_pdf_from_gdrive(url, output_path=None):
     """
@@ -83,3 +87,87 @@ def merge_strings_with_prefix(strings):
 
     return result
 
+def scrape_website(start_url, delay=1):
+    """
+    Scrapes all pages of a website and returns their content as a single string.
+
+    Args:
+        start_url (str): The starting URL of the website
+        delay (int): Delay between requests in seconds to be polite
+
+    Returns:
+        str: Combined content from all pages
+    """
+    # Initialize sets for tracking
+    visited_urls = set()
+    domain = urlparse(start_url).netloc
+    queue = deque([start_url])
+    all_content = []
+
+    def is_valid_url(url):
+        """Check if URL belongs to the same domain and is a webpage"""
+        parsed = urlparse(url)
+        return (
+            parsed.netloc == domain and
+            parsed.path.split('.')[-1] not in ['pdf', 'jpg', 'png', 'gif', 'jpeg'] and
+            '#' not in url
+        )
+
+    def extract_text_content(soup):
+        """Extract meaningful text content from a BeautifulSoup object"""
+        # Remove script and style elements
+        for script in soup(["script", "style", "header", "footer", "nav"]):
+            script.decompose()
+
+        # Get text content
+        text = soup.get_text(separator=' ', strip=True)
+
+        # Clean up whitespace
+        lines = (line.strip() for line in text.splitlines())
+        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+        text = ' '.join(chunk for chunk in chunks if chunk)
+
+        return text
+
+    def get_links(soup, base_url):
+        """Extract all valid links from a page"""
+        links = []
+        for a_tag in soup.find_all('a', href=True):
+            url = urljoin(base_url, a_tag['href'])
+            if is_valid_url(url):
+                links.append(url)
+        return links
+
+    # Main scraping loop
+    while queue:
+        url = queue.popleft()
+        if url in visited_urls:
+            continue
+
+        try:
+            print(f"Scraping: {url}")
+            response = requests.get(url, timeout=10)
+            response.raise_for_status()
+
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Extract content
+            content = extract_text_content(soup)
+            all_content.append(f"URL: {url}\n{content}\n")
+
+            # Add new links to queue
+            links = get_links(soup, url)
+            for link in links:
+                if link not in visited_urls:
+                    queue.append(link)
+
+            visited_urls.add(url)
+            time.sleep(delay)  # Be polite
+
+        except Exception as e:
+            print(f"Error scraping {url}: {str(e)}")
+            continue
+
+    # Combine all content into a single string
+    combined_content = "\n\n".join(all_content)
+    return combined_content
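For a quick standalone check of the new helper (a sketch; the URL mirrors the WEBSITE_URL constant in app.py, and the printout is illustrative):

    from utils import scrape_website

    # Breadth-first crawl limited to the start URL's domain, one request per second
    text = scrape_website("https://www.suvadityamuk.com", delay=1)
    print(f"Scraped {len(text)} characters")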