Commit 1383c28 · 1 Parent(s): 2a0dafd
chore
Signed-off-by: Suvaditya Mukherjee <[email protected]>
- app.py +7 -1
- requirements.txt +3 -1
- utils.py +89 -1
app.py
CHANGED
@@ -9,7 +9,7 @@ import spaces
 import pymupdf
 import gradio as gr
 from qdrant_client import QdrantClient
-from utils import download_pdf_from_gdrive, merge_strings_with_prefix
+from utils import download_pdf_from_gdrive, merge_strings_with_prefix, scrape_website
 from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig
 
 def rag_query(query: str):
@@ -98,12 +98,18 @@ def update_chat_history(chat_history, tool_query, query_results):
 if __name__ == "__main__":
     RESUME_PATH = os.path.join(os.getcwd(), "Resume.pdf")
     RESUME_URL = "https://drive.google.com/file/d/1YMF9NNTG5gubwJ7ipI5JfxAJKhlD9h2v/"
+    WEBSITE_URL = "https://www.suvadityamuk.com"
 
     # Download file
     download_pdf_from_gdrive(RESUME_URL, RESUME_PATH)
 
     doc = pymupdf.open(RESUME_PATH)
     fulltext = doc[0].get_text().split("\n")
+
+    # Scrape website
+    website_text = scrape_website(WEBSITE_URL)
+    fulltext = fulltext + website_text
+
     fulltext = merge_strings_with_prefix(fulltext)
 
     # Embed the sentences
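Note: as defined in utils.py below, scrape_website returns a single string, while fulltext here is a list of lines produced by split("\n"), so concatenating the two directly raises a TypeError. A minimal adjustment (a sketch, assuming merge_strings_with_prefix expects a list of lines; not part of this commit) would be:

    # Split the scraped text into lines before extending the list of resume lines
    website_text = scrape_website(WEBSITE_URL)
    fulltext = fulltext + website_text.split("\n")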
requirements.txt
CHANGED
@@ -13,4 +13,6 @@ optimum
 wandb
 psutil
 optimum-quanto
-pynvml
+pynvml
+beautifulsoup4
+requests
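The two new entries back the scraper added in utils.py: requests fetches each page and beautifulsoup4 parses the HTML. Assuming a standard pip workflow for the Space's environment, they can be installed locally with:

    pip install beautifulsoup4 requests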
utils.py
CHANGED
@@ -1,6 +1,10 @@
 import gdown
 import os
-from urllib.parse import urlparse, parse_qs
+from urllib.parse import urlparse, parse_qs, urljoin
+import requests
+from bs4 import BeautifulSoup
+import time
+from collections import deque
 
 def download_pdf_from_gdrive(url, output_path=None):
     """
@@ -83,3 +87,87 @@ def merge_strings_with_prefix(strings):
 
     return result
 
+def scrape_website(start_url, delay=1):
+    """
+    Scrapes all pages of a website and returns their content as a single string.
+
+    Args:
+        start_url (str): The starting URL of the website
+        delay (int): Delay between requests in seconds to be polite
+
+    Returns:
+        str: Combined content from all pages
+    """
+    # Initialize sets for tracking
+    visited_urls = set()
+    domain = urlparse(start_url).netloc
+    queue = deque([start_url])
+    all_content = []
+
+    def is_valid_url(url):
+        """Check if URL belongs to the same domain and is a webpage"""
+        parsed = urlparse(url)
+        return (
+            parsed.netloc == domain and
+            parsed.path.split('.')[-1] not in ['pdf', 'jpg', 'png', 'gif', 'jpeg'] and
+            '#' not in url
+        )
+
+    def extract_text_content(soup):
+        """Extract meaningful text content from a BeautifulSoup object"""
+        # Remove script and style elements
+        for script in soup(["script", "style", "header", "footer", "nav"]):
+            script.decompose()
+
+        # Get text content
+        text = soup.get_text(separator=' ', strip=True)
+
+        # Clean up whitespace
+        lines = (line.strip() for line in text.splitlines())
+        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
+        text = ' '.join(chunk for chunk in chunks if chunk)
+
+        return text
+
+    def get_links(soup, base_url):
+        """Extract all valid links from a page"""
+        links = []
+        for a_tag in soup.find_all('a', href=True):
+            url = urljoin(base_url, a_tag['href'])
+            if is_valid_url(url):
+                links.append(url)
+        return links
+
+    # Main scraping loop
+    while queue:
+        url = queue.popleft()
+        if url in visited_urls:
+            continue
+
+        try:
+            print(f"Scraping: {url}")
+            response = requests.get(url, timeout=10)
+            response.raise_for_status()
+
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Extract content
+            content = extract_text_content(soup)
+            all_content.append(f"URL: {url}\n{content}\n")
+
+            # Add new links to queue
+            links = get_links(soup, url)
+            for link in links:
+                if link not in visited_urls:
+                    queue.append(link)
+
+            visited_urls.add(url)
+            time.sleep(delay)  # Be polite
+
+        except Exception as e:
+            print(f"Error scraping {url}: {str(e)}")
+            continue
+
+    # Combine all content into a single string
+    combined_content = "\n\n".join(all_content)
+    return combined_content
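For a quick standalone check of the new helper (a sketch; the URL mirrors the WEBSITE_URL constant in app.py, and the printout is illustrative):

    from utils import scrape_website

    # Breadth-first crawl limited to the start URL's domain, one request per second
    text = scrape_website("https://www.suvadityamuk.com", delay=1)
    print(f"Scraped {len(text)} characters")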