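"""Scrape the web pages listed in links.txt, embed the extracted text with
OpenAI's text-embedding-3-large model, and upsert the resulting vectors into
a Pinecone serverless index named "lyca".

Requires OPENAI_API_KEY and PINECONE_API_KEY in the environment (or a .env
file), plus the openai, pinecone client, python-dotenv, beautifulsoup4, lxml
and playwright packages (run `playwright install chromium` once for the
browser binary).
"""
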
import argparse
import os
import time
import uuid

from bs4 import BeautifulSoup
from dotenv import load_dotenv
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from playwright.sync_api import sync_playwright

load_dotenv()

# Set up OpenAI client
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Set up Pinecone
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

index_name = "lyca"  # Your index name

def ensure_index_exists():
    """Return a handle to the Pinecone index, creating it first if it is missing."""
    if index_name not in pc.list_indexes().names():
        print(f"Index '{index_name}' does not exist. Creating it now...")
        pc.create_index(
            name=index_name,
            dimension=3072,  # Output dimension of text-embedding-3-large
            metric="cosine",
            spec=ServerlessSpec(
                cloud="aws",
                region="us-west-2"
            )
        )
        print(f"Index '{index_name}' created successfully.")
    else:
        print(f"Index '{index_name}' already exists.")

    return pc.Index(index_name)

def get_embedding(text):
    response = client.embeddings.create(input=text, model="text-embedding-3-large")
    return response.data[0].embedding

def process_web_link(url):
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page()
            page.goto(url)
            
            # Wait for the content to load
            time.sleep(5)  # Adjust this value if needed
            
            # Get the full page content
            content = page.content()
            
            browser.close()

        # Parse the page content using BeautifulSoup
        soup = BeautifulSoup(content, 'lxml')
        
        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()
        
        # Get text
        text = soup.get_text()
        
        # Clean up the text
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(chunk for chunk in chunks if chunk)
        
        return text
    except Exception as e:
        print(f"Error processing web link {url}: {str(e)}")
        return None  # Signal failure so the caller can skip this URL

def process_and_upsert_link(url, index):
    print(f"Processing {url}")
    content = process_web_link(url)
    if not content:
        print(f"Skipping {url}: no content extracted")
        return

    doc_id = str(uuid.uuid4())
    content = content[:5000]  # Keep a single, embedding-sized chunk per page
    content_length = len(content)
    print(f"Content extracted, length: {content_length}")

    embedding = get_embedding(content)
    vector = (doc_id, embedding, {
        "text": content, 
        "type": "Web Link",
        "doc_id": doc_id,
        "doc_name": url,
        "chunk_index": 0
    })
    
    print(f"Generated vector for {url}")
    
    index.upsert(vectors=[vector])
    print(f"Vector upserted to Pinecone for {url}")

def clean_database(index):
    try:
        print("Cleaning the database...")
        index.delete(delete_all=True)
        print("Database cleaned.")
    except Exception as e:
        print(f"Error cleaning database: {str(e)}")
        print("Continuing with the script...")

def main():
    parser = argparse.ArgumentParser(description="Process web links and upsert to Pinecone.")
    parser.add_argument("--clean", action="store_true", help="Clean the database before upserting")
    args = parser.parse_args()

    index = ensure_index_exists()

    if args.clean:
        clean_database(index)

    with open('links.txt', 'r') as file:
        links = [line.strip() for line in file if line.strip()]
    
    for link in links:
        process_and_upsert_link(link, index)

if __name__ == "__main__":
    main()
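
# Example invocation (the script filename here is illustrative):
#   python process_links.py --clean
# The --clean flag wipes the index before re-ingesting every URL in links.txt.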