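"""Scrape the web pages listed in links.txt, embed the extracted text with
OpenAI's text-embedding-3-large model, and upsert the resulting vectors into
a Pinecone serverless index named "lyca".

Requires OPENAI_API_KEY and PINECONE_API_KEY in the environment (or a .env
file), plus the openai, pinecone client, python-dotenv, beautifulsoup4, lxml
and playwright packages (run `playwright install chromium` once for the
browser binary).
"""
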
import argparse
import os
import time
import uuid

from bs4 import BeautifulSoup
from dotenv import load_dotenv
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from playwright.sync_api import sync_playwright

load_dotenv()

# Set up OpenAI client
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Set up Pinecone
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

index_name = "lyca"  # Your index name

def ensure_index_exists():
    """Return a handle to the Pinecone index, creating it first if it is missing."""
    if index_name not in pc.list_indexes().names():
        print(f"Index '{index_name}' does not exist. Creating it now...")
        pc.create_index(
            name=index_name,
            dimension=3072,  # Output dimension of text-embedding-3-large
            metric="cosine",
            spec=ServerlessSpec(
                cloud="aws",
                region="us-west-2"
            )
        )
        print(f"Index '{index_name}' created successfully.")
    else:
        print(f"Index '{index_name}' already exists.")

    return pc.Index(index_name)

def get_embedding(text):
    response = client.embeddings.create(input=text, model="text-embedding-3-large")
    return response.data[0].embedding

def process_web_link(url):
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page()
            page.goto(url)
            
            # Wait for the content to load
            time.sleep(5)  # Adjust this value if needed
            
            # Get the full page content
            content = page.content()
            
            browser.close()

        # Parse the page content using BeautifulSoup
        soup = BeautifulSoup(content, 'lxml')
        
        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()
        
        # Get text
        text = soup.get_text()
        
        # Clean up the text
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(chunk for chunk in chunks if chunk)
        
        return text
    except Exception as e:
        print(f"Error processing web link {url}: {str(e)}")
        return None  # Signal failure so the caller can skip this URL

def process_and_upsert_link(url, index):
    print(f"Processing {url}")
    content = process_web_link(url)
    if not content:
        print(f"Skipping {url}: no content extracted")
        return

    doc_id = str(uuid.uuid4())
    content = content[:5000]  # Keep a single, embedding-sized chunk per page
    content_length = len(content)
    print(f"Content extracted, length: {content_length}")

    embedding = get_embedding(content)
    vector = (doc_id, embedding, {
        "text": content, 
        "type": "Web Link",
        "doc_id": doc_id,
        "doc_name": url,
        "chunk_index": 0
    })
    
    print(f"Generated vector for {url}")
    
    index.upsert(vectors=[vector])
    print(f"Vector upserted to Pinecone for {url}")

def clean_database(index):
    try:
        print("Cleaning the database...")
        index.delete(delete_all=True)
        print("Database cleaned.")
    except Exception as e:
        print(f"Error cleaning database: {str(e)}")
        print("Continuing with the script...")

def main():
    parser = argparse.ArgumentParser(description="Process web links and upsert to Pinecone.")
    parser.add_argument("--clean", action="store_true", help="Clean the database before upserting")
    args = parser.parse_args()

    index = ensure_index_exists()

    if args.clean:
        clean_database(index)

    with open('links.txt', 'r') as file:
        links = [line.strip() for line in file if line.strip()]
    
    for link in links:
        process_and_upsert_link(link, index)

if __name__ == "__main__":
    main()
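
# Example invocation (the script filename here is illustrative):
#   python process_links.py --clean
# The --clean flag wipes the index before re-ingesting every URL in links.txt.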