poemsforaphrodite commited on
Commit
c9e6ba4
1 Parent(s): cde8866

Upload folder using huggingface_hub

Browse files
Files changed (7) hide show
  1. .gitignore +1 -0
  2. README.md +11 -0
  3. app.py +383 -0
  4. links.txt +223 -0
  5. requirements.txt +8 -0
  6. scrape.py +104 -0
  7. upsert.py +128 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: RAG Chat
3
+ emoji: ⚡
4
+ colorFrom: purple
5
+ colorTo: blue
6
+ sdk: streamlit
7
+ sdk_version: 1.37.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
app.py ADDED
@@ -0,0 +1,383 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ from openai import OpenAI
4
+ from PyPDF2 import PdfReader
5
+ import requests
6
+ from youtube_transcript_api import YouTubeTranscriptApi
7
+ from urllib.parse import urlparse, parse_qs
8
+ from pinecone import Pinecone
9
+ import uuid
10
+ from dotenv import load_dotenv
11
+ import time
12
+ from concurrent.futures import ThreadPoolExecutor, as_completed
13
+ from bs4 import BeautifulSoup
14
+ from selenium import webdriver
15
+ from selenium.webdriver.chrome.service import Service
16
+ from webdriver_manager.chrome import ChromeDriverManager
17
+ from selenium.webdriver.chrome.options import Options
18
+ import time
19
+ import re
20
+ from pymongo import MongoClient
21
+ from pymongo.errors import ConnectionFailure
22
+ from datetime import datetime
23
+
24
+ # Set page config at the very beginning
25
+ st.set_page_config(layout="wide")
26
+
27
+ # Load environment variables
28
+ load_dotenv()
29
+
30
+ # Set up OpenAI client
31
+ client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
32
+
33
+ # Set up Pinecone
34
+ pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
35
+
36
+ index_name = "lyca" # Your index name
37
+ index = pc.Index(index_name)
38
+
39
# Set up MongoDB connection.
# The Mongo client is bound to its own name (`mongo_client`). Binding it to
# `client` would overwrite the OpenAI client created above, which the
# embedding and chat helpers in this file rely on.
mongo_uri = os.getenv("MONGODB_URI")
if not mongo_uri:
    # The message names the variable that is actually read above.
    st.error("MONGODB_URI is not set. Please check your .env file.")
else:
    print(f"MONGODB_URI loaded: {mongo_uri[:10]}...")  # Print only first 10 chars for security

# Default to "no collection" so later code can test for None on any failure.
sim_swap_collection = None
try:
    mongo_client = MongoClient(mongo_uri, serverSelectionTimeoutMS=5000)
    mongo_client.server_info()  # This will raise an exception if the connection fails
    db = mongo_client['lyca']
    sim_swap_collection = db['sim_swap_requests']
except ConnectionFailure:
    st.error("Failed to connect to MongoDB. Please check your connection and try again later.")
    sim_swap_collection = None
55
+
56
def get_embedding(text):
    """Return the embedding vector for ``text``.

    Sends ``text`` to the embeddings endpoint of the module-level ``client``
    with the ``text-embedding-3-large`` model and returns the embedding of
    the first item in the response.
    """
    result = client.embeddings.create(model="text-embedding-3-large", input=text)
    first_item = result.data[0]
    return first_item.embedding
59
+
60
def process_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: Whatever ``PdfReader`` accepts (a path or a file-like object).

    Returns:
        The text of all pages, each page followed by a newline.
    """
    reader = PdfReader(file)
    # `or ''` guards against a page yielding no text, which would otherwise
    # break the concatenation. A single join avoids repeated string rebuilding.
    return "".join(f"{page.extract_text() or ''}\n" for page in reader.pages)
66
+
67
def process_web_link(url):
    """Render ``url`` in headless Chrome and return its visible text.

    The page is loaded with Selenium, parsed with BeautifulSoup, stripped of
    ``<script>``/``<style>`` elements and reduced to non-empty text fragments
    joined by newlines.

    Args:
        url: Address of the page to fetch.

    Returns:
        The cleaned page text, or a string of the form
        ``"Error processing <url>: <reason>"`` if anything fails.
    """
    try:
        # Set up Selenium options
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run in headless mode for performance
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")

        # Install the Chrome driver automatically using webdriver-manager
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

        try:
            # Navigate to the URL
            driver.get(url)

            # Give the page some time to load fully
            time.sleep(3)

            # Extract the rendered page's content
            page_source = driver.page_source
        finally:
            # Always close the browser, even when navigation fails, so a
            # failed page load does not leave a Chrome process behind.
            driver.quit()

        # Parse the page content using BeautifulSoup
        soup = BeautifulSoup(page_source, 'lxml')

        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()

        # Get text
        text = soup.get_text()

        # Clean up the text: trim each line, split it into phrases and keep
        # only the non-empty ones.
        # NOTE(review): the separator is reproduced as it appears here; the
        # common form of this snippet splits on a double space - confirm.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = '\n'.join(chunk for chunk in chunks if chunk)

        return text
    except Exception as e:
        print(f"Error processing web link {url}: {str(e)}")
        return f"Error processing {url}: {str(e)}"
109
+
110
def process_youtube_link(url):
    """Return the transcript of a YouTube video as one space-joined string.

    Args:
        url: A YouTube video URL understood by ``extract_video_id``.

    Returns:
        The ``text`` of every transcript entry, joined by single spaces.

    Raises:
        ValueError: If no video ID can be extracted from ``url``.
    """
    video_id = extract_video_id(url)
    if not video_id:
        # Fail with a clear message instead of passing None to the transcript API.
        raise ValueError(f"Could not extract a YouTube video ID from URL: {url}")
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    return " ".join(entry['text'] for entry in transcript)
114
+
115
def extract_video_id(url):
    """Extract the video ID from a YouTube URL.

    Supported forms:
        * ``https://youtu.be/<id>``
        * ``https://[www.|m.|music.]youtube.com/watch?v=<id>``
        * ``https://[www.|m.|music.]youtube.com/{embed,v,shorts,live}/<id>``

    Args:
        url: The URL to inspect.

    Returns:
        The video ID string, or ``None`` when the URL is not a recognised
        YouTube video URL or carries no ID.
    """
    parsed_url = urlparse(url)
    hostname = parsed_url.hostname or ""

    if hostname == 'youtu.be':
        # First path segment is the ID; an empty path yields None, not ''.
        return parsed_url.path.strip('/').split('/')[0] or None

    if hostname in ('www.youtube.com', 'youtube.com', 'm.youtube.com', 'music.youtube.com'):
        if parsed_url.path == '/watch':
            # .get avoids a KeyError when the `v` parameter is missing.
            values = parse_qs(parsed_url.query).get('v')
            return values[0] if values else None

        segments = parsed_url.path.split('/')
        if len(segments) > 2 and segments[1] in ('embed', 'v', 'shorts', 'live'):
            return segments[2] or None

    return None
127
+
128
def process_upload(upload_type, file_or_link, file_name=None):
    """Extract, chunk, embed and index one document.

    Args:
        upload_type: One of ``"PDF"``, ``"Web Link"`` or ``"YouTube Link"``.
        file_or_link: The PDF file object, or the URL to process.
        file_name: Optional display name for PDFs.

    Returns:
        A human-readable status string: ``"Invalid upload type"`` for an
        unknown type, a "no content" notice when nothing was extracted, or a
        completion message otherwise.
    """
    print(f"Starting process_upload for {upload_type}")
    doc_id = str(uuid.uuid4())
    print(f"Generated doc_id: {doc_id}")

    if upload_type == "PDF":
        content = process_pdf(file_or_link)
        doc_name = file_name or "Uploaded PDF"
    elif upload_type == "Web Link":
        content = process_web_link(file_or_link)
        doc_name = file_or_link
    elif upload_type == "YouTube Link":
        content = process_youtube_link(file_or_link)
        doc_name = f"YouTube: {file_or_link}"
    else:
        print("Invalid upload type")
        return "Invalid upload type"

    # Nothing to embed: return early instead of sending an empty upsert.
    if not content or not content.strip():
        print("No content extracted; nothing to index")
        return f"No content could be extracted for {upload_type}. Document Name: {doc_name}"

    content_length = len(content)
    print(f"Content extracted, length: {content_length}")

    # Dynamically adjust chunk size based on content length
    if content_length < 10000:
        chunk_size = 1000
    elif content_length < 100000:
        chunk_size = 2000
    else:
        chunk_size = 4000
    print(f"Using chunk size: {chunk_size}")

    chunks = [content[i:i+chunk_size] for i in range(0, content_length, chunk_size)]

    # The progress bar is optional: it is only updated when a caller has
    # placed one in session state under "upload_progress".
    progress_bar = st.session_state.get("upload_progress")

    vectors = []
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_chunk, chunk, doc_id, i, upload_type, doc_name) for i, chunk in enumerate(chunks)]

        for future in as_completed(futures):
            vectors.append(future.result())
            if progress_bar is not None:
                progress_bar.progress(len(vectors) / len(chunks))

    print(f"Generated {len(vectors)} vectors")

    # Upsert in small batches so a large document is not sent as a single
    # oversized request.
    upsert_batch_size = 50
    for start in range(0, len(vectors), upsert_batch_size):
        index.upsert(vectors=vectors[start:start + upsert_batch_size])
    print("Vectors upserted to Pinecone")

    return f"Processing complete for {upload_type}. Document Name: {doc_name}"
176
+
177
def process_chunk(chunk, doc_id, i, upload_type, doc_name):
    """Build the ``(id, embedding, metadata)`` record for one text chunk.

    The record id is ``"<doc_id>_<i>"``. The metadata carries the chunk text,
    its upload type, the owning document's id and name, and the chunk index.
    """
    vector = get_embedding(chunk)
    record_id = f"{doc_id}_{i}"
    metadata = {
        "text": chunk,
        "type": upload_type,
        "doc_id": doc_id,
        "doc_name": doc_name,
        "chunk_index": i,
    }
    return (record_id, vector, metadata)
186
+
187
def get_relevant_context(query, top_k=5):
    """Look up the chunks most similar to ``query`` in the vector index.

    Returns a tuple ``(context, matches)``. ``matches`` are the index matches
    ordered by ``(doc_id, chunk_index)`` so chunks of one document stay in
    reading order. ``context`` is their text joined by newlines.
    """
    print(f"Getting relevant context for query: {query}")
    embedded_query = get_embedding(query)

    response = index.query(vector=embedded_query, top_k=top_k, include_metadata=True)
    print(f"Found {len(response['matches'])} relevant results")

    def document_position(match):
        # Order by owning document first, then by position inside it.
        meta = match['metadata']
        return (meta['doc_id'], meta['chunk_index'])

    ordered_matches = sorted(response['matches'], key=document_position)

    texts = [match['metadata']['text'] for match in ordered_matches]
    return "\n".join(texts), ordered_matches
199
+
200
def check_lyca_data_loaded():
    """Return True when the vector index reports at least one stored vector."""
    vector_count = index.describe_index_stats()['total_vector_count']
    return vector_count > 0
204
+
205
def load_lyca_mobile_data():
    """Index every page listed in ``links.txt`` unless the index already has data.

    Returns:
        A status string saying whether the data was already present or has
        just been loaded.
    """
    if check_lyca_data_loaded():
        return "Lyca Mobile data is already loaded."

    # Use a context manager so the file handle is closed deterministically.
    with open('links.txt', 'r') as links_file:
        stripped = (line.strip() for line in links_file)
        # Drop blank lines and repeated URLs (keeping first-seen order) so
        # the same page is not scraped and embedded more than once.
        lyca_links = list(dict.fromkeys(link for link in stripped if link))

    for link in lyca_links:
        process_upload("Web Link", link)
    return "Lyca Mobile data loaded into vector database"
213
+
214
def general_conversation(message):
    """Answer ``message`` without retrieved context.

    Sends a fixed Lyca Mobile system prompt plus the user message to the
    ``gpt-4o-mini`` chat model through the module-level ``client`` and
    returns the text of the first choice.
    """
    system_prompt = "You are a helpful assistant for Lyca Mobile customers. If you don't know the answer, politely say so."
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": message},
    ]
    completion = client.chat.completions.create(model="gpt-4o-mini", messages=conversation)
    first_choice = completion.choices[0]
    return first_choice.message.content
223
+
224
def is_sim_swap_request(message):
    """Heuristically decide whether ``message`` asks for a SIM swap.

    The message is lower-cased and split into alphanumeric words, so
    punctuation attached to a word (``"sim,"``, ``"SIM?"``, ``"sim-swap"``)
    does not hide a keyword. The message counts as a SIM swap request when
    at least two distinct keywords are present.

    Args:
        message: The raw user message.

    Returns:
        True if two or more keywords occur in the message, else False.
    """
    sim_swap_keywords = {'sim', 'swap', 'change', 'new', 'replace'}
    # Tokenise on non-alphanumerics rather than stripping only a trailing '?'.
    message_words = set(re.findall(r"[a-z0-9]+", message.lower()))
    return len(sim_swap_keywords & message_words) >= 2
230
+
231
+ # Add a print statement for debugging
232
+ print(f"is_sim_swap_request result: {is_sim_swap_request('how to change my sim?')}")
233
+
234
def trigger_sim_swap_workflow():
    """Switch the session into the SIM swap workflow at its first step."""
    session = st.session_state
    session['workflow'] = 'sim_swap'
    session['workflow_step'] = 0
237
+
238
def process_sim_swap_workflow():
    """Render the SIM swap form and store a submitted request in MongoDB.

    On submit, the request is rejected with a message when the database
    collection is unavailable or when a required field is blank. Otherwise
    the data is inserted into ``sim_swap_collection`` and the workflow is
    cleared from session state.
    """
    st.subheader("SIM Swap Request Form")

    with st.form("sim_swap_form"):
        full_name = st.text_input("Please enter your full name:")
        phone_number = st.text_input("Please enter your phone number:")
        email = st.text_input("Please enter your email address:")
        current_sim = st.text_input("Please enter your current SIM card number:")
        reason = st.text_area("Please enter the reason for SIM swap:")

        submitted = st.form_submit_button("Submit")

    if not submitted:
        return

    if sim_swap_collection is None:
        st.error("Unable to process your request due to a database connection issue. Please try again later.")
        return

    # Refuse blank submissions so empty requests are never stored.
    required_fields = {
        "full name": full_name,
        "phone number": phone_number,
        "email address": email,
        "current SIM card number": current_sim,
    }
    missing = [label for label, value in required_fields.items() if not value.strip()]
    if missing:
        st.warning(f"Please fill in the following required fields: {', '.join(missing)}.")
        return

    user_data = {
        "full_name": full_name.strip(),
        "phone_number": phone_number.strip(),
        "email": email.strip(),
        "current_sim": current_sim.strip(),
        "reason": reason.strip(),
        "timestamp": datetime.now()
    }

    try:
        sim_swap_collection.insert_one(user_data)
        st.success("Thank you for providing your information. Your SIM swap request has been submitted and stored successfully.")
        st.session_state.workflow = None
    except Exception as e:
        st.error(f"An error occurred while storing your information: {str(e)}")
        st.warning("Please try submitting your request again. If the problem persists, please contact support.")
270
+
271
def chat_with_ai(message):
    """Answer ``message``, grounding the reply in retrieved context when relevant.

    Context is retrieved with ``get_relevant_context``. If the best match has
    a similarity score of at least 0.4 the model is asked to answer from that
    context and the matches are returned as sources. Otherwise the question
    goes to ``general_conversation`` and no sources are returned.

    Args:
        message: The user's question.

    Returns:
        A tuple ``(ai_response, sources)``. ``sources`` is a list of dicts
        with ``doc_id``, ``doc_name``, ``chunk_index``, ``text``, ``type``
        and ``score``. On any error an apology string and an empty list are
        returned.
    """
    try:
        # get_relevant_context embeds the query itself, so no separate
        # embedding call is made here.
        context, results = get_relevant_context(message)

        # Results arrive ordered by document position, not by relevance, so
        # the threshold is tested against the best score of all matches.
        best_score = max((result['score'] for result in results), default=0.0)

        if best_score >= 0.4:
            messages = [
                {"role": "system", "content": "You are a helpful assistant for Lyca Mobile. Use the following information to answer the user's question, but don't mention the context directly in your response. If the information isn't in the context, say you don't know."},
                {"role": "system", "content": f"Context: {context}"},
                {"role": "user", "content": message}
            ]

            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=messages
            )

            ai_response = response.choices[0].message.content

            sources = [
                {
                    "doc_id": result['metadata']['doc_id'],
                    "doc_name": result['metadata']['doc_name'],
                    "chunk_index": result['metadata']['chunk_index'],
                    "text": result['metadata']['text'],
                    "type": result['metadata']['type'],
                    "score": result['score']
                }
                for result in results
            ]
        else:
            # Fallback to general conversation if no relevant context is found or similarity is low
            ai_response = general_conversation(message)
            sources = []

        return ai_response, sources
    except Exception as e:
        print(f"Error in chat_with_ai: {str(e)}")
        return "I'm sorry, but I encountered an error while processing your request. Please try again later.", []
310
+
311
def clear_database():
    """Delete every vector from the index and return a confirmation message."""
    confirmation = "Database cleared successfully."
    print("Clearing database...")
    index.delete(delete_all=True)
    print("Database cleared")
    return confirmation
316
+
317
# Streamlit UI
st.title("Lyca Mobile Assistant")

# Initialise per-session state on first run.
if 'workflow' not in st.session_state:
    st.session_state.workflow = None
    st.session_state.workflow_data = []
    st.session_state.workflow_step = 0

if 'chat_history' not in st.session_state:
    st.session_state.chat_history = []

# Two columns: chat on the left (wider), sources on the right.
col1, col2 = st.columns([2, 1])

with col1:
    st.header("Chat")

    if st.session_state.workflow == 'sim_swap':
        process_sim_swap_workflow()
    else:
        # Display chat history
        for message in st.session_state.chat_history:
            st.markdown(f"**{'You' if message['role'] == 'user' else 'AI'}:** {message['content']}")

        user_input = st.text_input("How can I assist you with Lyca Mobile today?")
        if st.button("Send"):
            if user_input:
                print(f"User input: {user_input}")
                is_swap_request = is_sim_swap_request(user_input)
                print(f"Is sim swap request: {is_swap_request}")

                if is_swap_request:
                    print("Triggering SIM swap workflow")
                    st.session_state.chat_history.append({"role": "user", "content": user_input})
                    st.session_state.chat_history.append({"role": "assistant", "content": "Certainly! I can help you with changing your SIM. Please fill out the following form to start the SIM swap process."})
                    st.session_state.workflow = 'sim_swap'
                    # Rerun immediately so the form is rendered now; without
                    # this the script finishes the chat branch and the form
                    # only appears on the next interaction.
                    st.rerun()
                else:
                    print("Proceeding with regular chat flow")
                    st.session_state.chat_progress = st.progress(0)
                    response, sources = chat_with_ai(user_input)
                    st.session_state.chat_progress.progress(1.0)

                    # Add to chat history
                    st.session_state.chat_history.append({"role": "user", "content": user_input})
                    st.session_state.chat_history.append({"role": "assistant", "content": response})

                    # Display the latest messages (the history above was
                    # rendered before they were appended).
                    st.markdown("**You:** " + user_input)
                    st.markdown("**AI:** " + response)

                    # Store sources in session state for display in col2
                    st.session_state.sources = sources
                    st.session_state.chat_progress.empty()
            else:
                st.warning("Please enter a question.")

with col2:
    st.header("Source Information")
    if 'sources' in st.session_state and st.session_state.sources:
        for i, source in enumerate(st.session_state.sources, 1):
            with st.expander(f"Source {i} - {source['type']} ({source['doc_name']})"):
                st.markdown(f"**Chunk Index:** {source['chunk_index']}")
                st.text(source['text'])
    else:
        st.info("Ask a question to see source information here.")
links.txt ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ https://www.lycamobile.us/faq/do-you-offer-4g-hspa
2
+ https://www.lycamobile.us/help/frequently-asked-question
3
+ https://www.lycamobile.us/help/contact-us
4
+ https://www.lycamobile.us/en/online-security
5
+ https://www.lycamobile.us/store-locator
6
+ https://www.lycamobile.us/international-credit/can-i-schedule-automatic-top-up-of-international-credit-for-my-lyca-mobile-account
7
+ https://www.lycamobile.us/faq/how-to-unlock-my-handset
8
+ https://www.lycamobile.us/activate-sim
9
+ https://www.lycamobile.us/faq/how-do-i-make-an-international-call-with-lyca-mobile
10
+ https://www.lycamobile.us/international-credit/how-can-i-top-up-international-credit-my-lyca-mobile-number
11
+ https://www.lycamobile.us/how-to/how-do-i-check-if-a-bundle-is-active-on-my-lycamobile
12
+ https://www.lycamobile.us/help-support
13
+ https://www.lycamobile.us/use-of-this-website
14
+ https://www.lycamobile.us/help/how-to-switch-to-lyca-mobile/
15
+ https://www.lycamobile.us/california-mts/
16
+ https://www.lycamobile.us/about-us
17
+ https://www.lycamobile.us/help/mobile-web-settings/
18
+ https://www.lycamobile.us/how-to/how-do-i-send-a-text-message-from-the-us-to-another-country-with-lyca-mobile
19
+ https://www.lycamobile.us/faq/how-do-i-activate-roaming-facility-using-lyca-mobile
20
+ https://www.lycamobile.us/how-to/how-to-activate-my-new-sim-and-prepay-plan
21
+ https://www.lycamobile.us/termscondition
22
+ https://www.lycamobile.us/how-to/how-do-i-make-an-international-call-with-lyca-mobile
23
+ https://www.lycamobile.us/registration
24
+ https://www.lycamobile.us/faq/how-much-does-it-cost-to-access-the-voicemail
25
+ https://www.lycamobile.us/help/wi-fi-calling-and-text/
26
+ https://www.lycamobile.us/how-to/how-to-retrieve-your-lyca-mobile-number
27
+ https://www.lycamobile.us/help/order
28
+ https://www.lycamobile.us/faq/how-do-i-set-up-auto-renewal
29
+ https://www.lycamobile.us/ios
30
+ https://www.lycamobile.us/faq/where-can-i-find-my-lyca-mobile-number
31
+ https://www.lycamobile.us/faq/can-i-send-premium-sms-and-make-premium-calls-using-lyca-mobile
32
+ https://www.lycamobile.us/plans/prepaid-phone-plans#best-value
33
+ https://www.lycamobile.us/how-to/how-long-would-it-take-to-swap-my-sim-to-a-plus-sim-card
34
+ https://www.lycamobile.us/en/
35
+ https://www.lycamobile.us/how-to/how-do-i-check-my-lycamobile-number-data-and-call
36
+ https://www.lycamobile.us/help/international-credit
37
+ https://www.lycamobile.us/plans/prepaid-phone-plans/refill-plans/
38
+ https://www.lycamobile.us/en/activate-sim/?utm_source=website&utm_medium=onpage&utm_campaign=JoinLyca-EGO_USA_ENG_WCO_GLP_BRND
39
+ https://www.lycamobile.us/joined-lyca/how-to-switch-to-lyca-mobile
40
+ https://www.lycamobile.us/about-to-join-lyca-mobile/what-information-do-i-need-to-provide
41
+ https://www.lycamobile.us/help/how-to
42
+ https://www.lycamobile.us/faq/i-have-not-used-my-lyca-mobile-for-a-while-and-it-has-now-stopped-working-why-is-this
43
+ https://www.lycamobile.us/en/lia-chat
44
+ https://www.lycamobile.us/2g_shutdown/
45
+ https://www.lycamobile.us/en/cookie-policy
46
+ https://www.lycamobile.us/help/data-add-on
47
+ https://www.lycamobile.us/faq/i-have-forgotten-my-pin-puk-code-where-can-i-find-it
48
+ https://www.lycamobile.us/joined-lyca/can-i-retain-my-current-mobile-number
49
+ https://www.lycamobile.us/help/esim
50
+ https://www.lycamobile.us/help/portin-status
51
+ https://www.lycamobile.us/faq/how-can-i-stop-receiving-unwanted-texts-or-spam
52
+ https://www.lycamobile.us/en/plans/prepay-plans/?utm_source=website&utm_medium=onpage&utm_campaign=JoinLyca-EGO_USA_ENG_WCO_GLP_BRND#best-value
53
+ https://www.lycamobile.us/international-credit/can-i-transfer-my-top-up-balance-to-another-customer
54
+ https://www.lycamobile.us/blog/en/
55
+ https://www.lycamobile.us/faq/what-is-my-lyca-mobile
56
+ https://www.lycamobile.us/faq/how-long-would-it-take-to-swap-my-sim-to-a-plus-sim-card
57
+ https://www.lycamobile.us/faq/do-i-have-to-sign-a-contract
58
+ https://www.lycamobile.us/plans/prepaid-phone-plans/refill-plans
59
+ https://www.lycamobile.us/plans/prepaid-phone-plans#30-days-plans
60
+ https://www.lycamobile.us/faq/does-lyca-mobile-charge-taxes-on-my-order
61
+ https://www.lycamobile.us/faq/how-do-i-send-a-text-message-from-the-us-to-another-country-with-lyca-mobile
62
+ https://www.lycamobile.us/already-joined-us/how-to-enable-auto-renewal
63
+ https://www.lycamobile.us/android
64
+ https://www.lycamobile.us/port-in/
65
+ https://www.lycamobile.us/become-a-retailer
66
+ https://www.lycamobile.us/joined-lyca/can-i-top-up-online-my-lyca-mobile
67
+ https://www.lycamobile.us/international-credit/how-can-i-check-my-balance-after-topping-up
68
+ https://www.lycamobile.us/how-to/how-to-check-balance-of-my-plan-allowance
69
+ https://www.lycamobile.us/california-billing-notice
70
+ https://www.lycamobile.us/en/freesim/?utm_source=website&utm_medium=onpage&utm_campaign=JoinLyca-EGO_USA_ENG_WCO_GLP_BRND
71
+ https://www.lycamobile.us/faq/are-there-any-monthly-or-hidden-charges
72
+ https://www.lycamobile.us/en/activate-sim/?utm_source=website&utm_medium=onpage&utm_campaign=HelpFAQ-EGO_USA_ENG_WCO_GLP_BRND
73
+ https://www.lycamobile.us/help/general-faq
74
+ https://www.lycamobile.us/faq/does-lyca-mobile-offer-group-porting/multi-subscription
75
+ https://www.lycamobile.us/international-credit/how-can-i-view-my-international-credit-top-up-history-and-transactions
76
+ https://www.lycamobile.us/port-in
77
+ https://www.lycamobile.us/help/frequently-asked-question/
78
+ https://www.lycamobile.us/help/renewal
79
+ https://www.lycamobile.us/faq/what-different-plans-do-you-offer
80
+ https://www.lycamobile.us/cheap_call/cheap-calls-to-india
81
+ https://www.lycamobile.us/help/data-allowance
82
+ https://www.lycamobile.us/help/4g-coverage-and-services
83
+ https://www.lycamobile.us/en/lia-chat/
84
+ https://www.lycamobile.us/en/registration/?utm_source=website&utm_medium=onpage&utm_campaign=HelpFAQ-EGO_USA_ENG_WCO_GLP_BRND
85
+ https://www.lycamobile.us/how-to/do-you-have-any-monthly-or-hidden-fees
86
+ https://www.lycamobile.us/about-to-join-lyca-mobile/do-i-need-to-sign-a-contract-for-a-prepay
87
+ https://www.lycamobile.us/en/?utm_source=website&utm_medium=onpage&utm_campaign=JoinedLyca-EGO_USA_ENG_WCO_GLP_BRND
88
+ https://www.lycamobile.us/quick-top-up
89
+ https://www.lycamobile.us/about-to-join-lyca-mobile/how-to-order-a-lyca-mobile-sim
90
+ https://www.lycamobile.us/joined-lyca/what-is-the-process-for-obtaining-my-lyca-mobile
91
+ https://www.lycamobile.us/how-to/how-to-check-lycamobile-internet-data-balance
92
+ https://www.lycamobile.us/faq/how-can-i-deactivate-my-voicemail-service
93
+ https://www.lycamobile.us/faq/is-this-website-accessible
94
+ https://www.lycamobile.us/how-to/how-can-i-deactivate-my-voicemail-service
95
+ https://www.lycamobile.us/faq/why-is-there-a-flashing-envelope-on-my-cell
96
+ https://www.lycamobile.us/help/joined-lyca
97
+ https://www.lycamobile.us/international-credit/what-do-i-do-if-i-face-issues-while-topping-up-international-credit-online
98
+ https://www.lycamobile.us/freesim
99
+ https://www.lycamobile.us/international-credit/is-there-a-minimum-top-up-amount-for-my-prepay-number
100
+ https://www.lycamobile.us/help-support/
101
+ https://www.lycamobile.us/help/about-to-join-lyca-mobile
102
+ https://www.lycamobile.us/become-a-retailer/
103
+ https://www.lycamobile.us/how-to/i-have-not-used-my-lycamobile-for-a-while-and-it-has-now-stopped-working-why-is-this
104
+ https://www.lycamobile.us/international-credit/what-does-international-credit-addon-mean
105
+ https://www.lycamobile.us/plans/prepaid-phone-plans#long-term-plans
106
+ https://www.lycamobile.us/en/activate-sim/
107
+ https://www.lycamobile.us/about-to-join-lyca-mobile/is-it-mandatory-to-order-lyca-mobile
108
+ https://www.lycamobile.us/rates/national
109
+ https://www.lycamobile.us/faq/how-do-i-contact-someone-if-i-have-a-problem
110
+ https://www.lycamobile.us/studentbeans/
111
+ https://www.lycamobile.us/privacy-policy
112
+ https://www.lycamobile.us/en/activate-sim/?utm_source=website&utm_medium=onpage&utm_campaign=JoinedLyca-EGO_USA_ENG_WCO_GLP_BRND
113
+ https://www.lycamobile.us/help/lycamobile.co.uk
114
+ https://www.lycamobile.us/activate-plan/
115
+ https://www.lycamobile.us/help/sms-notifications
116
+ https://www.lycamobile.us/joined-lyca/how-to-activate-your-pay-as-you-go-sim
117
+ https://www.lycamobile.us/help/how-to-switch-to-lyca-mobile
118
+ https://www.lycamobile.us/how-to/how-to-activate-mobile-internet-on-my-phone
119
+ https://www.lycamobile.us/help/General
120
+ https://www.lycamobile.us/faq/how-to-activate-my-sim-card
121
+ https://www.lycamobile.us/faq/my-sim-card-is-lost-stolen-how-do-i-prevent-someone-else-from-using-it
122
+ https://www.lycamobile.us/how-to/how-can-i-do-a-quick-recharge-or-refill-my-lyca-prepay-number-international-calling-credit
123
+ https://www.lycamobile.us/en/quick-top-up/?utm_source=website&utm_medium=onpage&utm_campaign=JoinedLyca-EGO_USA_ENG_WCO_GLP_BRND
124
+ https://www.lycamobile.us/plans/buy-a-additional-line
125
+ https://www.lycamobile.us/help/already-joined-us
126
+ https://www.lycamobile.us/plan-changes-update/
127
+ https://www.lycamobile.us/already-joined-us/how-to-manage-your-saved-credit-debit-cards
128
+ https://www.lycamobile.us/faq/which-mobile-handsets-can-i-use-with-lyca-mobile
129
+ https://www.lycamobile.us/plans/prepaid-phone-plans#best-value
130
+ https://www.lycamobile.us/help/contact-us
131
+ https://www.lycamobile.us/ios
132
+ https://www.lycamobile.us/help/how-to-switch-to-lyca-mobile
133
+ https://www.lycamobile.us/cheap_call/cheap-calls-to-india
134
+ https://www.lycamobile.us/store-locator
135
+ https://www.lycamobile.us/help/portin-status
136
+ https://www.lycamobile.us/quick-top-up
137
+ https://www.lycamobile.us/termscondition
138
+ https://www.lycamobile.us/activate-plan/
139
+ https://www.lycamobile.us/become-a-retailer/
140
+ https://www.lycamobile.us/activate-sim
141
+ https://www.lycamobile.us/help-support
142
+ https://www.lycamobile.us/help/mobile-web-settings/
143
+ https://www.lycamobile.us/en/cookie-policy
144
+ https://www.lycamobile.us/rates/national
145
+ https://www.lycamobile.us/plans/prepaid-phone-plans#30-days-plans
146
+ https://www.lycamobile.us/use-of-this-website
147
+ https://www.lycamobile.us/registration
148
+ https://www.lycamobile.us/plans/prepaid-phone-plans/refill-plans/
149
+ https://www.lycamobile.us/california-mts/
150
+ https://www.lycamobile.us/en/activate-sim/
151
+ https://www.lycamobile.us/en/lia-chat/
152
+ https://www.lycamobile.us/en/online-security
153
+ https://www.lycamobile.us/studentbeans/
154
+ https://www.lycamobile.us/california-billing-notice
155
+ https://www.lycamobile.us/become-a-retailer
156
+ https://www.lycamobile.us/plan-changes-update/
157
+ https://www.lycamobile.us/2g_shutdown/
158
+ https://www.lycamobile.us/android
159
+ https://www.lycamobile.us/port-in
160
+ https://www.lycamobile.us/en/lia-chat
161
+ https://www.lycamobile.us/help/frequently-asked-question
162
+ https://www.lycamobile.us/freesim
163
+ https://www.lycamobile.us/help/4g-coverage-and-services
164
+ https://www.lycamobile.us/plans/prepaid-phone-plans#long-term-plans
165
+ https://www.lycamobile.us/blog/en/
166
+ https://www.lycamobile.us/about-us
167
+ https://www.lycamobile.us/help/frequently-asked-question/
168
+ https://www.lycamobile.us/privacy-policy
169
+ https://www.lycamobile.us/become-a-retailer/
170
+ https://www.lycamobile.us/help/General
171
+ https://www.lycamobile.us/help/data-add-on
172
+ https://www.lycamobile.us/blog/en/
173
+ https://www.lycamobile.us/activate-sim
174
+ https://www.lycamobile.us/studentbeans/
175
+ https://www.lycamobile.us/use-of-this-website
176
+ https://www.lycamobile.us/help/portin-status
177
+ https://www.lycamobile.us/california-billing-notice
178
+ https://www.lycamobile.us/california-mts/
179
+ https://www.lycamobile.us/help/about-to-join-lyca-mobile
180
+ https://www.lycamobile.us/en/cookie-policy
181
+ https://www.lycamobile.us/help/mobile-web-settings/
182
+ https://www.lycamobile.us/quick-top-up
183
+ https://www.lycamobile.us/registration
184
+ https://www.lycamobile.us/about-to-join-lyca-mobile/do-i-need-to-sign-a-contract-for-a-prepay
185
+ https://www.lycamobile.us/help/contact-us
186
+ https://www.lycamobile.us/activate-plan/
187
+ https://www.lycamobile.us/help/data-allowance
188
+ https://www.lycamobile.us/about-to-join-lyca-mobile/how-to-order-a-lyca-mobile-sim
189
+ https://www.lycamobile.us/rates/national
190
+ https://www.lycamobile.us/help/international-credit
191
+ https://www.lycamobile.us/en/online-security
192
+ https://www.lycamobile.us/joined-lyca/can-i-top-up-online-my-lyca-mobile
193
+ https://www.lycamobile.us/android
194
+ https://www.lycamobile.us/about-us
195
+ https://www.lycamobile.us/ios
196
+ https://www.lycamobile.us/store-locator
197
+ https://www.lycamobile.us/help/how-to
198
+ https://www.lycamobile.us/help/order
199
+ https://www.lycamobile.us/plans/prepaid-phone-plans#best-value
200
+ https://www.lycamobile.us/help/general-faq
201
+ https://www.lycamobile.us/help/frequently-asked-question
202
+ https://www.lycamobile.us/help/already-joined-us
203
+ https://www.lycamobile.us/help/esim
204
+ https://www.lycamobile.us/help/joined-lyca
205
+ https://www.lycamobile.us/already-joined-us/how-to-enable-auto-renewal
206
+ https://www.lycamobile.us/help/sms-notifications
207
+ https://www.lycamobile.us/termscondition
208
+ https://www.lycamobile.us/help/how-to-switch-to-lyca-mobile
209
+ https://www.lycamobile.us/help/wi-fi-calling-and-text/
210
+ https://www.lycamobile.us/privacy-policy
211
+ https://www.lycamobile.us/help/renewal
212
+ https://www.lycamobile.us/joined-lyca/how-to-activate-your-pay-as-you-go-sim
213
+ https://www.lycamobile.us/en/
214
+ https://www.lycamobile.us/joined-lyca/can-i-retain-my-current-mobile-number
215
+ https://www.lycamobile.us/help-support
216
+ https://www.lycamobile.us/plan-changes-update/
217
+ https://www.lycamobile.us/about-to-join-lyca-mobile/is-it-mandatory-to-order-lyca-mobile
218
+ https://www.lycamobile.us/about-to-join-lyca-mobile/what-information-do-i-need-to-provide
219
+ https://www.lycamobile.us/joined-lyca/how-to-switch-to-lyca-mobile
220
+ https://www.lycamobile.us/help/4g-coverage-and-services
221
+ https://www.lycamobile.us/already-joined-us/how-to-manage-your-saved-credit-debit-cards
222
+ https://www.lycamobile.us/freesim
223
+ https://www.lycamobile.us/2g_shutdown/
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ openai
2
+ pinecone-client
3
+ python-dotenv
4
+ beautifulsoup4
5
+ selenium
6
+ webdriver-manager
7
+ lxml
8
+ uuid
scrape.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from selenium import webdriver
2
+ from selenium.webdriver.chrome.service import Service
3
+ from selenium.webdriver.chrome.options import Options
4
+ from webdriver_manager.chrome import ChromeDriverManager
5
+ from bs4 import BeautifulSoup
6
+ from urllib.parse import urljoin, urlparse
7
+ import time
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+ import threading
10
+
11
# Crawl state shared by all worker threads: the URLs that have already been
# discovered, and the lock that must be held while reading or updating them.
visited = set()
visited_lock = threading.Lock()
16
+
17
+ # Function to scrape links with depth control
18
def get_all_links(url, max_depth, current_depth=0):
    """Return the same-domain links on ``url`` that have not been seen before.

    The page is rendered in a headless Chrome instance so that links injected
    by JavaScript are included. Every returned URL is also recorded in the
    module-level ``visited`` set (guarded by ``visited_lock``), so concurrent
    calls never return the same URL twice.

    Args:
        url: Absolute URL of the page to load.
        max_depth: Deepest crawl level that may still be scraped.
        current_depth: Crawl level of ``url``.

    Returns:
        A list of absolute URLs. The list is empty when ``current_depth``
        exceeds ``max_depth`` or when the page could not be fetched.
    """
    if current_depth > max_depth:
        return []

    driver = None
    try:
        print(f"Scraping: {url} at depth {current_depth}")

        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run in headless mode

        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)

        driver.get(url)

        # Fixed pause so client-side rendering can finish (adjust if needed).
        time.sleep(5)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Resolve and filter candidates before taking the lock, so the lock is
        # held only for the short membership-test/insert step.
        base_netloc = urlparse(url).netloc
        same_domain = set()
        for a_tag in soup.find_all('a', href=True):
            try:
                full_url = urljoin(url, a_tag['href'])
                if urlparse(full_url).netloc == base_netloc:
                    same_domain.add(full_url)
            except ValueError:
                # A single malformed href must not discard the whole page.
                continue

        links = []
        with visited_lock:
            for full_url in same_domain:
                if full_url not in visited:
                    visited.add(full_url)
                    links.append(full_url)

        return links

    except Exception as e:
        print(f"Error fetching the URL: {e}")
        return []

    finally:
        # Always shut the browser down, even when loading or parsing failed;
        # otherwise every failed page leaks a Chrome process.
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                print(f"Error closing the browser: {e}")
63
+
64
def scrape_recursive(urls, max_depth, current_depth, executor):
    """Crawl outward from ``urls`` and return every link discovered.

    Each level's pages are scraped concurrently with ``get_all_links``; the
    links found on one level become the pages scraped on the next. The crawl
    is done level by level in a loop, so the depth is not bounded by Python's
    recursion limit, and it stops early once a level yields no new links.

    Args:
        urls: Iterable of URLs to scrape at ``current_depth``.
        max_depth: Deepest level that is still scraped.
        current_depth: Level of the URLs in ``urls``.
        executor: ``concurrent.futures`` executor used to run the scrapes.

    Returns:
        A set of all links found. It is always a set, including when
        ``current_depth`` already exceeds ``max_depth`` (empty set).
    """
    all_links = set()
    frontier = list(urls)
    depth = current_depth

    while depth <= max_depth and frontier:
        futures = [
            executor.submit(get_all_links, url, max_depth, depth)
            for url in frontier
        ]

        found = set()
        for future in as_completed(futures):
            try:
                found.update(future.result())
            except Exception as e:
                print(f"Error in thread: {e}")

        all_links.update(found)
        # Only links discovered on this level are scraped on the next one.
        frontier = found
        depth += 1

    return all_links
85
+
86
def main():
    """Prompt for a start URL and depth, crawl the site, and save the links.

    The discovered links are written to ``links.txt`` in the current working
    directory, one per line and in sorted order so repeated runs produce
    comparable files.
    """
    input_url = input("Enter the URL to scrape: ").strip()
    # The crawler compares network locations, so a scheme-less or relative
    # URL cannot work; reject it before launching any browser.
    if not urlparse(input_url).netloc:
        print("Please enter an absolute URL, e.g. https://www.example.com/")
        return

    try:
        max_depth = int(input("Enter the maximum depth: "))
    except ValueError:
        print("The maximum depth must be a whole number.")
        return

    # ThreadPoolExecutor for multithreading
    with ThreadPoolExecutor(max_workers=10) as executor:
        all_links = scrape_recursive([input_url], max_depth, 0, executor)

    # Explicit encoding so the output does not depend on the platform default.
    with open("links.txt", "w", encoding="utf-8") as file:
        file.writelines(f"{link}\n" for link in sorted(all_links))

    print(f"\nFound {len(all_links)} links on the page. Saved to links.txt.")


if __name__ == "__main__":
    main()
upsert.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from openai import OpenAI
3
+ from pinecone import Pinecone, ServerlessSpec
4
+ import uuid
5
+ from dotenv import load_dotenv
6
+ from bs4 import BeautifulSoup
7
+ import requests
8
+ import time
9
+ import argparse
10
+ from playwright.sync_api import sync_playwright
11
+
12
# Load environment variables (the API keys read below) via python-dotenv.
load_dotenv()

# Name of the Pinecone index this script creates and writes to.
index_name = "lyca"

# API clients shared by the functions in this script.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
21
+
22
def ensure_index_exists():
    """Return a handle to the Pinecone index, creating the index if needed.

    Existence is checked by listing the project's indexes rather than by
    catching an arbitrary exception, so authentication or network failures
    surface as errors instead of being mistaken for a missing index.

    Returns:
        The object returned by ``pc.Index(index_name)``.
    """
    if index_name in pc.list_indexes().names():
        print(f"Index '{index_name}' already exists.")
    else:
        print(f"Index '{index_name}' does not exist. Creating it now...")
        pc.create_index(
            name=index_name,
            dimension=3072,  # Dimension for text-embedding-3-large
            metric="cosine",
            spec=ServerlessSpec(
                cloud="aws",
                region="us-west-2"
            )
        )
        # A new index is not usable immediately; poll until it reports ready
        # so the first upsert does not fail.
        while not pc.describe_index(index_name).status["ready"]:
            time.sleep(1)
        print(f"Index '{index_name}' created successfully.")

    return pc.Index(index_name)
40
+
41
def get_embedding(text):
    """Return the text-embedding-3-large embedding for ``text``.

    Sends a single input to the OpenAI embeddings endpoint through the
    module-level ``client`` and returns the embedding of the first (only)
    result.
    """
    result = client.embeddings.create(model="text-embedding-3-large", input=text)
    first_item = result.data[0]
    return first_item.embedding
44
+
45
def process_web_link(url):
    """Render ``url`` in headless Chromium and return its visible text.

    The page is loaded with Playwright, given a fixed five-second pause to
    finish rendering, and parsed with BeautifulSoup. ``<script>`` and
    ``<style>`` elements are dropped, and the remaining text is split on
    line breaks and single spaces, with each non-empty piece placed on its
    own line.

    Returns:
        The cleaned text. On any failure the error is printed and a string
        of the form ``"Error processing <url>: <message>"`` is returned
        instead of raising.
    """
    try:
        with sync_playwright() as playwright:
            chromium = playwright.chromium.launch(headless=True)
            tab = chromium.new_page()
            tab.goto(url)

            # Give client-side rendering time to finish (adjust if needed).
            time.sleep(5)

            html = tab.content()
            chromium.close()

        document = BeautifulSoup(html, 'lxml')

        # Script and style bodies are not page content.
        for tag in document(["script", "style"]):
            tag.decompose()

        raw_text = document.get_text()

        # Break each stripped line on single spaces and keep the non-empty pieces.
        pieces = []
        for raw_line in raw_text.splitlines():
            for fragment in raw_line.strip().split(" "):
                fragment = fragment.strip()
                if fragment:
                    pieces.append(fragment)

        return '\n'.join(pieces)
    except Exception as e:
        print(f"Error processing web link {url}: {str(e)}")
        return f"Error processing {url}: {str(e)}"
79
+
80
def process_and_upsert_link(url, index):
    """Scrape ``url``, embed its text, and upsert one vector into ``index``.

    Only the first 5000 characters of the page text are embedded and stored.
    Pages that could not be scraped, or that yielded no text, are skipped so
    that error messages and empty strings never end up in the index.

    The vector id is derived from the URL (UUIDv5), so processing the same
    URL again overwrites its vector instead of adding a duplicate.

    Args:
        url: Page to process; also stored as ``doc_name`` in the metadata.
        index: Pinecone index handle exposing ``upsert``.
    """
    print(f"Processing {url}")
    content = process_web_link(url)

    # process_web_link signals failure by returning a message with this
    # prefix instead of raising; never index that message as page content.
    if content.startswith(f"Error processing {url}:"):
        print(f"Skipping {url}: page could not be scraped")
        return

    content = content[:5000]
    if not content.strip():
        # Nothing to embed; an empty input would be rejected by the API.
        print(f"Skipping {url}: no text content extracted")
        return

    content_length = len(content)
    print(f"Content extracted, length: {content_length}")

    # Stable id per URL keeps repeated runs idempotent.
    doc_id = str(uuid.uuid5(uuid.NAMESPACE_URL, url))

    embedding = get_embedding(content)
    vector = (doc_id, embedding, {
        "text": content,
        "type": "Web Link",
        "doc_id": doc_id,
        "doc_name": url,
        "chunk_index": 0
    })

    print(f"Generated vector for {url}")

    index.upsert(vectors=[vector])
    print(f"Vector upserted to Pinecone for {url}")
101
+
102
def clean_database(index):
    """Delete every vector from ``index`` on a best-effort basis.

    A failure is reported on stdout and otherwise ignored so the caller can
    carry on with the rest of the script.
    """
    print("Cleaning the database...")
    try:
        index.delete(delete_all=True)
    except Exception as err:
        print(f"Error cleaning database: {str(err)}")
        print("Continuing with the script...")
    else:
        print("Database cleaned.")
110
+
111
def main():
    """Embed every URL listed in ``links.txt`` and upsert it into Pinecone.

    Command-line options:
        --clean  Delete all existing vectors before upserting.

    Blank lines and repeated URLs in ``links.txt`` are ignored. A failure on
    one link is reported and does not stop the remaining links from being
    processed.
    """
    parser = argparse.ArgumentParser(description="Process web links and upsert to Pinecone.")
    parser.add_argument("--clean", action="store_true", help="Clean the database before upserting")
    args = parser.parse_args()

    index = ensure_index_exists()

    if args.clean:
        clean_database(index)

    with open('links.txt', 'r', encoding='utf-8') as file:
        stripped = (line.strip() for line in file)
        # dict.fromkeys drops duplicates while keeping the file's order.
        links = list(dict.fromkeys(link for link in stripped if link))

    failed = []
    for link in links:
        try:
            process_and_upsert_link(link, index)
        except Exception as e:
            # Keep going: one bad page or transient API error should not
            # throw away the rest of the batch.
            print(f"Failed to process {link}: {str(e)}")
            failed.append(link)

    if failed:
        print(f"Finished with {len(failed)} of {len(links)} links failed:")
        for link in failed:
            print(f"  {link}")


if __name__ == "__main__":
    main()