PabloVD committed on
Commit
e19a241
1 Parent(s): 2ccbf76

Request urls instead of reading from file, clean code and improve readability

Browse files
Files changed (2) hide show
  1. app.py +27 -54
  2. urls.txt +0 -42
app.py CHANGED
@@ -4,84 +4,57 @@ from langchain import hub
4
  from langchain_chroma import Chroma
5
  from langchain_core.output_parsers import StrOutputParser
6
  from langchain_core.runnables import RunnablePassthrough
7
- from langchain_mistralai import MistralAIEmbeddings
8
  from langchain_community.embeddings import HuggingFaceInstructEmbeddings
9
  from langchain_text_splitters import RecursiveCharacterTextSplitter
10
  from langchain_mistralai import ChatMistralAI
11
- from langchain_community.document_loaders import PyPDFLoader
12
  import requests
13
- from pathlib import Path
14
  from langchain_community.document_loaders import WebBaseLoader
15
  import bs4
16
  from langchain_core.rate_limiters import InMemoryRateLimiter
17
  from urllib.parse import urljoin
18
 
 
19
  rate_limiter = InMemoryRateLimiter(
20
  requests_per_second=0.1, # <-- MistralAI free tier allows about one request per second; 0.1 is deliberately stricter (one request every 10 seconds)
21
  check_every_n_seconds=0.01, # Wake up every 10 ms to check whether allowed to make a request
22
  max_bucket_size=10, # Controls the maximum burst size.
23
  )
24
 
25
- # # Get data from url
26
- # url = 'https://camels.readthedocs.io/_/downloads/en/latest/pdf/'
27
- # r = requests.get(url, stream=True)
28
- # document_path = Path('data.pdf')
29
-
30
- # document_path.write_bytes(r.content)
31
- # # document_path = "camels-readthedocs-io-en-latest.pdf"
32
- # loader = PyPDFLoader(document_path)
33
- # docs = loader.load()
34
-
35
- # # Load, chunk and index the contents of the blog.
36
- # url = "https://lilianweng.github.io/posts/2023-06-23-agent/"
37
- # loader = WebBaseLoader(
38
- # web_paths=(url,),
39
- # bs_kwargs=dict(
40
- # parse_only=bs4.SoupStrainer(
41
- # class_=("post-content", "post-title", "post-header")
42
- # )
43
- # ),
44
- # )
45
- # loader = WebBaseLoader(url)
46
- # docs = loader.load()
47
-
48
- # def get_subpages(base_url):
49
- # visited_urls = []
50
- # urls_to_visit = [base_url]
51
-
52
- # while urls_to_visit:
53
- # url = urls_to_visit.pop(0)
54
- # if url in visited_urls:
55
- # continue
56
-
57
- # visited_urls.append(url)
58
- # response = requests.get(url)
59
- # soup = bs4.BeautifulSoup(response.content, "html.parser")
60
 
61
- # for link in soup.find_all("a", href=True):
62
- # full_url = urljoin(base_url, link['href'])
63
- # if base_url in full_url and not full_url.endswith(".html") and full_url not in visited_urls:
64
- # urls_to_visit.append(full_url)
65
- # visited_urls = visited_urls[1:]
 
 
 
66
 
67
- # return visited_urls
 
 
 
 
68
 
69
- # base_url = "https://camels.readthedocs.io/en/latest/"
70
- # # base_url = "https://carla.readthedocs.io/en/latest/"
71
- # # urls = get_subpages(base_url)
72
 
73
- urlsfile = open("urls.txt")
74
- urls = urlsfile.readlines()
75
- urls = [url.replace("\n","") for url in urls]
76
- urlsfile.close()
77
 
78
  # Load the contents of the documentation pages (splitting happens later, inside RAG).
79
  loader = WebBaseLoader(urls)
80
  docs = loader.load()
81
 
 
82
  def format_docs(docs):
83
  return "\n\n".join(doc.page_content for doc in docs)
84
 
 
85
  def RAG(llm, docs, embeddings):
86
 
87
  # Split text
@@ -114,7 +87,6 @@ llm = ChatMistralAI(model="mistral-large-latest", rate_limiter=rate_limiter)
114
  embed_model = "sentence-transformers/multi-qa-distilbert-cos-v1"
115
  # embed_model = "nvidia/NV-Embed-v2"
116
  embeddings = HuggingFaceInstructEmbeddings(model_name=embed_model)
117
- # embeddings = MistralAIEmbeddings()
118
 
119
  # RAG chain
120
  rag_chain = RAG(llm, docs, embeddings)
@@ -131,12 +103,13 @@ def handle_prompt(message, history):
131
 
132
  greetingsmessage = "Hi, I'm the CAMELS DocBot, I'm here to assist you with any question related to the CAMELS simulations documentation"
133
  example_questions = [
134
- "How can i read a halo file?",
135
  "Which simulation suites are included in CAMELS?",
136
  "Which are the largest volumes in CAMELS simulations?",
137
- "How can I get the power spectrum of a simulation?"
138
  ]
139
 
 
140
  demo = gr.ChatInterface(handle_prompt, type="messages", title="CAMELS DocBot", examples=example_questions, theme=gr.themes.Soft(), description=greetingsmessage)#, chatbot=chatbot)
141
 
142
  demo.launch()
 
4
  from langchain_chroma import Chroma
5
  from langchain_core.output_parsers import StrOutputParser
6
  from langchain_core.runnables import RunnablePassthrough
 
7
  from langchain_community.embeddings import HuggingFaceInstructEmbeddings
8
  from langchain_text_splitters import RecursiveCharacterTextSplitter
9
  from langchain_mistralai import ChatMistralAI
 
10
  import requests
 
11
  from langchain_community.document_loaders import WebBaseLoader
12
  import bs4
13
  from langchain_core.rate_limiters import InMemoryRateLimiter
14
  from urllib.parse import urljoin
15
 
16
+ # Define a limiter to avoid rate limit issues with MistralAI
17
  rate_limiter = InMemoryRateLimiter(
18
  requests_per_second=0.1, # <-- MistralAI free tier allows about one request per second; 0.1 is deliberately stricter (one request every 10 seconds)
19
  check_every_n_seconds=0.01, # Wake up every 10 ms to check whether allowed to make a request
20
  max_bucket_size=10, # Controls the maximum burst size.
21
  )
22
 
23
+ # Function to get all the subpages from a base url
24
+ def get_subpages(base_url):
25
+ visited_urls = []
26
+ urls_to_visit = [base_url]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
+ while urls_to_visit:
29
+ url = urls_to_visit.pop(0)
30
+ if url in visited_urls:
31
+ continue
32
+
33
+ visited_urls.append(url)
34
+ response = requests.get(url)
35
+ soup = bs4.BeautifulSoup(response.content, "html.parser")
36
 
37
+ for link in soup.find_all("a", href=True):
38
+ full_url = urljoin(base_url, link['href'])
39
+ if base_url in full_url and full_url.endswith(".html") and full_url not in visited_urls:
40
+ urls_to_visit.append(full_url)
41
+ visited_urls = visited_urls[1:]
42
 
43
+ return visited_urls
 
 
44
 
45
+ # Get urls
46
+ base_url = "https://camels.readthedocs.io/en/latest/"
47
+ urls = get_subpages(base_url)
 
48
 
49
  # Load the contents of the documentation pages (splitting happens later, inside RAG).
50
  loader = WebBaseLoader(urls)
51
  docs = loader.load()
52
 
53
+ # Join content pages for processing
54
  def format_docs(docs):
55
  return "\n\n".join(doc.page_content for doc in docs)
56
 
57
+ # Create a RAG chain
58
  def RAG(llm, docs, embeddings):
59
 
60
  # Split text
 
87
  embed_model = "sentence-transformers/multi-qa-distilbert-cos-v1"
88
  # embed_model = "nvidia/NV-Embed-v2"
89
  embeddings = HuggingFaceInstructEmbeddings(model_name=embed_model)
 
90
 
91
  # RAG chain
92
  rag_chain = RAG(llm, docs, embeddings)
 
103
 
104
  greetingsmessage = "Hi, I'm the CAMELS DocBot, I'm here to assist you with any question related to the CAMELS simulations documentation"
105
  example_questions = [
106
+ "How can I read a halo file?",
107
  "Which simulation suites are included in CAMELS?",
108
  "Which are the largest volumes in CAMELS simulations?",
109
+ "Write a complete snippet of code getting the power spectrum of a simulation"
110
  ]
111
 
112
+ # Define Gradio interface
113
  demo = gr.ChatInterface(handle_prompt, type="messages", title="CAMELS DocBot", examples=example_questions, theme=gr.themes.Soft(), description=greetingsmessage)#, chatbot=chatbot)
114
 
115
  demo.launch()
urls.txt DELETED
@@ -1,42 +0,0 @@
1
- https://camels.readthedocs.io/en/latest/
2
- https://camels.readthedocs.io/en/latest/news.html
3
- https://camels.readthedocs.io/en/latest/goals.html
4
- https://camels.readthedocs.io/en/latest/publications.html
5
- https://camels.readthedocs.io/en/latest/data_access.html
6
- https://camels.readthedocs.io/en/latest/citation.html
7
- https://camels.readthedocs.io/en/latest/description.html
8
- https://camels.readthedocs.io/en/latest/suites_sets.html
9
- https://camels.readthedocs.io/en/latest/codes.html
10
- https://camels.readthedocs.io/en/latest/parameters.html
11
- https://camels.readthedocs.io/en/latest/organization.html
12
- https://camels.readthedocs.io/en/latest/snapshots.html
13
- https://camels.readthedocs.io/en/latest/subfind.html
14
- https://camels.readthedocs.io/en/latest/SubLink.html
15
- https://camels.readthedocs.io/en/latest/rockstar.html
16
- https://camels.readthedocs.io/en/latest/ahf.html
17
- https://camels.readthedocs.io/en/latest/caesar.html
18
- https://camels.readthedocs.io/en/latest/Pk.html
19
- https://camels.readthedocs.io/en/latest/Bk.html
20
- https://camels.readthedocs.io/en/latest/pdf.html
21
- https://camels.readthedocs.io/en/latest/VIDE.html
22
- https://camels.readthedocs.io/en/latest/Lya.html
23
- https://camels.readthedocs.io/en/latest/Xrays.html
24
- https://camels.readthedocs.io/en/latest/Profiles.html
25
- https://camels.readthedocs.io/en/latest/CMD.html
26
- https://camels.readthedocs.io/en/latest/SAM.html
27
- https://camels.readthedocs.io/en/latest/zoomGZ.html
28
- https://camels.readthedocs.io/en/latest/tutorials.html
29
- https://camels.readthedocs.io/en/latest/images.html
30
- https://camels.readthedocs.io/en/latest/camels_library.html
31
- https://camels.readthedocs.io/en/latest/pylians3.html
32
- https://camels.readthedocs.io/en/latest/team.html
33
- https://camels.readthedocs.io/en/latest/contact.html
34
- https://camels.readthedocs.io/en/latest/logo.html
35
- https://camels.readthedocs.io/en/latest/examples/Reading_Manipulating_Snapshots.html
36
- https://camels.readthedocs.io/en/latest/examples/Pk.html
37
- https://camels.readthedocs.io/en/latest/examples/Images.html
38
- https://camels.readthedocs.io/en/latest/examples/particles_subhalos.html
39
- https://camels.readthedocs.io/en/latest/index.html
40
- https://camels.readthedocs.io/en/latest/Images.html
41
- https://camels.readthedocs.io/en/latest/particles_subhalos.html
42
- https://camels.readthedocs.io/en/latest/Reading_Manipulating_Snapshots.html