PabloVD committed on
Commit
2ccbf76
1 Parent(s): f3576a5

Replace PDF loading with URL loading

Browse files
Files changed (2) hide show
  1. app.py +16 -17
  2. urls.txt +42 -0
app.py CHANGED
@@ -22,15 +22,15 @@ rate_limiter = InMemoryRateLimiter(
22
  max_bucket_size=10, # Controls the maximum burst size.
23
  )
24
 
25
- # Get data from url
26
- url = 'https://camels.readthedocs.io/_/downloads/en/latest/pdf/'
27
- r = requests.get(url, stream=True)
28
- document_path = Path('data.pdf')
29
-
30
- document_path.write_bytes(r.content)
31
- # document_path = "camels-readthedocs-io-en-latest.pdf"
32
- loader = PyPDFLoader(document_path)
33
- docs = loader.load()
34
 
35
  # # Load, chunk and index the contents of the blog.
36
  # url = "https://lilianweng.github.io/posts/2023-06-23-agent/"
@@ -70,15 +70,14 @@ docs = loader.load()
70
  # # base_url = "https://carla.readthedocs.io/en/latest/"
71
  # # urls = get_subpages(base_url)
72
 
73
- # tokenfile = open("urls.txt")
74
- # urls = tokenfile.readlines()
75
- # urls = [url.replace("\n","") for url in urls]
76
- # tokenfile.close()
77
- # print(urls)
78
 
79
- # # Load, chunk and index the contents of the blog.
80
- # loader = WebBaseLoader(urls)
81
- # docs = loader.load()
82
 
83
  def format_docs(docs):
84
  return "\n\n".join(doc.page_content for doc in docs)
 
22
  max_bucket_size=10, # Controls the maximum burst size.
23
  )
24
 
25
+ # # Get data from url
26
+ # url = 'https://camels.readthedocs.io/_/downloads/en/latest/pdf/'
27
+ # r = requests.get(url, stream=True)
28
+ # document_path = Path('data.pdf')
29
+
30
+ # document_path.write_bytes(r.content)
31
+ # # document_path = "camels-readthedocs-io-en-latest.pdf"
32
+ # loader = PyPDFLoader(document_path)
33
+ # docs = loader.load()
34
 
35
  # # Load, chunk and index the contents of the blog.
36
  # url = "https://lilianweng.github.io/posts/2023-06-23-agent/"
 
70
  # # base_url = "https://carla.readthedocs.io/en/latest/"
71
  # # urls = get_subpages(base_url)
72
 
73
+ urlsfile = open("urls.txt")
74
+ urls = urlsfile.readlines()
75
+ urls = [url.replace("\n","") for url in urls]
76
+ urlsfile.close()
 
77
 
78
+ # Load, chunk and index the contents of the blog.
79
+ loader = WebBaseLoader(urls)
80
+ docs = loader.load()
81
 
82
  def format_docs(docs):
83
  return "\n\n".join(doc.page_content for doc in docs)
urls.txt ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ https://camels.readthedocs.io/en/latest/
2
+ https://camels.readthedocs.io/en/latest/news.html
3
+ https://camels.readthedocs.io/en/latest/goals.html
4
+ https://camels.readthedocs.io/en/latest/publications.html
5
+ https://camels.readthedocs.io/en/latest/data_access.html
6
+ https://camels.readthedocs.io/en/latest/citation.html
7
+ https://camels.readthedocs.io/en/latest/description.html
8
+ https://camels.readthedocs.io/en/latest/suites_sets.html
9
+ https://camels.readthedocs.io/en/latest/codes.html
10
+ https://camels.readthedocs.io/en/latest/parameters.html
11
+ https://camels.readthedocs.io/en/latest/organization.html
12
+ https://camels.readthedocs.io/en/latest/snapshots.html
13
+ https://camels.readthedocs.io/en/latest/subfind.html
14
+ https://camels.readthedocs.io/en/latest/SubLink.html
15
+ https://camels.readthedocs.io/en/latest/rockstar.html
16
+ https://camels.readthedocs.io/en/latest/ahf.html
17
+ https://camels.readthedocs.io/en/latest/caesar.html
18
+ https://camels.readthedocs.io/en/latest/Pk.html
19
+ https://camels.readthedocs.io/en/latest/Bk.html
20
+ https://camels.readthedocs.io/en/latest/pdf.html
21
+ https://camels.readthedocs.io/en/latest/VIDE.html
22
+ https://camels.readthedocs.io/en/latest/Lya.html
23
+ https://camels.readthedocs.io/en/latest/Xrays.html
24
+ https://camels.readthedocs.io/en/latest/Profiles.html
25
+ https://camels.readthedocs.io/en/latest/CMD.html
26
+ https://camels.readthedocs.io/en/latest/SAM.html
27
+ https://camels.readthedocs.io/en/latest/zoomGZ.html
28
+ https://camels.readthedocs.io/en/latest/tutorials.html
29
+ https://camels.readthedocs.io/en/latest/images.html
30
+ https://camels.readthedocs.io/en/latest/camels_library.html
31
+ https://camels.readthedocs.io/en/latest/pylians3.html
32
+ https://camels.readthedocs.io/en/latest/team.html
33
+ https://camels.readthedocs.io/en/latest/contact.html
34
+ https://camels.readthedocs.io/en/latest/logo.html
35
+ https://camels.readthedocs.io/en/latest/examples/Reading_Manipulating_Snapshots.html
36
+ https://camels.readthedocs.io/en/latest/examples/Pk.html
37
+ https://camels.readthedocs.io/en/latest/examples/Images.html
38
+ https://camels.readthedocs.io/en/latest/examples/particles_subhalos.html
39
+ https://camels.readthedocs.io/en/latest/index.html
40
+ https://camels.readthedocs.io/en/latest/Images.html
41
+ https://camels.readthedocs.io/en/latest/particles_subhalos.html
42
+ https://camels.readthedocs.io/en/latest/Reading_Manipulating_Snapshots.html