Rauhan committed on
Commit
736af94
1 Parent(s): 937797f

UPDATE: urls

Browse files
Files changed (2) hide show
  1. app.py +5 -17
  2. functions.py +3 -8
app.py CHANGED
@@ -33,30 +33,20 @@ async def newChatbot(chatbotName: str, username: str):
33
  return createTable(tablename = chatbotName)
34
 
35
 
36
- @app.post("/addPDF")
37
  async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
38
  pdf = await pdf.read()
39
  reader = PdfReader(io.BytesIO(pdf))
40
  text = ""
41
  for page in reader.pages:
42
  text += page.extract_text()
43
- return addDocuments(text = text, vectorstore = vectorstore)
44
 
45
 
46
- @app.post("/addText")
47
  async def addText(vectorstore: str, text: str):
48
  return addDocuments(text = text, vectorstore = vectorstore)
49
 
50
-
51
- @app.post("/addWebsite")
52
- async def addWebsite(vectorstore: str, websiteUrl: str):
53
- urls = getLinks(websiteUrl)
54
- loader = UnstructuredURLLoader(urls=urls)
55
- docs = loader.load()
56
- text = "\n\n\n\n".join([f"Metadata:\n{docs[doc].metadata} \nPage Content:\n {docs[doc].page_content}" for doc in range(len(docs))])
57
- return addDocuments(text = text, vectorstore = vectorstore)
58
-
59
-
60
  @app.post("/answerQuery")
61
  async def answerQuestion(query: str, vectorstore: str, llmModel: str = "llama3-70b-8192"):
62
  return answerQuery(query=query, vectorstore=vectorstore, llmModel=llmModel)
@@ -70,8 +60,6 @@ async def delete(chatbotName: str):
70
  async def delete(username: str):
71
  return listTables(username=username)
72
 
73
- @app.post("/getLinks")
74
  async def crawlUrl(baseUrl: str):
75
- return {
76
- "urls": getLinks(url=baseUrl, timeout=30)
77
- }
 
33
  return createTable(tablename = chatbotName)
34
 
35
 
36
+ @app.post("/getRawPDFText")
37
  async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
38
  pdf = await pdf.read()
39
  reader = PdfReader(io.BytesIO(pdf))
40
  text = ""
41
  for page in reader.pages:
42
  text += page.extract_text()
43
+ return text
44
 
45
 
46
+ @app.post("/addData")
47
  async def addText(vectorstore: str, text: str):
48
  return addDocuments(text = text, vectorstore = vectorstore)
49
 
 
 
 
 
 
 
 
 
 
 
50
  @app.post("/answerQuery")
51
  async def answerQuestion(query: str, vectorstore: str, llmModel: str = "llama3-70b-8192"):
52
  return answerQuery(query=query, vectorstore=vectorstore, llmModel=llmModel)
 
60
  async def delete(username: str):
61
  return listTables(username=username)
62
 
63
+ @app.post("/getWebsiteData")
64
  async def crawlUrl(baseUrl: str):
65
+ return getRawWebText(url=baseUrl, timeout=30)
 
 
functions.py CHANGED
@@ -258,8 +258,7 @@ def listTables(username: str):
258
  }
259
 
260
 
261
-
262
- def getLinks(url: str, timeout = 30):
263
  start = time.time()
264
  def getLinksFromPage(url: str) -> list:
265
  response = requests.get(url)
@@ -290,9 +289,5 @@ def getLinks(url: str, timeout = 30):
290
  allLinks = {}
291
  foundLinks = list(set([x[:len(x) - 1] if x[-1] == "/" else x for x in uniqueLinks]))
292
  for link in foundLinks:
293
- allLinks[link] = len(BeautifulSoup(requests.get(link).text, "lxml").body.get_text(" ", strip = True))
294
- return allLinks
295
-
296
-
297
- def getTextLength(text: str):
298
- return len(text)
 
258
  }
259
 
260
 
261
+ def getRawWebText(url: str, timeout = 30):
 
262
  start = time.time()
263
  def getLinksFromPage(url: str) -> list:
264
  response = requests.get(url)
 
289
  allLinks = {}
290
  foundLinks = list(set([x[:len(x) - 1] if x[-1] == "/" else x for x in uniqueLinks]))
291
  for link in foundLinks:
292
+ allLinks[link] = BeautifulSoup(requests.get(link).text, "lxml").body.get_text(" ", strip = True)
293
+ return allLinks