Spaces:
Sleeping
Sleeping
DEBUG: FlashRank
Browse files- app.py +77 -67
- functions.py +8 -11
- requirements.txt +2 -1
app.py
CHANGED
@@ -1,5 +1,8 @@
|
|
1 |
import io
|
2 |
import tempfile
|
|
|
|
|
|
|
3 |
from starlette import status
|
4 |
from functions import *
|
5 |
import pandas as pd
|
@@ -13,7 +16,6 @@ import nltk
|
|
13 |
import time
|
14 |
import uuid
|
15 |
|
16 |
-
|
17 |
nltk.download('punkt_tab')
|
18 |
|
19 |
app = FastAPI(title="ConversAI", root_path="/api/v1")
|
@@ -48,11 +50,24 @@ async def sign_up(email, username, password):
|
|
48 |
|
49 |
|
50 |
@app.post("/session-check")
|
51 |
-
async def check_session():
|
52 |
res = supabase.auth.get_session()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
|
54 |
return res
|
55 |
|
|
|
56 |
@app.post("/get-user")
|
57 |
async def get_user(access_token):
|
58 |
res = supabase.auth.get_user(jwt=access_token)
|
@@ -65,7 +80,6 @@ async def refresh_token(refresh_token):
|
|
65 |
return res
|
66 |
|
67 |
|
68 |
-
|
69 |
@app.post("/login")
|
70 |
async def sign_in(email, password):
|
71 |
try:
|
@@ -127,68 +141,55 @@ async def sign_in(email, password):
|
|
127 |
)
|
128 |
|
129 |
|
130 |
-
@app.post(
|
131 |
-
async def login_with_token(
|
132 |
try:
|
133 |
-
|
134 |
-
print(res)
|
135 |
-
user_id = res.user.id
|
136 |
-
access_token = res.session.access_token
|
137 |
-
refresh_token = res.session.refresh_token
|
138 |
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
supabase.table("Stores").insert(
|
148 |
-
{
|
149 |
-
"AccessToken": access_token,
|
150 |
-
"StoreID": user_id,
|
151 |
-
"RefreshToken": refresh_token,
|
152 |
-
}
|
153 |
-
).execute()
|
154 |
-
)
|
155 |
|
156 |
-
|
157 |
-
|
158 |
-
"code": status.HTTP_200_OK,
|
159 |
-
"user_id": user_id,
|
160 |
-
"access_token": access_token,
|
161 |
-
"refresh_token": refresh_token
|
162 |
-
}
|
163 |
-
return message
|
164 |
|
165 |
-
|
166 |
-
|
167 |
-
status_code=status.HTTP_400_BAD_REQUEST,
|
168 |
-
detail="You are already signed in. Please sign out first to sign in again."
|
169 |
-
)
|
170 |
|
171 |
-
else:
|
172 |
-
raise HTTPException(
|
173 |
-
status_code=status.HTTP_400_BAD_REQUEST,
|
174 |
-
detail="Failed to sign in. Please check your credentials."
|
175 |
-
)
|
176 |
-
|
177 |
-
except HTTPException as http_exc:
|
178 |
-
raise http_exc
|
179 |
-
|
180 |
-
except Exception as e:
|
181 |
-
raise HTTPException(
|
182 |
-
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
|
183 |
-
detail=f"An unexpected error occurred during sign-in: {str(e)}"
|
184 |
-
)
|
185 |
|
|
|
|
|
|
|
|
|
186 |
|
187 |
|
188 |
@app.post("/set-session-data")
|
189 |
-
async def set_session_data(access_token, refresh_token):
|
190 |
res = supabase.auth.set_session(access_token, refresh_token)
|
191 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
192 |
return res
|
193 |
|
194 |
|
@@ -207,8 +208,9 @@ async def sign_out(user_id):
|
|
207 |
|
208 |
|
209 |
@app.post("/oauth")
|
210 |
-
async def oauth(
|
211 |
-
res = supabase.auth.sign_in_with_oauth(
|
|
|
212 |
return res
|
213 |
|
214 |
|
@@ -259,7 +261,8 @@ async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
|
|
259 |
with open(f"{fileId}.txt", "w") as file:
|
260 |
file.write(newText)
|
261 |
with open(f"{fileId}.txt", "rb") as f:
|
262 |
-
supabase.storage.from_("ConversAI").upload(file
|
|
|
263 |
os.remove(f"{fileId}.txt")
|
264 |
output["supabaseFileName"] = f"{fileId}.txt"
|
265 |
return output
|
@@ -280,7 +283,7 @@ async def returnText(pdf: UploadFile = File(...)):
|
|
280 |
return {
|
281 |
"source": source,
|
282 |
"extractionTime": timeTaken,
|
283 |
-
"output": text
|
284 |
}
|
285 |
|
286 |
|
@@ -307,7 +310,8 @@ async def addText(vectorstore: str, text: str, source: str | None = None):
|
|
307 |
with open(f"{fileId}.txt", "w") as file:
|
308 |
file.write(newText)
|
309 |
with open(f"{fileId}.txt", "rb") as f:
|
310 |
-
supabase.storage.from_("ConversAI").upload(file
|
|
|
311 |
os.remove(f"{fileId}.txt")
|
312 |
output["supabaseFileName"] = f"{fileId}.txt"
|
313 |
return output
|
@@ -345,13 +349,14 @@ async def addQAPairData(addQaPair: AddQAPair):
|
|
345 |
@app.post("/addWebsite")
|
346 |
async def addWebsite(vectorstore: str, websiteUrls: list[str]):
|
347 |
start = time.time()
|
348 |
-
text = extractTextFromUrlList(urls
|
349 |
textExtraction = time.time()
|
350 |
username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
|
351 |
df = pd.DataFrame(supabase.table("ConversAI_ChatbotInfo").select("*").execute().data)
|
352 |
currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
|
353 |
newCount = currentCount + len(text)
|
354 |
-
limit = supabase.table("ConversAI_UserConfig").select("tokenLimit").eq("user_id", username).execute().data[0][
|
|
|
355 |
if newCount < int(limit):
|
356 |
supabase.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
|
357 |
"chatbotname", chatbotname).execute()
|
@@ -364,12 +369,14 @@ async def addWebsite(vectorstore: str, websiteUrls: list[str]):
|
|
364 |
tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
|
365 |
wordCount = f"WORD COUNT: {len(tokenizer.tokenize(text))}" + "\n"
|
366 |
links = "LINKS:\n" + "\n".join(websiteUrls) + "\n"
|
367 |
-
newText = ("=" * 75 + "\n").join(
|
|
|
368 |
fileId = str(uuid.uuid4())
|
369 |
with open(f"{fileId}.txt", "w") as file:
|
370 |
file.write(newText)
|
371 |
with open(f"{fileId}.txt", "rb") as f:
|
372 |
-
supabase.storage.from_("ConversAI").upload(file
|
|
|
373 |
os.remove(f"{fileId}.txt")
|
374 |
output["supabaseFileName"] = f"{fileId}.txt"
|
375 |
return output
|
@@ -385,7 +392,8 @@ async def answerQuestion(query: str, vectorstore: str, llmModel: str = "llama3-7
|
|
385 |
output = answerQuery(query=query, vectorstore=vectorstore, llmModel=llmModel)
|
386 |
response = (
|
387 |
supabase.table("ConversAI_ChatHistory")
|
388 |
-
.insert({"username": username, "chatbotName": chatbotName, "llmModel": llmModel, "question": query,
|
|
|
389 |
.execute()
|
390 |
)
|
391 |
return output
|
@@ -450,5 +458,7 @@ async def analyzeAndAnswer(query: str, file: UploadFile = File(...)):
|
|
450 |
@app.post("/getChatHistory")
|
451 |
async def chatHistory(vectorstore: str):
|
452 |
username, chatbotName = vectorstore.split("$")[1], vectorstore.split("$")[2]
|
453 |
-
response = supabase.table("ConversAI_ChatHistory").select("timestamp", "question", "response").eq("username",
|
454 |
-
|
|
|
|
|
|
1 |
import io
|
2 |
import tempfile
|
3 |
+
import jwt
|
4 |
+
from click import option
|
5 |
+
from jwt import ExpiredSignatureError, InvalidTokenError
|
6 |
from starlette import status
|
7 |
from functions import *
|
8 |
import pandas as pd
|
|
|
16 |
import time
|
17 |
import uuid
|
18 |
|
|
|
19 |
nltk.download('punkt_tab')
|
20 |
|
21 |
app = FastAPI(title="ConversAI", root_path="/api/v1")
|
|
|
50 |
|
51 |
|
52 |
@app.post("/session-check")
|
53 |
+
async def check_session(user_id: str):
|
54 |
res = supabase.auth.get_session()
|
55 |
+
if res == None:
|
56 |
+
try:
|
57 |
+
supabase.table("Stores").delete().eq(
|
58 |
+
"StoreID", user_id
|
59 |
+
).execute()
|
60 |
+
resp = supabase.auth.sign_out()
|
61 |
+
|
62 |
+
response = {"message": "success", "code": 200, "Session": res}
|
63 |
+
|
64 |
+
return response
|
65 |
+
except Exception as e:
|
66 |
+
raise HTTPException(status_code=400, detail=str(e))
|
67 |
|
68 |
return res
|
69 |
|
70 |
+
|
71 |
@app.post("/get-user")
|
72 |
async def get_user(access_token):
|
73 |
res = supabase.auth.get_user(jwt=access_token)
|
|
|
80 |
return res
|
81 |
|
82 |
|
|
|
83 |
@app.post("/login")
|
84 |
async def sign_in(email, password):
|
85 |
try:
|
|
|
141 |
)
|
142 |
|
143 |
|
144 |
+
@app.post("/login_with_token")
|
145 |
+
async def login_with_token(access_token: str, refresh_token: str):
|
146 |
try:
|
147 |
+
decoded_token = jwt.decode(access_token, options={"verify_signature": False})
|
|
|
|
|
|
|
|
|
148 |
|
149 |
+
json = {
|
150 |
+
"code": status.HTTP_200_OK,
|
151 |
+
"user_id": decoded_token.get("sub"),
|
152 |
+
"email": decoded_token.get("email"),
|
153 |
+
"access_token": access_token,
|
154 |
+
"refresh_token": refresh_token,
|
155 |
+
"issued_at": decoded_token.get("iat"),
|
156 |
+
"expires_at": decoded_token.get("exp")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
157 |
|
158 |
+
}
|
159 |
+
return json
|
|
|
|
|
|
|
|
|
|
|
|
|
160 |
|
161 |
+
except (ExpiredSignatureError, InvalidTokenError) as e:
|
162 |
+
raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail=str(e))
|
|
|
|
|
|
|
163 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
|
165 |
+
@app.post("/user_name")
|
166 |
+
async def user_name_(username: str, user_id: str):
|
167 |
+
r_ = createUser(user_id=user_id, username=username)
|
168 |
+
return r_
|
169 |
|
170 |
|
171 |
@app.post("/set-session-data")
|
172 |
+
async def set_session_data(access_token, refresh_token, user_id):
|
173 |
res = supabase.auth.set_session(access_token, refresh_token)
|
174 |
+
store_session_check = supabase.table("Stores").select("*").filter("StoreID", "eq", user_id).execute()
|
175 |
+
store_id = None
|
176 |
+
if store_session_check and store_session_check.data:
|
177 |
+
store_id = store_session_check.data[0].get("StoreID")
|
178 |
+
if not store_id:
|
179 |
+
response = (
|
180 |
+
supabase.table("Stores").insert(
|
181 |
+
{
|
182 |
+
"AccessToken": access_token,
|
183 |
+
"StoreID": user_id,
|
184 |
+
"RefreshToken": refresh_token,
|
185 |
+
}
|
186 |
+
).execute()
|
187 |
+
)
|
188 |
+
res = {
|
189 |
+
"message": "success",
|
190 |
+
"code": 200,
|
191 |
+
"session_data": res,
|
192 |
+
}
|
193 |
return res
|
194 |
|
195 |
|
|
|
208 |
|
209 |
|
210 |
@app.post("/oauth")
|
211 |
+
async def oauth():
|
212 |
+
res = supabase.auth.sign_in_with_oauth(
|
213 |
+
{"provider": "google", "options": {"redirect_to": "https://convers-ai-lac.vercel.app/"}})
|
214 |
return res
|
215 |
|
216 |
|
|
|
261 |
with open(f"{fileId}.txt", "w") as file:
|
262 |
file.write(newText)
|
263 |
with open(f"{fileId}.txt", "rb") as f:
|
264 |
+
supabase.storage.from_("ConversAI").upload(file=f, path=os.path.join("/", f.name),
|
265 |
+
file_options={"content-type": "text/plain"})
|
266 |
os.remove(f"{fileId}.txt")
|
267 |
output["supabaseFileName"] = f"{fileId}.txt"
|
268 |
return output
|
|
|
283 |
return {
|
284 |
"source": source,
|
285 |
"extractionTime": timeTaken,
|
286 |
+
"output": text
|
287 |
}
|
288 |
|
289 |
|
|
|
310 |
with open(f"{fileId}.txt", "w") as file:
|
311 |
file.write(newText)
|
312 |
with open(f"{fileId}.txt", "rb") as f:
|
313 |
+
supabase.storage.from_("ConversAI").upload(file=f, path=os.path.join("/", f.name),
|
314 |
+
file_options={"content-type": "text/plain"})
|
315 |
os.remove(f"{fileId}.txt")
|
316 |
output["supabaseFileName"] = f"{fileId}.txt"
|
317 |
return output
|
|
|
349 |
@app.post("/addWebsite")
|
350 |
async def addWebsite(vectorstore: str, websiteUrls: list[str]):
|
351 |
start = time.time()
|
352 |
+
text = extractTextFromUrlList(urls=websiteUrls)
|
353 |
textExtraction = time.time()
|
354 |
username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
|
355 |
df = pd.DataFrame(supabase.table("ConversAI_ChatbotInfo").select("*").execute().data)
|
356 |
currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
|
357 |
newCount = currentCount + len(text)
|
358 |
+
limit = supabase.table("ConversAI_UserConfig").select("tokenLimit").eq("user_id", username).execute().data[0][
|
359 |
+
"tokenLimit"]
|
360 |
if newCount < int(limit):
|
361 |
supabase.table("ConversAI_ChatbotInfo").update({"charactercount": str(newCount)}).eq("user_id", username).eq(
|
362 |
"chatbotname", chatbotname).execute()
|
|
|
369 |
tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
|
370 |
wordCount = f"WORD COUNT: {len(tokenizer.tokenize(text))}" + "\n"
|
371 |
links = "LINKS:\n" + "\n".join(websiteUrls) + "\n"
|
372 |
+
newText = ("=" * 75 + "\n").join(
|
373 |
+
[timeTaken, uploadTime, wordCount, tokenCount, links, "TEXT: \n" + text + "\n"])
|
374 |
fileId = str(uuid.uuid4())
|
375 |
with open(f"{fileId}.txt", "w") as file:
|
376 |
file.write(newText)
|
377 |
with open(f"{fileId}.txt", "rb") as f:
|
378 |
+
supabase.storage.from_("ConversAI").upload(file=f, path=os.path.join("/", f.name),
|
379 |
+
file_options={"content-type": "text/plain"})
|
380 |
os.remove(f"{fileId}.txt")
|
381 |
output["supabaseFileName"] = f"{fileId}.txt"
|
382 |
return output
|
|
|
392 |
output = answerQuery(query=query, vectorstore=vectorstore, llmModel=llmModel)
|
393 |
response = (
|
394 |
supabase.table("ConversAI_ChatHistory")
|
395 |
+
.insert({"username": username, "chatbotName": chatbotName, "llmModel": llmModel, "question": query,
|
396 |
+
"response": output["output"]})
|
397 |
.execute()
|
398 |
)
|
399 |
return output
|
|
|
458 |
@app.post("/getChatHistory")
|
459 |
async def chatHistory(vectorstore: str):
|
460 |
username, chatbotName = vectorstore.split("$")[1], vectorstore.split("$")[2]
|
461 |
+
response = supabase.table("ConversAI_ChatHistory").select("timestamp", "question", "response").eq("username",
|
462 |
+
username).eq(
|
463 |
+
"chatbotName", chatbotName).execute().data
|
464 |
+
return response
|
functions.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
import pymupdf
|
2 |
from concurrent.futures import ThreadPoolExecutor
|
3 |
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
|
4 |
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
|
@@ -45,21 +45,18 @@ vectorEmbeddings = HuggingFaceEmbeddings(
|
|
45 |
encode_kwargs=encode_kwargs
|
46 |
)
|
47 |
reader = easyocr.Reader(['en'], gpu=True, model_storage_directory="/app/EasyOCRModels")
|
48 |
-
sparseEmbeddings = FastEmbedSparse(model="Qdrant/BM25", threads
|
49 |
prompt = """
|
50 |
INSTRUCTIONS:
|
51 |
=====================================
|
52 |
### Role
|
53 |
**Primary Function**: You are an AI chatbot designed to provide accurate and efficient assistance to users based on provided context data. Your responses must be reliable, friendly, and directly address user inquiries or issues. Always clarify any unclear questions, and conclude responses positively.
|
54 |
-
|
55 |
### Constraints
|
56 |
1. **No Data Disclosure**: Never reveal access to training data or any context explicitly.
|
57 |
2. **Maintaining Focus**: Politely redirect any off-topic conversations back to relevant issues without breaking character.
|
58 |
3. **Exclusive Reliance on Context Data**: Base all answers strictly on the provided context data. If the context doesn’t cover the query, use a fallback response. Always maintain a third-person perspective.
|
59 |
4. **Restrictive Role Focus**: Do not engage in tasks or answer questions unrelated to your role or context data.
|
60 |
-
|
61 |
Ensure all instructions are strictly followed. Responses must be meaningful and concise, within 512 words. Include sources to support your answers when possible.
|
62 |
-
|
63 |
CONTEXT:
|
64 |
=====================================
|
65 |
{context}
|
@@ -67,11 +64,9 @@ CONTEXT:
|
|
67 |
QUESTION:
|
68 |
=====================================
|
69 |
{question}
|
70 |
-
|
71 |
CHAT HISTORY:
|
72 |
=====================================
|
73 |
{chatHistory}
|
74 |
-
|
75 |
NOTE: Generate responses directly without using phrases like "Response:" or "Answer:". Do not mention the use of extracted context or provide unnecessary details.
|
76 |
"""
|
77 |
prompt = ChatPromptTemplate.from_template(prompt)
|
@@ -266,7 +261,7 @@ def getLinks(url: str, timeout=30):
|
|
266 |
else:
|
267 |
pass
|
268 |
links = [link for link in links if "#" not in link]
|
269 |
-
links = list(set(links))
|
270 |
else:
|
271 |
continue
|
272 |
return links
|
@@ -319,18 +314,19 @@ def analyzeData(query, dataframe):
|
|
319 |
return response
|
320 |
|
321 |
|
322 |
-
|
323 |
def extractTextFromPage(page):
|
324 |
return page.get_text()
|
325 |
|
|
|
326 |
def extractTextFromPdf(pdf_path):
|
327 |
doc = pymupdf.open(pdf_path)
|
328 |
pages = [doc.load_page(i) for i in range(len(doc))]
|
329 |
with ThreadPoolExecutor() as executor:
|
330 |
texts = list(executor.map(extractTextFromPage, pages))
|
331 |
-
doc.close()
|
332 |
return '.'.join(texts)
|
333 |
|
|
|
334 |
def extractTextFromUrl(url):
|
335 |
response = requests.get(url)
|
336 |
response.raise_for_status()
|
@@ -338,7 +334,8 @@ def extractTextFromUrl(url):
|
|
338 |
soup = BeautifulSoup(html, 'lxml')
|
339 |
return soup.get_text(separator=' ', strip=True)
|
340 |
|
|
|
341 |
def extractTextFromUrlList(urls):
|
342 |
with ThreadPoolExecutor() as executor:
|
343 |
texts = list(executor.map(extractTextFromUrl, urls))
|
344 |
-
return '.'.join(texts)
|
|
|
1 |
+
import pymupdf
|
2 |
from concurrent.futures import ThreadPoolExecutor
|
3 |
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
|
4 |
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
|
|
|
45 |
encode_kwargs=encode_kwargs
|
46 |
)
|
47 |
reader = easyocr.Reader(['en'], gpu=True, model_storage_directory="/app/EasyOCRModels")
|
48 |
+
sparseEmbeddings = FastEmbedSparse(model="Qdrant/BM25", threads=20, parallel=0)
|
49 |
prompt = """
|
50 |
INSTRUCTIONS:
|
51 |
=====================================
|
52 |
### Role
|
53 |
**Primary Function**: You are an AI chatbot designed to provide accurate and efficient assistance to users based on provided context data. Your responses must be reliable, friendly, and directly address user inquiries or issues. Always clarify any unclear questions, and conclude responses positively.
|
|
|
54 |
### Constraints
|
55 |
1. **No Data Disclosure**: Never reveal access to training data or any context explicitly.
|
56 |
2. **Maintaining Focus**: Politely redirect any off-topic conversations back to relevant issues without breaking character.
|
57 |
3. **Exclusive Reliance on Context Data**: Base all answers strictly on the provided context data. If the context doesn’t cover the query, use a fallback response. Always maintain a third-person perspective.
|
58 |
4. **Restrictive Role Focus**: Do not engage in tasks or answer questions unrelated to your role or context data.
|
|
|
59 |
Ensure all instructions are strictly followed. Responses must be meaningful and concise, within 512 words. Include sources to support your answers when possible.
|
|
|
60 |
CONTEXT:
|
61 |
=====================================
|
62 |
{context}
|
|
|
64 |
QUESTION:
|
65 |
=====================================
|
66 |
{question}
|
|
|
67 |
CHAT HISTORY:
|
68 |
=====================================
|
69 |
{chatHistory}
|
|
|
70 |
NOTE: Generate responses directly without using phrases like "Response:" or "Answer:". Do not mention the use of extracted context or provide unnecessary details.
|
71 |
"""
|
72 |
prompt = ChatPromptTemplate.from_template(prompt)
|
|
|
261 |
else:
|
262 |
pass
|
263 |
links = [link for link in links if "#" not in link]
|
264 |
+
links = list(set(links))
|
265 |
else:
|
266 |
continue
|
267 |
return links
|
|
|
314 |
return response
|
315 |
|
316 |
|
|
|
317 |
def extractTextFromPage(page):
|
318 |
return page.get_text()
|
319 |
|
320 |
+
|
321 |
def extractTextFromPdf(pdf_path):
|
322 |
doc = pymupdf.open(pdf_path)
|
323 |
pages = [doc.load_page(i) for i in range(len(doc))]
|
324 |
with ThreadPoolExecutor() as executor:
|
325 |
texts = list(executor.map(extractTextFromPage, pages))
|
326 |
+
doc.close()
|
327 |
return '.'.join(texts)
|
328 |
|
329 |
+
|
330 |
def extractTextFromUrl(url):
|
331 |
response = requests.get(url)
|
332 |
response.raise_for_status()
|
|
|
334 |
soup = BeautifulSoup(html, 'lxml')
|
335 |
return soup.get_text(separator=' ', strip=True)
|
336 |
|
337 |
+
|
338 |
def extractTextFromUrlList(urls):
|
339 |
with ThreadPoolExecutor() as executor:
|
340 |
texts = list(executor.map(extractTextFromUrl, urls))
|
341 |
+
return '.'.join(texts)
|
requirements.txt
CHANGED
@@ -94,4 +94,5 @@ pandasai
|
|
94 |
easyocr
|
95 |
youtube-transcript-api
|
96 |
pdf2image
|
97 |
-
PyPDF2
|
|
|
|
94 |
easyocr
|
95 |
youtube-transcript-api
|
96 |
pdf2image
|
97 |
+
PyPDF2
|
98 |
+
PyJWT
|