Spaces:

techconspartners
/

ConversAI

Sleeping

Rauhan committed on Aug 15, 2024

Commit

1945ea8

1 Parent(s): 17050fe

DEBUG: pdfminer

Files changed (2) hide show

app.py CHANGED Viewed

@@ -1,13 +1,13 @@
 import io
 import os
 from functions import *
-from PyPDF2 import PdfReader
 import pandas as pd
 from fastapi import FastAPI, File, UploadFile
 from pydantic import BaseModel
 from fastapi.middleware.cors import CORSMiddleware
-from langchain_community.document_loaders import UnstructuredURLLoader
 from src.api.speech_api import speech_translator_router
 from functions import client as supabase
 from urllib.parse import urlparse
@@ -153,10 +153,12 @@ async def newChatbot(chatbotName: str, username: str):
 async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
     source = pdf.filename
     pdf = await pdf.read()
-    reader = PdfReader(io.BytesIO(pdf))
-    text = ""
-    for page in reader.pages:
-        text += page.extract_text()
     username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
     df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
     currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]

 import io
 import os
+import tempfile
 from functions import *
+from langchain_community.document_loaders import PDFMinerLoader
 import pandas as pd
 from fastapi import FastAPI, File, UploadFile
 from pydantic import BaseModel
 from fastapi.middleware.cors import CORSMiddleware
+from langchain_community.document_loaders import WebBaseLoader
 from src.api.speech_api import speech_translator_router
 from functions import client as supabase
 from urllib.parse import urlparse
 async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
     source = pdf.filename
     pdf = await pdf.read()
+    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
+        temp_file.write(pdf)
+        temp_file_path = temp_file.name
+        loader = PDFMinerLoader(file_path = temp_file_path, concatenate_pages = True)
+    text = loader.load()[0].page_content
+    os.remove(temp_file_path)
     username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
     df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
     currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]

requirements.txt CHANGED Viewed

@@ -70,6 +70,8 @@ websockets==12.0
 bs4
 huggingface-hub
 fastembed-gpu
 flashrank
 langchain
 langchain-community
@@ -78,7 +80,7 @@ langchain-huggingface
 langchain-qdrant
 langchain-groq
 lxml
-PyPDF2
 python-dotenv
 pillow
 pandas

 bs4
 huggingface-hub
 fastembed-gpu
+nest_asyncio
+beautifulsoup4
 flashrank
 langchain
 langchain-community
 langchain-qdrant
 langchain-groq
 lxml
+pdfminer.six
 python-dotenv
 pillow
 pandas