Rauhan committed on
Commit
1945ea8
1 Parent(s): 17050fe

DEBUG: pdfminer

Browse files
Files changed (2) hide show
  1. app.py +9 -7
  2. requirements.txt +3 -1
app.py CHANGED
@@ -1,13 +1,13 @@
1
  import io
2
  import os
3
-
4
  from functions import *
5
- from PyPDF2 import PdfReader
6
  import pandas as pd
7
  from fastapi import FastAPI, File, UploadFile
8
  from pydantic import BaseModel
9
  from fastapi.middleware.cors import CORSMiddleware
10
- from langchain_community.document_loaders import UnstructuredURLLoader
11
  from src.api.speech_api import speech_translator_router
12
  from functions import client as supabase
13
  from urllib.parse import urlparse
@@ -153,10 +153,12 @@ async def newChatbot(chatbotName: str, username: str):
153
  async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
154
  source = pdf.filename
155
  pdf = await pdf.read()
156
- reader = PdfReader(io.BytesIO(pdf))
157
- text = ""
158
- for page in reader.pages:
159
- text += page.extract_text()
 
 
160
  username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
161
  df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
162
  currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
 
1
  import io
2
  import os
3
+ import tempfile
4
  from functions import *
5
+ from langchain_community.document_loaders import PDFMinerLoader
6
  import pandas as pd
7
  from fastapi import FastAPI, File, UploadFile
8
  from pydantic import BaseModel
9
  from fastapi.middleware.cors import CORSMiddleware
10
+ from langchain_community.document_loaders import WebBaseLoader
11
  from src.api.speech_api import speech_translator_router
12
  from functions import client as supabase
13
  from urllib.parse import urlparse
 
153
  async def addPDFData(vectorstore: str, pdf: UploadFile = File(...)):
154
  source = pdf.filename
155
  pdf = await pdf.read()
156
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
157
+ temp_file.write(pdf)
158
+ temp_file_path = temp_file.name
159
+ loader = PDFMinerLoader(file_path = temp_file_path, concatenate_pages = True)
160
+ text = loader.load()[0].page_content
161
+ os.remove(temp_file_path)
162
  username, chatbotname = vectorstore.split("$")[1], vectorstore.split("$")[2]
163
  df = pd.DataFrame(client.table("ConversAI_ChatbotInfo").select("*").execute().data)
164
  currentCount = df[(df["user_id"] == username) & (df["chatbotname"] == chatbotname)]["charactercount"].iloc[0]
requirements.txt CHANGED
@@ -70,6 +70,8 @@ websockets==12.0
70
  bs4
71
  huggingface-hub
72
  fastembed-gpu
 
 
73
  flashrank
74
  langchain
75
  langchain-community
@@ -78,7 +80,7 @@ langchain-huggingface
78
  langchain-qdrant
79
  langchain-groq
80
  lxml
81
- PyPDF2
82
  python-dotenv
83
  pillow
84
  pandas
 
70
  bs4
71
  huggingface-hub
72
  fastembed-gpu
73
+ nest_asyncio
74
+ beautifulsoup4
75
  flashrank
76
  langchain
77
  langchain-community
 
80
  langchain-qdrant
81
  langchain-groq
82
  lxml
83
+ pdfminer.six
84
  python-dotenv
85
  pillow
86
  pandas