allow pdfs
Browse files
app.py
CHANGED
@@ -52,16 +52,33 @@ text_splitter = CharacterTextSplitter()
|
|
52 |
|
53 |
def process_text_file(file: AskFileResponse):
|
54 |
import tempfile
|
|
|
|
|
|
|
|
|
55 |
|
56 |
with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as temp_file:
|
57 |
temp_file_path = temp_file.name
|
58 |
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
return texts
|
66 |
|
67 |
|
@@ -72,8 +89,8 @@ async def on_chat_start():
|
|
72 |
# Wait for the user to upload a file
|
73 |
while files == None:
|
74 |
files = await cl.AskFileMessage(
|
75 |
-
content="Please upload a .txt or .pdf
|
76 |
-
accept=["text/plain"],
|
77 |
max_size_mb=2,
|
78 |
timeout=180,
|
79 |
).send()
|
|
|
52 |
|
53 |
def process_text_file(file: AskFileResponse):
    """Split an uploaded ``.txt`` or ``.pdf`` file into text chunks.

    The file type is chosen from the extension of ``file.name``
    (case-insensitive). The raw bytes are taken from ``file.content``.

    Args:
        file: The uploaded file; ``file.name`` and ``file.content`` are read.

    Returns:
        Whatever ``text_splitter.split_texts`` returns for the extracted
        documents (one document per PDF page for ``.pdf`` uploads).

    Raises:
        ValueError: If the extension is neither ``.txt`` nor ``.pdf``.
    """
    import os
    import tempfile

    file_extension = os.path.splitext(file.name)[1].lower()

    if file_extension == ".txt":
        # TextFileLoader is constructed from a path, so the uploaded bytes
        # are spilled to a temporary file first.
        with tempfile.NamedTemporaryFile(
            mode="wb", delete=False, suffix=".txt"
        ) as temp_file:
            temp_file.write(file.content)
            temp_file_path = temp_file.name
        try:
            text_loader = TextFileLoader(temp_file_path)
            documents = text_loader.load_documents()
            texts = text_splitter.split_texts(documents)
        finally:
            # delete=False means nobody else cleans this up; remove it even
            # when loading or splitting fails.
            os.remove(temp_file_path)

    elif file_extension == ".pdf":
        # Imported lazily so .txt uploads do not require PyMuPDF.
        import fitz

        # Open the PDF straight from the uploaded bytes. The previous code
        # opened an empty temp file because the content was only ever
        # written in the .txt branch.
        with fitz.open(stream=file.content, filetype="pdf") as pdf_document:
            documents = [page.get_text() for page in pdf_document]
        texts = text_splitter.split_texts(documents)

    else:
        raise ValueError("Unsupported file type")

    return texts
|
83 |
|
84 |
|
|
|
89 |
# Wait for the user to upload a file
|
90 |
while files == None:
|
91 |
files = await cl.AskFileMessage(
|
92 |
+
content="Please upload a .txt or .pdf file to begin!",
|
93 |
+
accept=["text/plain", "pdf"],
|
94 |
max_size_mb=2,
|
95 |
timeout=180,
|
96 |
).send()
|