# fastapiapp / file_processing.py
# Source: Sk4467's "Update file_processing.py" (commit 3c4744f, verified)
from langchain.document_loaders import PyPDFLoader, PDFMinerLoader, DirectoryLoader
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from os.path import join
import os
from dotenv import load_dotenv
# load_dotenv(r'C:\Users\sksha\Desktop\llm-assignment-master\llm-assignment-master\llm-assignment-master_\backend\.env')
# The load_dotenv call above is disabled, so OPENAI_API_KEY must already be
# set in the process environment; otherwise this is None and OpenAIEmbeddings
# below will fail at call time.
openai_api_key = os.environ.get('OPENAI_API_KEY')
from langchain.document_loaders import TextLoader, PDFMinerLoader, UnstructuredWordDocumentLoader, CSVLoader
from langchain_community.document_loaders import PyMuPDFLoader,TextLoader,CSVLoader,Docx2txtLoader,UnstructuredWordDocumentLoader
# def load_documents(file_path):
# if file_path.endswith('.txt'):
# loader = TextLoader(file_path)
# elif file_path.endswith('.pdf'):
# loader = PyPDFLoader(file_path)
# elif file_path.endswith('.doc') or file_path.endswith('.docx'):
# loader = UnstructuredWordDocumentLoader(file_path)
# elif file_path.endswith('.csv'):
# loader = CSVLoader(file_path)
# else:
# raise ValueError(f"Unsupported file format: {file_path}")
# documents = loader.load()
# return documents
from fastapi import UploadFile
from typing import List
import fitz # PyMuPDF
import pandas as pd
import docx
import tempfile
from langchain.docstore.document import Document
def read_pdf(file_path: str) -> List[Document]:
    """Load a PDF from *file_path* and return its pages as Documents.

    Fix: the original annotation said ``-> str``, but ``PyMuPDFLoader.load()``
    returns a list of LangChain ``Document`` objects (one per page).
    """
    loader = PyMuPDFLoader(file_path)
    return loader.load()
def read_docx(file_path: str) -> List[Document]:
    """Load a Word document from *file_path* and return it as Documents.

    Fix: the original annotation said ``-> str``, but
    ``UnstructuredWordDocumentLoader.load()`` returns a list of ``Document``.
    """
    loader = UnstructuredWordDocumentLoader(file_path)
    return loader.load()
def read_csv(file_path: str) -> List[Document]:
    """Load a CSV from *file_path* and return its rows as Documents.

    Fix: the original annotation said ``-> str``, but ``CSVLoader.load()``
    returns a list of ``Document`` objects (one per row).
    """
    loader = CSVLoader(file_path)
    return loader.load()
def read_txt(file_path: str) -> List[Document]:
    """Load a plain-text file from *file_path* and return it as Documents.

    Fix: the original annotation said ``-> str``, but ``TextLoader.load()``
    returns a list containing a single ``Document``.
    """
    loader = TextLoader(file_path)
    return loader.load()
async def load_documents(file: UploadFile) -> List[Document]:
    """Persist an uploaded file to a temporary path and load it into Documents.

    The upload is written to a securely created temporary file (the original
    code used ``f"temp_{file.filename}"``, which collides between concurrent
    requests and trusts a client-controlled filename that may contain path
    separators). The file's extension selects the loader; the temp file keeps
    the same suffix so extension-sniffing loaders still work. Extension
    matching is case-insensitive (backward-compatible generalization).

    Returns a list of LangChain ``Document`` objects on success. On any
    failure the original error-swallowing behavior is preserved: the error is
    printed and the string ``"Error processing document."`` is returned
    instead of raising. The temporary file is always removed.
    """
    readers = {
        '.pdf': read_pdf,
        '.docx': read_docx,
        '.csv': read_csv,
        '.txt': read_txt,
    }
    suffix = os.path.splitext(file.filename or "")[1].lower()
    temp_file_path = None
    try:
        # mkstemp-backed NamedTemporaryFile: unique, safe path owned by us.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            temp_file.write(await file.read())
            temp_file_path = temp_file.name
        reader = readers.get(suffix)
        if reader is None:
            raise ValueError("Unsupported file format")
        content = reader(temp_file_path)
    except Exception as e:
        # Preserved behavior: callers receive a sentinel string on failure.
        print(f"Error processing document: {e}")
        content = "Error processing document."
    finally:
        if temp_file_path and os.path.exists(temp_file_path):
            os.remove(temp_file_path)  # Clean up the temporary file
    return content
from langchain.text_splitter import CharacterTextSplitter
def chunk_documents(documents, chunk_size, chunk_overlap):
    """Split *documents* into overlapping character chunks.

    Wraps ``CharacterTextSplitter`` with the given ``chunk_size`` and
    ``chunk_overlap`` and returns the resulting list of chunked Documents.
    """
    splitter = CharacterTextSplitter(chunk_size=chunk_size,
                                     chunk_overlap=chunk_overlap)
    return splitter.split_documents(documents)
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
def create_embeddings(chunked_docs, collection_name):
    """Embed *chunked_docs* with OpenAI and persist them in a Chroma collection.

    Uses the module-level ``openai_api_key`` for authentication, stores the
    vectors under ``collection_name``, persists the store to disk, and
    returns the ``Chroma`` vector store.
    """
    embedding_fn = OpenAIEmbeddings(openai_api_key=openai_api_key)
    store = Chroma.from_documents(chunked_docs,
                                  embedding_fn,
                                  collection_name=collection_name)
    store.persist()
    return store