# Document ingestion utilities: load uploaded files into LangChain Documents,
# chunk them, and index the chunks in a Chroma vector store.
from langchain.document_loaders import PyPDFLoader, PDFMinerLoader, DirectoryLoader
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from os.path import join
import os
from dotenv import load_dotenv
# load_dotenv(r'C:\Users\sksha\Desktop\llm-assignment-master\llm-assignment-master\llm-assignment-master_\backend\.env')
openai_api_key = os.environ.get('OPENAI_API_KEY')
from langchain.document_loaders import TextLoader, PDFMinerLoader, UnstructuredWordDocumentLoader, CSVLoader
from langchain_community.document_loaders import PyMuPDFLoader,TextLoader,CSVLoader,Docx2txtLoader,UnstructuredWordDocumentLoader
# def load_documents(file_path):
# if file_path.endswith('.txt'):
# loader = TextLoader(file_path)
# elif file_path.endswith('.pdf'):
# loader = PyPDFLoader(file_path)
# elif file_path.endswith('.doc') or file_path.endswith('.docx'):
# loader = UnstructuredWordDocumentLoader(file_path)
# elif file_path.endswith('.csv'):
# loader = CSVLoader(file_path)
# else:
# raise ValueError(f"Unsupported file format: {file_path}")
# documents = loader.load()
# return documents
from fastapi import UploadFile
from typing import List
import fitz # PyMuPDF
import pandas as pd
import docx
import tempfile
from langchain.docstore.document import Document
def read_pdf(file_path: str) -> List[Document]:
    """Load a PDF from *file_path* and return its pages as Documents.

    Fixed return annotation: ``loader.load()`` yields a list of LangChain
    Document objects (one per page), not a plain string.
    """
    loader = PyMuPDFLoader(file_path)
    return loader.load()
def read_docx(file_path: str) -> List[Document]:
    """Load a Word document from *file_path* and return it as Documents.

    Fixed return annotation: the loader returns ``List[Document]``,
    not ``str``.
    """
    loader = UnstructuredWordDocumentLoader(file_path)
    return loader.load()
def read_csv(file_path: str) -> List[Document]:
    """Load a CSV file from *file_path* and return its rows as Documents.

    Fixed return annotation: ``CSVLoader.load()`` returns
    ``List[Document]`` (one per row), not ``str``.
    """
    loader = CSVLoader(file_path)
    return loader.load()
def read_txt(file_path: str) -> List[Document]:
    """Load a plain-text file from *file_path* and return it as Documents.

    Fixed return annotation: ``TextLoader.load()`` returns
    ``List[Document]``, not ``str``.
    """
    loader = TextLoader(file_path)
    return loader.load()
async def load_documents(file: UploadFile) -> List[Document]:
    """Persist an uploaded file to a temporary path and load it into Documents.

    Supported extensions: .pdf, .docx, .csv, .txt. The loaders need a real
    filesystem path, so the upload is first written to a named temporary file
    (collision-safe, unlike the previous ``temp_<filename>`` in the CWD),
    dispatched to the matching ``read_*`` helper, and the temp file is removed
    afterwards.

    Returns the loader's list of Documents on success. On any failure —
    including an unsupported extension, whose ValueError is caught below —
    returns the string ``"Error processing document."`` (legacy best-effort
    behavior preserved for existing callers).
    """
    # Dispatch table replaces the if/elif chain over file.filename.
    readers = {
        '.pdf': read_pdf,
        '.docx': read_docx,
        '.csv': read_csv,
        '.txt': read_txt,
    }
    # Keep the original suffix so type-sensitive loaders see the right extension.
    suffix = os.path.splitext(file.filename)[1]
    temp_file_path = None
    try:
        # delete=False: the file must outlive the `with` so the loader can reopen it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            temp_file.write(await file.read())
            temp_file_path = temp_file.name
        reader = readers.get(suffix)
        if reader is None:
            raise ValueError("Unsupported file format")
        content = reader(temp_file_path)
    except Exception as e:
        # NOTE(review): swallowing the error and returning a sentinel string is
        # the pre-existing contract; callers appear to rely on it, so it stays.
        print(f"Error processing document: {e}")
        content = "Error processing document."
    finally:
        if temp_file_path and os.path.exists(temp_file_path):
            os.remove(temp_file_path)  # always clean up the temporary file
    return content
from langchain.text_splitter import CharacterTextSplitter
def chunk_documents(documents, chunk_size, chunk_overlap):
    """Split Documents into overlapping chunks with CharacterTextSplitter.

    ``chunk_size`` caps each chunk's length; ``chunk_overlap`` is how many
    characters consecutive chunks share.
    """
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(documents)
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
def create_embeddings(chunked_docs, collection_name):
    """Embed chunked Documents with OpenAI and index them in Chroma.

    Uses the module-level ``openai_api_key`` (read from the environment at
    import time). Persists the collection to disk before returning the
    vector store.

    Fix: removed the stray ``|`` scrape residue that trailed the final
    return statement and made the module unparseable.
    """
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    vector_store = Chroma.from_documents(chunked_docs, embeddings, collection_name=collection_name)
    vector_store.persist()
    return vector_store