import os
import tempfile
from typing import List

from dotenv import load_dotenv
from fastapi import UploadFile

from langchain.docstore.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import (
    CSVLoader,
    PyMuPDFLoader,
    TextLoader,
    UnstructuredWordDocumentLoader,
)

# Load environment variables from a local .env file if present. The
# original hard-coded an absolute Windows path here:
# load_dotenv(r'C:\Users\sksha\Desktop\llm-assignment-master\llm-assignment-master\llm-assignment-master_\backend\.env')
load_dotenv()
openai_api_key = os.environ.get('OPENAI_API_KEY')


def read_pdf(file_path: str) -> List[Document]:
    """Load a PDF with PyMuPDF; returns one Document per page."""
    return PyMuPDFLoader(file_path).load()


def read_docx(file_path: str) -> List[Document]:
    """Load a Word document via unstructured."""
    return UnstructuredWordDocumentLoader(file_path).load()


def read_csv(file_path: str) -> List[Document]:
    """Load a CSV file; returns one Document per row."""
    return CSVLoader(file_path).load()


def read_txt(file_path: str) -> List[Document]:
    """Load a plain-text file as a single Document."""
    return TextLoader(file_path).load()
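
# Hedged usage sketch: each helper above returns a list of LangChain
# Document objects rather than a plain string. "report.pdf" below is a
# placeholder path, not a file that ships with this project.
def _example_read_pdf() -> None:
    pages = read_pdf("report.pdf")  # assumes report.pdf exists locally
    print(f"{len(pages)} page(s); first metadata: {pages[0].metadata}")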

async def load_documents(file: UploadFile) -> List[Document]:
    """Save the upload to a temporary file, then dispatch by extension."""
    suffix = os.path.splitext(file.filename)[1].lower()

    # Write to a unique temporary file so concurrent uploads with the same
    # filename cannot collide; keep the suffix so the loaders see it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_file.write(await file.read())
        temp_file_path = temp_file.name

    try:
        if suffix == '.pdf':
            docs = read_pdf(temp_file_path)
        elif suffix == '.docx':
            docs = read_docx(temp_file_path)
        elif suffix == '.csv':
            docs = read_csv(temp_file_path)
        elif suffix == '.txt':
            docs = read_txt(temp_file_path)
        else:
            raise ValueError(f"Unsupported file format: {file.filename}")
    except Exception as e:
        # Surface the failure instead of returning a bare string, which
        # would break downstream document splitting.
        print(f"Error processing document: {e}")
        raise
    finally:
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)  # Clean up the temporary file

    return docs
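
# Hedged usage sketch: one way a FastAPI route might call load_documents.
# The handler name and response shape are assumptions; wire it up with an
# @app.post(...) decorator in the application module that owns the app.
async def _example_upload_handler(file: UploadFile) -> dict:
    docs = await load_documents(file)
    return {"filename": file.filename, "documents": len(docs)}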




def chunk_documents(documents, chunk_size, chunk_overlap):
    """Split documents into overlapping chunks for embedding.

    CharacterTextSplitter splits on its default separator (a blank line)
    and merges pieces up to chunk_size characters, with chunk_overlap
    characters shared between consecutive chunks.
    """
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(documents)
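
# Hedged usage sketch: chunk a single in-memory Document. chunk_size=1000
# and chunk_overlap=100 are illustrative values, not settings taken from
# the original code.
def _example_chunking() -> None:
    sample = [Document(page_content="first paragraph\n\nsecond paragraph")]
    chunks = chunk_documents(sample, chunk_size=1000, chunk_overlap=100)
    print(f"{len(chunks)} chunk(s) produced")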


def create_embeddings(chunked_docs, collection_name, persist_directory=None):
    """Embed chunks with OpenAI and index them in a Chroma collection."""
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    vector_store = Chroma.from_documents(
        chunked_docs,
        embeddings,
        collection_name=collection_name,
        persist_directory=persist_directory,
    )
    if persist_directory is not None:
        # Chroma's persist() requires a persist_directory; the original
        # unconditional call would fail for an in-memory store, so only
        # persist when a directory was given.
        vector_store.persist()

    return vector_store
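
# Hedged end-to-end sketch wiring the helpers together. "sample.txt" and
# the "demo" collection name are placeholders, and the embedding step
# requires OPENAI_API_KEY to be set in the environment.
if __name__ == "__main__":
    docs = read_txt("sample.txt")
    chunks = chunk_documents(docs, chunk_size=1000, chunk_overlap=100)
    store = create_embeddings(chunks, collection_name="demo")
    for hit in store.similarity_search("What is this document about?", k=2):
        print(hit.page_content[:200])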