import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma


# load a PDF, DOCX, TXT, or CSV file into LangChain Documents, dispatching on the file extension
def load_document(file):
    name, extension = os.path.splitext(file)

    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        print(f'Loading {file}')
        loader = PyPDFLoader(file)
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        print(f'Loading {file}')
        loader = Docx2txtLoader(file)
    elif extension == '.txt':
        from langchain.document_loaders import TextLoader
        print(f'Loading {file}')
        loader = TextLoader(file)
    elif extension == '.csv':
        from langchain.document_loaders import CSVLoader
        print(f'Loading {file}')
        loader = CSVLoader(file)
    else:
        print('Document format is not supported!')
        return None

    data = loader.load()
    return data


# chunk the data for embedding (note: chunk_size is counted in characters by default, not tokens)
def chunk_data(data, chunk_size=256, chunk_overlap=20):
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_documents(data)
    return chunks


# create embeddings with OpenAIEmbeddings() and store them in a Chroma vector store (in-memory by default)
def create_embeddings(chunks):
    embeddings = OpenAIEmbeddings()
    vector_store = Chroma.from_documents(chunks, embeddings)
    return vector_store
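

# A persistent variant is sketched below as an assumption -- the original keeps the
# index in memory. Chroma.from_documents accepts a persist_directory argument;
# './chroma_db' is an arbitrary example path, not part of the original code.
def create_embeddings_persistent(chunks, persist_directory='./chroma_db'):
    embeddings = OpenAIEmbeddings()
    vector_store = Chroma.from_documents(chunks, embeddings, persist_directory=persist_directory)
    return vector_store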

# ask a question against the vector store; combining RetrievalQA with ChatOpenAI is one approach, not the only one
def ask_and_get_answer(vector_store, q, k=3):
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI
    # use gpt-3.5-turbo and set the temperature to 1 for fairly creative answers (the OpenAI API accepts values up to 2)
    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)
    # the retriever returns the k most similar chunks from the vector store for each query
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': k})
    chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
    answer = chain.run(q)
    return answer


# return the embedding cost (using tiktoken)
def calculate_embedding_cost(texts):
    import tiktoken
    enc = tiktoken.encoding_for_model('text-embedding-ada-002')
    total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
    # $0.0004 per 1K tokens was the text-embedding-ada-002 launch price; check current pricing
    return total_tokens, total_tokens / 1000 * 0.0004
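

# Minimal end-to-end usage sketch, assuming OPENAI_API_KEY is set in the
# environment; 'files/sample.pdf' is a hypothetical placeholder path --
# substitute any supported document.
if __name__ == '__main__':
    data = load_document('files/sample.pdf')
    if data:
        chunks = chunk_data(data, chunk_size=256, chunk_overlap=20)
        total_tokens, cost = calculate_embedding_cost(chunks)
        print(f'Embedding cost: {total_tokens} tokens, ~${cost:.4f}')
        vector_store = create_embeddings(chunks)
        answer = ask_and_get_answer(vector_store, 'What is the document about?', k=3)
        print(answer)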