Spaces: Running
marcellopoliti committed
Commit • e04dd70
1 Parent(s): dd3a3a4
Add application file
- .gitignore +160 -0
- README.md +5 -11
- app.py +47 -0
- conda.yml +22 -0
- generate_kb.py +93 -0
- pages/create_knowledge_box.py +34 -0
- pages/delete_knowledge_box⚠️.py +16 -0
- pages/manage_knowledge_box.py +77 -0
- requirements.txt +16 -0
- retrieve_kb.py +34 -0
- services/document_manager/document_loader.py +83 -0
- services/embedding_manager/embedding_manager.py +19 -0
- services/vectordb_manager/vectordb_manager.py +163 -0
- test2.csv +3 -0
- test_marcello.csv +3 -0
- utils.py +31 -0
.gitignore
ADDED
@@ -0,0 +1,160 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
README.md
CHANGED
@@ -1,12 +1,6 @@
- ---
- title: Brian Knows Collections
- emoji: 🔥
- colorFrom: red
- colorTo: yellow
- sdk: streamlit
- sdk_version: 1.32.2
- app_file: app.py
- pinned: false
- ---
+ # brian-knows-streamlit
+
+ UI for kb editing
+
+ todo:
+ store splits with urls: check the TypeScript repo
app.py
ADDED
@@ -0,0 +1,47 @@
import streamlit as st
from utils import get_chroma_client, get_embedding_function

# streamlit_app.py

import hmac
import streamlit as st

__import__("pysqlite3")
import sys

sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")

st.set_page_config(page_title="Hello", page_icon="👋", layout="wide")


def check_password():
    """Returns `True` if the user had the correct password."""

    def password_entered():
        """Checks whether a password entered by the user is correct."""
        if hmac.compare_digest(st.session_state["password"], st.secrets["password"]):
            st.session_state["password_correct"] = True
            del st.session_state["password"]  # Don't store the password.
        else:
            st.session_state["password_correct"] = False

    # Return True if the password is validated.
    if st.session_state.get("password_correct", False):
        return True

    # Show input for password.
    st.text_input(
        "Password", type="password", on_change=password_entered, key="password"
    )
    if "password_correct" in st.session_state:
        st.error("😕 Password incorrect")
    return False


if not check_password():
    st.stop()  # Do not continue if check_password is not True.

# Main Streamlit app starts here
st.write("# Brian Knowledge Base System! 👋")
client = get_chroma_client()
default_embedding_function = get_embedding_function()
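app.py swaps the standard-library sqlite3 module for pysqlite3-binary before Chroma is imported, because chromadb needs SQLite >= 3.35 (hinted at by the commented sqlite3 line in requirements.txt) while many hosted base images ship an older build; it then gates the whole app behind a password read from st.secrets["password"], which has to be provided as a Space secret or in a local .streamlit/secrets.toml. A minimal check of the shim, assuming pysqlite3-binary from requirements.txt is installed:

# Illustrative check, not part of the commit: after the swap, the stdlib name
# "sqlite3" resolves to pysqlite3's bundled, newer SQLite build.
__import__("pysqlite3")
import sys

sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")

import sqlite3

print(sqlite3.sqlite_version)  # should report >= 3.35.0, which chromadb requires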
conda.yml
ADDED
@@ -0,0 +1,22 @@
name: brian_knows
channels:
  - conda-forge
  - defaults
dependencies:
  - python=3.11.4
  - pip
  - pip:
      - langchain>=0.1.6
      - openai==1.14.2
      - beautifulsoup4==4.12.2
      - tiktoken==0.5.1
      - chromadb>=0.4.22
      - pandas==2.1.1
      - streamlit==1.27.2
      - python-dotenv==1.0.0
      - fastapi==0.104.0
      - uvicorn==0.23.2
      - pypdf==3.16.4
      - python-multipart==0.0.6
      - matplotlib==3.8.3
      - umap-learn==0.5.5
generate_kb.py
ADDED
@@ -0,0 +1,93 @@
from fastapi import APIRouter
from services.document_manager.document_loader import DocumentsLoader

# from services.vectordb_manager.vectordb_manager import VectordbManager
import pandas as pd
import chromadb
from chromadb.config import Settings
import chromadb.utils.embedding_functions as embedding_functions
from dotenv import load_dotenv
import os
import requests
from bs4 import BeautifulSoup
from fastapi import FastAPI, File, UploadFile
import re
from main import client, default_embedding_function
import secrets


load_dotenv()
openai_key = os.getenv("OPENAI_API_KEY")


def generate_knowledge_box_from_url(
    client,
    kb_name: str,
    urls: list,
    embedding_fct=default_embedding_function,
    chunk_size: int = 2_000,
):
    dl = DocumentsLoader()
    docs = dl.load_docs(urls)
    splits = dl.split_docs(docs, chunk_size=chunk_size)
    contents = [split.page_content for split in splits]
    metadatas = [split.metadata for split in splits]
    cleaned_contents = [
        re.sub(r"\n+", " ", content) for content in contents
    ]  # clean text a bit
    chroma_collection = client.create_collection(
        kb_name,
        embedding_function=embedding_fct,
        metadata={"hnsw:space": "cosine"},
    )
    ids = [secrets.token_hex(16) for _ in cleaned_contents]
    chroma_collection.add(documents=cleaned_contents, ids=ids, metadatas=metadatas)
    n_splits = chroma_collection.count()
    return {"status": 200, "n_split": n_splits}


def add_links_to_knowledge_base(
    client,
    kb_name: str,
    urls: list,
    chunk_size: int = 2_000,
    embedding_fct=default_embedding_function,
):
    dl = DocumentsLoader()
    docs = dl.load_docs(urls)
    splits = dl.split_docs(docs, chunk_size=chunk_size)
    contents = [split.page_content for split in splits]
    metadatas = [split.metadata for split in splits]
    cleaned_contents = [
        re.sub(r"\n+", " ", content) for content in contents
    ]  # clean text a bit
    embeddings = default_embedding_function(cleaned_contents)
    chroma_collection = client.get_collection(name=kb_name)
    ids = [secrets.token_hex(16) for _ in cleaned_contents]
    chroma_collection.add(
        documents=cleaned_contents, embeddings=embeddings, ids=ids, metadatas=metadatas
    )
    n_splits = chroma_collection.count()
    return {"status": 200, "n_split": n_splits}


if __name__ == "__main__":
    df = pd.read_csv("test_marcello.csv")

    kb_name = "new_new_test"
    urls = df.values.tolist()
    # res = generate_knowledge_box_from_url(
    #     client=client,
    #     urls=urls,
    #     kb_name=kb_name,
    #     embedding_fct=default_embedding_function,
    #     chunk_size=2_000,
    # )

    df = pd.read_csv("test2.csv")
    urls = df.values.tolist()
    res = add_links_to_knowledge_base(
        client=client,
        kb_name="test",
        urls=urls,
    )
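Both helpers only report a status code and the number of stored splits. A quick way to confirm that ingestion actually worked is to query the collection through the same Chroma client; a rough sketch, assuming the server from utils.py is reachable and a collection named "test" already exists (the question text is only an example):

# Illustrative check, not part of the commit.
from utils import get_chroma_client, get_embedding_function

client = get_chroma_client()
collection = client.get_collection(name="test", embedding_function=get_embedding_function())

hits = collection.query(query_texts=["Who is Kobe Bryant?"], n_results=3)
for doc, meta in zip(hits["documents"][0], hits["metadatas"][0]):
    print(meta.get("source"), "->", doc[:80])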
pages/create_knowledge_box.py
ADDED
@@ -0,0 +1,34 @@
import streamlit as st
from main import client, default_embedding_function
import pandas as pd
from generate_kb import generate_knowledge_box_from_url

# Title of the app
st.title("Create a knowledge box from CSV file")

# File uploader widget
uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
df = None

if uploaded_file is not None:
    try:
        df = pd.read_csv(uploaded_file)
        st.write("DataFrame:")
        st.write(df)
    except Exception as e:
        st.error(str(e))


if uploaded_file is not None:
    st.text("don't use spaces but underscores _ in your new name")
    kb_name = st.text_input(label="new knowledge base name")
    if st.button("Generate new knowledge box"):
        urls = df.values.tolist()
        res = generate_knowledge_box_from_url(
            client=client,
            urls=urls,
            kb_name=kb_name,
            embedding_fct=default_embedding_function,
            chunk_size=2_000,
        )
        st.json(res)
pages/delete_knowledge_box⚠️.py
ADDED
@@ -0,0 +1,16 @@
import streamlit as st
from retrieve_kb import get_current_knowledge_bases
from main import client


st.title("Delete knowledge Base ☠️")

st.title("Get knowledge boxes")
if st.button("Get current knowledge bases"):
    kbs = get_current_knowledge_bases(client=client)
    st.json(kbs)

collection_name = st.text_input(label="collection name")
if st.button("Delete Forever"):
    client.delete_collection(collection_name)
    st.success("Deleted")
pages/manage_knowledge_box.py
ADDED
@@ -0,0 +1,77 @@
import streamlit as st
from retrieve_kb import get_current_knowledge_bases, get_knowledge_base_information
from generate_kb import add_links_to_knowledge_base
from main import client, default_embedding_function
import pandas as pd

st.title("Get knowledge boxes")

if st.button("Get current knowledge bases"):
    kbs = get_current_knowledge_bases(client=client)
    st.json(kbs)

collection_name = st.text_input(label="knowledge base name")
info = {}
collection = None

if "df" not in st.session_state:
    st.session_state["df"] = pd.DataFrame()

col1, col2 = st.columns(2)

if st.button("Get All"):
    collection_info, coll = get_knowledge_base_information(
        client=client,
        embedding_function=default_embedding_function,
        kb_name=collection_name,
    )
    st.session_state["collection"] = coll
    collection = coll
    # st.write(collection_info)
    df = pd.DataFrame.from_records(collection_info)
    df["source"] = df["metadatas"].apply(lambda x: x.get("source", "unknown"))
    df["title"] = df["metadatas"].apply(lambda x: x.get("title", "unknown"))
    df = df[["documents", "source", "title", "ids"]]
    st.session_state["df"] = df


if len(st.session_state["df"]) != 0:
    st.dataframe(st.session_state["df"], width=3_000)
    unique_df = st.session_state["df"]["source"].unique()
    st.text(f"unique urls: {len(unique_df)}")
    st.dataframe(unique_df)
    st.header("Remove a split")
    id = st.text_input("Insert a split id")
    if st.button("Remove Id from collection"):
        if id in st.session_state["df"]["ids"].values.tolist():
            res = st.session_state["collection"].delete(ids=[id])
            st.success(f"id {id} deleted")
        else:
            st.error(f"id {id} not in kb")

st.header("Add url to existing collection")
url_text = st.text_input("Insert a url link")
if st.button("add url to collection"):
    urls = [url_text]  # put in a list even if only one
    res = add_links_to_knowledge_base(client=client, kb_name=collection_name, urls=urls)
    st.write(res)


st.header("Add csv to existing collection")
uploaded_file = st.file_uploader("Choose a CSV file", type=["csv"])
df = None

if uploaded_file is not None:
    try:
        new_df = pd.read_csv(uploaded_file)
        st.write("DataFrame:")
        st.write(new_df)
    except Exception as e:
        st.error(str(e))
    if st.button("add csv urls to collection"):
        urls = new_df.values.tolist()
        st.write(urls)
        res = add_links_to_knowledge_base(
            client=client, kb_name=collection_name, urls=urls
        )
        st.write(res)
requirements.txt
ADDED
@@ -0,0 +1,16 @@
tiktoken>=0.5.1
pysqlite3-binary
langchain>=0.1.6
#sqlite3>=3.35.0
chromadb>=0.4.22
openai==1.14.2
beautifulsoup4==4.12.2
pandas>=2.1.1
streamlit>=1.27.2
python-dotenv==1.0.0
fastapi>=0.104.0
uvicorn>=0.23.2
#pypdf==3.16.4
#python-multipart==0.0.6
#matplotlib==3.8.3
#umap-learn==0.5.5
retrieve_kb.py
ADDED
@@ -0,0 +1,34 @@
from fastapi import APIRouter
import chromadb
from chromadb.config import Settings
from utils import get_chroma_client, get_embedding_function


router = APIRouter()
default_embedding_function = get_embedding_function()


def get_current_knowledge_bases(client):
    knowledge_boxes = client.list_collections()
    return knowledge_boxes


def get_knowledge_base_information(
    client, kb_name: str, embedding_function=default_embedding_function
):
    collection = client.get_collection(
        name=kb_name, embedding_function=embedding_function
    )

    collection_info = collection.get(
        include=["documents", "metadatas"]
    )  # you can add "embeddings", "metadatas",

    return collection_info, collection


if __name__ == "__main__":
    client = get_chroma_client()
    knowledge_boxes = get_current_knowledge_bases(client=client)
    for kb in knowledge_boxes:
        print(kb.name)
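get_knowledge_base_information returns the raw collection.get() payload, a dict of parallel lists keyed by "ids", "documents" and "metadatas", plus the collection handle itself; that pair is what pages/manage_knowledge_box.py turns into a DataFrame. A minimal sketch of calling it directly, assuming the "test" collection used in generate_kb.py exists:

# Illustrative usage, not part of the commit.
from utils import get_chroma_client
from retrieve_kb import get_knowledge_base_information

client = get_chroma_client()
info, collection = get_knowledge_base_information(client=client, kb_name="test")

print(collection.name, "holds", len(info["ids"]), "splits")
if info["documents"]:
    print(info["documents"][0][:80], "...")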
services/document_manager/document_loader.py
ADDED
@@ -0,0 +1,83 @@
import re
from langchain.document_loaders import PyPDFLoader
import pandas as pd
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import requests


class DocumentsLoader:
    def __init__(self) -> None:
        pass

    def load_urls_from_csv(self, url_path: str, column: str = "url"):
        df = pd.read_csv(url_path)
        doc_urls = df[column].to_list()
        return doc_urls

    def is_notion_url(self, url):
        # Regular expressions to match Notion URLs
        notion_regex = r"https://(www\.)?(notion\.so|notion\.site)/"
        # Check if the URL matches the Notion regex
        return re.match(notion_regex, url) is not None

    def is_pdf_url(self, url):
        # Define a list of common PDF file extensions
        pdf_extensions = [".pdf"]

        # Check if the URL ends with a PDF file extension
        for extension in pdf_extensions:
            if url.endswith(extension):
                return True

        return False

    def is_valid_url(self, url):
        # TODO: handle status codes not 200
        try:
            response = requests.head(url)
            if response.status_code == 200:
                return True  # 200 status code indicates a valid URL
        except requests.RequestException:
            return False

    def load_docs(self, doc_urls: list) -> list:
        web_urls, pdf_urls, docs = [], [], []
        if isinstance(doc_urls[0], list):
            doc_urls = [doc[0] for doc in doc_urls]
            # doc_urls = doc_urls[0]
        for url in doc_urls:
            if self.is_pdf_url(url):
                pdf_urls.append(url)
            else:
                web_urls.append(url)

        if len(web_urls) > 0:
            web_urls = [url for url in web_urls if self.is_valid_url(url)]
            for web_url in web_urls:
                try:
                    web_loader = WebBaseLoader(web_url)
                    web_docs = web_loader.load()
                    docs = docs + web_docs
                except Exception as e:
                    print(f"Error web loader, {web_url}: {str(e)}")

        if len(pdf_urls) > 0:
            pdf_urls = [url for url in pdf_urls if self.is_valid_url(url)]
            for pdf_url in pdf_urls:
                try:
                    pdf_loader = PyPDFLoader(pdf_url)
                    pdf_docs = pdf_loader.load()
                    docs = docs + pdf_docs
                except Exception as e:
                    print(f"Error pdf loader, {pdf_url}: {str(e)}")
        return docs

    def split_docs(self, docs, chunk_size=2000):
        r_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=0,
            separators=["\n\n", "\n", "\. ", " ", ""],
        )
        splits = r_splitter.split_documents(docs)
        return splits
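DocumentsLoader routes every URL to either WebBaseLoader or PyPDFLoader (based on the .pdf suffix), drops URLs that do not answer a HEAD request with 200, and chunks the results with RecursiveCharacterTextSplitter. A short end-to-end sketch, assuming network access and the CSVs shipped in this commit:

# Illustrative usage, not part of the commit.
from services.document_manager.document_loader import DocumentsLoader

dl = DocumentsLoader()
urls = dl.load_urls_from_csv("test2.csv")       # expects a "url" column
docs = dl.load_docs(urls)                       # web pages / PDFs become LangChain Documents
splits = dl.split_docs(docs, chunk_size=2_000)  # ~2000-character chunks, no overlap

print(len(docs), "documents ->", len(splits), "splits")
print(splits[0].metadata)                       # typically includes "source" and "title"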
services/embedding_manager/embedding_manager.py
ADDED
@@ -0,0 +1,19 @@
import numpy as np
from langchain.embeddings.openai import OpenAIEmbeddings
from utils import ModelName


class EmbeddingManager:
    def __init__(self, model_name=ModelName.OPENAI) -> None:
        self.model_name = model_name

    def compare_embeddigns_similarity(self, embedding_1, embedding_2):
        similarity = np.dot(embedding_1, embedding_2)
        return similarity

    def generate_embeddings(self, splits: list[str]):
        embedding = None
        if self.model_name == ModelName.OPENAI:
            embedding = OpenAIEmbeddings()
        embeddings = [embedding.embed_query(split) for split in splits]
        return embeddings
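compare_embeddigns_similarity returns a plain dot product, which equals cosine similarity only when both vectors have unit length; that holds for OpenAI's text-embedding-ada-002 output, but a norm-dividing version is safer if other models are plugged in. A small comparison with toy vectors:

# Toy example, not part of the commit: dot product vs. explicit cosine similarity.
import numpy as np

def cosine_similarity(a, b):
    a, b = np.asarray(a), np.asarray(b)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

v1 = np.array([0.6, 0.8])  # unit-length stand-ins for embeddings
v2 = np.array([0.8, 0.6])
print(np.dot(v1, v2), cosine_similarity(v1, v2))  # both 0.96 because the vectors are normalized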
services/vectordb_manager/vectordb_manager.py
ADDED
@@ -0,0 +1,163 @@
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.embeddings import OpenAIEmbeddings

# from langchain_community.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from src.utils import brian_knows_system_message
from uuid import uuid4

import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions

import sys
import os
import openai

import logging

sys.path.append("../..")
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())  # read local .env file
openai.api_key = os.environ["OPENAI_API_KEY"]


class VectordbManager:
    def __init__(
        self,
        knowledge_base_name: str,
    ) -> None:
        self.knowledge_base_name = knowledge_base_name
        self.vector_db = None

    def load_vectordb(
        self,
        embedding_function=OpenAIEmbeddings(),
    ):
        client = chromadb.HttpClient(
            host="chroma.brianknows.org",
            port="443",
            ssl=True,
            settings=Settings(allow_reset=True),
        )
        vectordb = Chroma(embedding_function=embedding_function, client=client)
        self.vector_db = vectordb

    def load_collection(self, embedding_function=OpenAIEmbeddings()):

        client = chromadb.HttpClient(
            host="chroma.brianknows.org",
            port=443,
            ssl=True,
            settings=Settings(
                allow_reset=True,
            ),
        )

        collection = client.get_collection(
            self.knowledge_base_name,
            embedding_function=embedding_functions.OpenAIEmbeddingFunction(
                api_key=os.environ["OPENAI_API_KEY"]
            ),
        )
        return collection

    def create_vector_db(self, splits: list, knowledge_base_name: str):
        logging.info("create_vector_db")
        embedding_fn = OpenAIEmbeddings()

        try:
            client = chromadb.HttpClient(
                host="chroma.brianknows.org",
                port=443,
                ssl=True,
                settings=Settings(
                    allow_reset=True,
                ),
            )
            collection = client.get_or_create_collection(
                knowledge_base_name,
                embedding_function=embedding_functions.OpenAIEmbeddingFunction(
                    api_key=os.environ["OPENAI_API_KEY"]
                ),
            )

            ids = []
            metadatas = []
            documents = []

            for split in splits:
                ids.append(str(uuid4()))
                metadatas.append(split.metadata)
                documents.append(split.page_content)
            collection.add(documents=documents, ids=ids, metadatas=metadatas)
            vector_db = Chroma.from_documents(
                documents=splits, embedding=embedding_fn, client=client
            )
            self.vector_db = vector_db

        except Exception as e:
            logging.error(f"error in creating db: {str(e)}")

    def add_splits_to_existing_vectordb(
        self,
        splits: list,
    ):
        for split in splits:
            try:
                self.vector_db.add_documents([split])
                print("document loaded!")
            except Exception as e:
                print(f"Error with doc : {split}")
                print(e)

    def retrieve_docs_from_query(self, query: str, k=2, fetch_k=3) -> list:
        """
        query : Text to look up documents similar to.
        k : Number of Documents to return. Defaults to 4.
        fetch_k : Number of Documents to fetch to pass to MMR algorithm.
        lambda_mult : Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.
        """
        retrieved_docs = self.vector_db.max_marginal_relevance_search(
            query, k=k, fetch_k=fetch_k
        )
        return retrieved_docs

    def retrieve_qa(
        self,
        llm,
        query: str,
        score_threshold: float = 0.65,
        system_message=brian_knows_system_message,
    ):
        """return llm answer based on docs"""

        # Build prompt
        template = """You are a Web3 assistant. Use the following pieces of context to answer the question at \
the end. If you don't know the answer, just say: "I don't know". Don't try to make up an \
answer! Always provide a detailed and comprehensive response. """

        fixed_template = """ {context}
Question: {question}
Detailed Answer:"""

        template = system_message + fixed_template

        QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

        # Run chain
        qa_chain = RetrievalQA.from_chain_type(
            llm,
            retriever=self.vector_db.as_retriever(
                search_type="similarity_score_threshold",
                search_kwargs={"score_threshold": score_threshold},
            ),
            return_source_documents=True,
            chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
            # reduce_k_below_max_tokens=True,
        )
        result = qa_chain({"query": query})

        return result
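VectordbManager is the retrieval and QA side: load_vectordb attaches a LangChain Chroma store to the remote server, retrieve_docs_from_query runs MMR search over it, and retrieve_qa wires a RetrievalQA chain behind a similarity-score-threshold retriever. A rough usage sketch, with the caveats that the module imports brian_knows_system_message from src.utils (not included in this commit) and that the ChatOpenAI model below is an assumption, not something the commit specifies:

# Illustrative usage, not part of the commit; assumes OPENAI_API_KEY is set,
# chroma.brianknows.org is reachable, and src.utils resolves.
from langchain.chat_models import ChatOpenAI
from services.vectordb_manager.vectordb_manager import VectordbManager

manager = VectordbManager(knowledge_base_name="test")
manager.load_vectordb()  # TLS connection to the hosted Chroma instance

for doc in manager.retrieve_docs_from_query("Who is Michael Jordan?", k=2, fetch_k=5):
    print(doc.metadata.get("source"))

llm = ChatOpenAI(temperature=0)  # assumed LLM wrapper
answer = manager.retrieve_qa(llm=llm, query="Who is Michael Jordan?")
print(answer["result"])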
test2.csv
ADDED
@@ -0,0 +1,3 @@
url
https://it.wikipedia.org/wiki/Michael_Jordan
https://en.wikipedia.org/wiki/Kobe_Bryant
test_marcello.csv
ADDED
@@ -0,0 +1,3 @@
url
https://en.wikipedia.org/wiki/Dragon_Ball
https://en.wikipedia.org/wiki/Naruto
utils.py
ADDED
@@ -0,0 +1,31 @@
import chromadb
from chromadb.config import Settings
import chromadb.utils.embedding_functions as embedding_functions
from dotenv import load_dotenv
import os

load_dotenv()
openai_key = os.getenv("OPENAI_API_KEY")


def get_chroma_client(
    host: str = "chroma.brianknows.org",
    port: int = 443,
) -> chromadb.HttpClient:
    chroma_client = chromadb.HttpClient(
        host=host,
        port=port,
        ssl=True,
        settings=Settings(
            allow_reset=True,
        ),
    )

    return chroma_client


def get_embedding_function(model_name="text-embedding-ada-002"):
    openai_ef = embedding_functions.OpenAIEmbeddingFunction(
        api_key=openai_key, model_name=model_name
    )
    return openai_ef
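utils.py centralizes the two objects every page needs: an HTTPS Chroma client pointed at chroma.brianknows.org and an OpenAI text-embedding-ada-002 embedding function. A minimal sketch of using them together (the collection name below is only an example):

# Illustrative usage, not part of the commit.
from utils import get_chroma_client, get_embedding_function

client = get_chroma_client()             # HTTPS client for chroma.brianknows.org:443
embedding_fn = get_embedding_function()  # OpenAI text-embedding-ada-002

print([c.name for c in client.list_collections()])

collection = client.get_or_create_collection(
    "scratch_kb",                        # hypothetical collection name
    embedding_function=embedding_fn,
    metadata={"hnsw:space": "cosine"},
)
collection.add(documents=["hello knowledge base"], ids=["demo-1"])
print(collection.count())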