import streamlit as st
from langchain_community.vectorstores import FAISS, Chroma, Pinecone
from langchain_community.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader, PyPDFLoader
import tempfile
import os
import torch
# Initialize session state variables
if 'vectorstore' not in st.session_state:
    st.session_state.vectorstore = None
if 'documents' not in st.session_state:
    st.session_state.documents = None
st.set_page_config(page_title="🗃️ Vector Store Explorer", layout="wide")

st.title("🗃️ Vector Store Explorer")
st.markdown("""
Explore different vector stores and embeddings in LangChain. Upload documents, create embeddings,
and perform semantic search!
""")
# Main tabs
main_tab1, main_tab2, main_tab3 = st.tabs(["📚 Document Processing", "🔍 Vector Store Operations", "📖 Learning Center"])
with main_tab1:
    col1, col2 = st.columns([1, 1])

    with col1:
        st.header("Document Upload")
        file_type = st.selectbox("Select File Type", ["Text", "PDF"])
        uploaded_file = st.file_uploader(
            "Upload your document",
            type=["txt", "pdf"],
            help="Upload a document to create vector embeddings"
        )

        if uploaded_file:
            try:
                # Persist the upload to a temp file so the loaders can read it from disk
                suffix = ".pdf" if file_type == "PDF" else ".txt"
                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                    tmp_file.write(uploaded_file.getvalue())
                    tmp_file_path = tmp_file.name

                loader = TextLoader(tmp_file_path) if file_type == "Text" else PyPDFLoader(tmp_file_path)
                st.session_state.documents = loader.load()
                st.success("Document loaded successfully!")

                # Clean up temp file
                os.unlink(tmp_file_path)
            except Exception as e:
                st.error(f"Error loading document: {str(e)}")
    with col2:
        st.header("Text Processing")
        if st.session_state.documents:
            chunk_size = st.slider("Chunk Size", 100, 2000, 500)
            chunk_overlap = st.slider("Chunk Overlap", 0, 200, 50)

            if st.button("Process Text"):
                text_splitter = CharacterTextSplitter(
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_overlap
                )
                st.session_state.splits = text_splitter.split_documents(st.session_state.documents)
                st.success(f"Created {len(st.session_state.splits)} text chunks!")

                with st.expander("Preview Chunks"):
                    for i, chunk in enumerate(st.session_state.splits[:3]):
                        st.markdown(f"**Chunk {i+1}**")
                        st.write(chunk.page_content)
                        st.markdown("---")
with main_tab2:
    if 'splits' in st.session_state:
        col1, col2 = st.columns([1, 1])

        with col1:
            st.header("Vector Store Configuration")

            vectorstore_type = st.selectbox(
                "Select Vector Store",
                ["FAISS", "Chroma"],
                help="Choose the vector store implementation"
            )

            embedding_type = st.selectbox(
                "Select Embeddings",
                ["OpenAI", "HuggingFace"],
                help="Choose the embedding model"
            )
            embeddings = None
            if embedding_type == "OpenAI":
                api_key = st.text_input("OpenAI API Key", type="password")
                if api_key:
                    os.environ["OPENAI_API_KEY"] = api_key
                    embeddings = OpenAIEmbeddings()
            else:
                model_name = st.selectbox(
                    "Select HuggingFace Model",
                    ["sentence-transformers/all-mpnet-base-v2",
                     "sentence-transformers/all-MiniLM-L6-v2"]
                )
                embeddings = HuggingFaceEmbeddings(model_name=model_name)

            if st.button("Create Vector Store"):
                if embeddings is None:
                    st.warning("Please enter your OpenAI API key first.")
                else:
                    try:
                        with st.spinner("Creating vector store..."):
                            if vectorstore_type == "FAISS":
                                st.session_state.vectorstore = FAISS.from_documents(
                                    st.session_state.splits,
                                    embeddings
                                )
                            else:
                                st.session_state.vectorstore = Chroma.from_documents(
                                    st.session_state.splits,
                                    embeddings
                                )
                        st.success("Vector store created successfully!")
                    except Exception as e:
                        st.error(f"Error creating vector store: {str(e)}")
        with col2:
            st.header("Semantic Search")
            if st.session_state.vectorstore:
                query = st.text_input("Enter your search query")
                k = st.slider("Number of results", 1, 10, 3)

                if query:
                    with st.spinner("Searching..."):
                        results = st.session_state.vectorstore.similarity_search(query, k=k)

                    st.subheader("Search Results")
                    for i, doc in enumerate(results):
                        with st.expander(f"Result {i+1}"):
                            st.write(doc.page_content)
                            st.markdown("**Metadata:**")
                            st.json(doc.metadata)
with main_tab3:
    learn_tab1, learn_tab2, learn_tab3 = st.tabs(["Vector Stores", "Embeddings", "Best Practices"])

    with learn_tab1:
        st.markdown("""
### What are Vector Stores?
Vector stores are specialized databases that store and retrieve vector embeddings efficiently. They enable:
- Semantic search capabilities
- Similarity matching
- Efficient nearest neighbor search

### Available Vector Stores
| Store | Description | Best For |
|-------|-------------|----------|
| FAISS | In-memory, efficient similarity search | Local development, small-medium datasets |
| Chroma | Simple, persistent vector store | Local development, getting started |
| Pinecone | Managed vector database service | Production, large-scale deployments |
        """)
    with learn_tab2:
        st.markdown("""
### Understanding Embeddings
Embeddings are numerical representations of text that capture semantic meaning. They:
- Convert text to dense vectors
- Enable semantic similarity comparison
- Form the basis for vector search

### Embedding Models
- **OpenAI**: High quality, but requires an API key and costs money
- **HuggingFace**: Free, open-source alternatives
  - all-mpnet-base-v2: High quality, slower
  - all-MiniLM-L6-v2: Good quality, faster
        """)
    with learn_tab3:
        st.markdown("""
### Vector Store Best Practices
1. **Chunk Size Selection**
   - Smaller chunks for precise retrieval
   - Larger chunks for more context
2. **Embedding Model Selection**
   - Consider cost vs. quality tradeoff
   - Test different models for your use case
3. **Performance Optimization**
   - Use appropriate batch sizes
   - Consider hardware limitations
   - Monitor memory usage
4. **Search Optimization**
   - Experiment with different k values
   - Use metadata filtering when available
   - Consider hybrid search approaches
        """)
# Sidebar
st.sidebar.header("📋 Instructions")
st.sidebar.markdown("""
1. **Upload Document**
   - Select file type
   - Upload your document
   - Process into chunks
2. **Create Vector Store**
   - Choose vector store type
   - Select embedding model
   - Configure settings
3. **Search**
   - Enter search query
   - Adjust number of results
   - Explore similar documents
""")