import os
import tempfile

import streamlit as st
from langchain_community.vectorstores import FAISS, Chroma
from langchain_community.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader, PyPDFLoader

# set_page_config must be the first Streamlit command in the script
st.set_page_config(page_title="Vector Store Explorer", layout="wide")

# Initialize session state variables
if 'vectorstore' not in st.session_state:
    st.session_state.vectorstore = None
if 'documents' not in st.session_state:
    st.session_state.documents = None
st.title("ποΈ Vector Store Explorer")
st.markdown("""
Explore different vector stores and embeddings in LangChain. Upload documents, create embeddings,
and perform semantic search!
""")
# Main tabs
main_tab1, main_tab2, main_tab3 = st.tabs(["Document Processing", "Vector Store Operations", "Learning Center"])
with main_tab1:
    col1, col2 = st.columns([1, 1])

    with col1:
        st.header("Document Upload")
        file_type = st.selectbox("Select File Type", ["Text", "PDF"])
        uploaded_file = st.file_uploader(
            "Upload your document",
            type=["txt", "pdf"],
            help="Upload a document to create vector embeddings"
        )

        if uploaded_file:
            try:
                # Use the proper extension (".txt", not ".text") for the temp file
                suffix = ".txt" if file_type == "Text" else ".pdf"
                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                    tmp_file.write(uploaded_file.getvalue())
                    tmp_file_path = tmp_file.name

                loader = TextLoader(tmp_file_path) if file_type == "Text" else PyPDFLoader(tmp_file_path)
                st.session_state.documents = loader.load()
                st.success("Document loaded successfully!")

                # Clean up temp file
                os.unlink(tmp_file_path)
            except Exception as e:
                st.error(f"Error loading document: {str(e)}")
    with col2:
        st.header("Text Processing")
        if st.session_state.documents:
            chunk_size = st.slider("Chunk Size", 100, 2000, 500)
            chunk_overlap = st.slider("Chunk Overlap", 0, 200, 50)

            if st.button("Process Text"):
                text_splitter = CharacterTextSplitter(
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_overlap
                )
                st.session_state.splits = text_splitter.split_documents(st.session_state.documents)
                st.success(f"Created {len(st.session_state.splits)} text chunks!")

                with st.expander("Preview Chunks"):
                    for i, chunk in enumerate(st.session_state.splits[:3]):
                        st.markdown(f"**Chunk {i+1}**")
                        st.write(chunk.page_content)
                        st.markdown("---")
with main_tab2:
    if 'splits' in st.session_state:
        col1, col2 = st.columns([1, 1])

        with col1:
            st.header("Vector Store Configuration")
            vectorstore_type = st.selectbox(
                "Select Vector Store",
                ["FAISS", "Chroma"],
                help="Choose the vector store implementation"
            )
            embedding_type = st.selectbox(
                "Select Embeddings",
                ["OpenAI", "HuggingFace"],
                help="Choose the embedding model"
            )

            # embeddings stays None until the user has supplied enough configuration
            embeddings = None
            if embedding_type == "OpenAI":
                api_key = st.text_input("OpenAI API Key", type="password")
                if api_key:
                    os.environ["OPENAI_API_KEY"] = api_key
                    embeddings = OpenAIEmbeddings()
            else:
                model_name = st.selectbox(
                    "Select HuggingFace Model",
                    ["sentence-transformers/all-mpnet-base-v2",
                     "sentence-transformers/all-MiniLM-L6-v2"]
                )
                embeddings = HuggingFaceEmbeddings(model_name=model_name)

            if st.button("Create Vector Store"):
                if embeddings is None:
                    st.warning("Please configure an embedding model first (OpenAI requires an API key).")
                else:
                    try:
                        with st.spinner("Creating vector store..."):
                            if vectorstore_type == "FAISS":
                                st.session_state.vectorstore = FAISS.from_documents(
                                    st.session_state.splits,
                                    embeddings
                                )
                            else:
                                st.session_state.vectorstore = Chroma.from_documents(
                                    st.session_state.splits,
                                    embeddings
                                )
                        st.success("Vector store created successfully!")
                    except Exception as e:
                        st.error(f"Error creating vector store: {str(e)}")
        with col2:
            st.header("Semantic Search")
            if st.session_state.vectorstore:
                query = st.text_input("Enter your search query")
                k = st.slider("Number of results", 1, 10, 3)

                if query:
                    with st.spinner("Searching..."):
                        results = st.session_state.vectorstore.similarity_search(query, k=k)

                    st.subheader("Search Results")
                    for i, doc in enumerate(results):
                        with st.expander(f"Result {i+1}"):
                            st.write(doc.page_content)
                            st.markdown("**Metadata:**")
                            st.json(doc.metadata)
with main_tab3:
    learn_tab1, learn_tab2, learn_tab3 = st.tabs(["Vector Stores", "Embeddings", "Best Practices"])

    with learn_tab1:
        st.markdown("""
### What are Vector Stores?
Vector stores are specialized databases that store and retrieve vector embeddings efficiently. They enable:
- Semantic search capabilities
- Similarity matching
- Efficient nearest-neighbor search

### Available Vector Stores
| Store | Description | Best For |
|-------|-------------|----------|
| FAISS | In-memory, efficient similarity search | Local development, small-to-medium datasets |
| Chroma | Simple, persistent vector store | Local development, getting started |
| Pinecone | Managed vector database service | Production, large-scale deployments |
""")
    with learn_tab2:
        st.markdown("""
### Understanding Embeddings
Embeddings are numerical representations of text that capture semantic meaning. They:
- Convert text to dense vectors
- Enable semantic similarity comparison
- Form the basis for vector search

### Embedding Models
- **OpenAI**: High quality, but requires an API key and costs money
- **HuggingFace**: Free, open-source alternatives
  - all-mpnet-base-v2: higher quality, slower
  - all-MiniLM-L6-v2: good quality, faster
""")
    with learn_tab3:
        st.markdown("""
### Vector Store Best Practices
1. **Chunk Size Selection**
   - Smaller chunks for precise retrieval
   - Larger chunks for more context
2. **Embedding Model Selection**
   - Consider the cost vs. quality tradeoff
   - Test different models for your use case
3. **Performance Optimization**
   - Use appropriate batch sizes
   - Consider hardware limitations
   - Monitor memory usage
4. **Search Optimization**
   - Experiment with different k values
   - Use metadata filtering when available
   - Consider hybrid search approaches
""")
# Sidebar
st.sidebar.header("Instructions")
st.sidebar.markdown("""
1. **Upload Document**
   - Select file type
   - Upload your document
   - Process into chunks
2. **Create Vector Store**
   - Choose vector store type
   - Select embedding model
   - Configure settings
3. **Search**
   - Enter search query
   - Adjust number of results
   - Explore similar documents
""")