import os
import tempfile

import streamlit as st
from langchain_community.vectorstores import FAISS, Chroma, Pinecone
from langchain_community.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader, PyPDFLoader

# set_page_config must be the first Streamlit call in the script
st.set_page_config(page_title="🗃️ Vector Store Explorer", layout="wide")

# Initialize session state variables
if 'vectorstore' not in st.session_state:
    st.session_state.vectorstore = None
if 'documents' not in st.session_state:
    st.session_state.documents = None

st.title("🗃️ Vector Store Explorer")
st.markdown("""
Explore different vector stores and embeddings in LangChain.
Upload documents, create embeddings, and perform semantic search!
""")

# Main tabs
main_tab1, main_tab2, main_tab3 = st.tabs(
    ["📚 Document Processing", "🔍 Vector Store Operations", "📖 Learning Center"]
)

with main_tab1:
    col1, col2 = st.columns([1, 1])

    with col1:
        st.header("Document Upload")
        file_type = st.selectbox("Select File Type", ["Text", "PDF"])
        uploaded_file = st.file_uploader(
            "Upload your document",
            type=["txt", "pdf"],
            help="Upload a document to create vector embeddings"
        )

        if uploaded_file:
            try:
                # Persist the upload to a temp file so the loaders can read it from disk
                suffix = ".txt" if file_type == "Text" else ".pdf"
                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                    tmp_file.write(uploaded_file.getvalue())
                    tmp_file_path = tmp_file.name

                loader = TextLoader(tmp_file_path) if file_type == "Text" else PyPDFLoader(tmp_file_path)
                st.session_state.documents = loader.load()
                st.success("Document loaded successfully!")

                # Clean up temp file
                os.unlink(tmp_file_path)
            except Exception as e:
                st.error(f"Error loading document: {str(e)}")

    with col2:
        st.header("Text Processing")
        if st.session_state.documents:
            chunk_size = st.slider("Chunk Size", 100, 2000, 500)
            chunk_overlap = st.slider("Chunk Overlap", 0, 200, 50)

            if st.button("Process Text"):
                text_splitter = CharacterTextSplitter(
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_overlap
                )
                st.session_state.splits = text_splitter.split_documents(st.session_state.documents)
                st.success(f"Created {len(st.session_state.splits)} text chunks!")

                with st.expander("Preview Chunks"):
                    for i, chunk in enumerate(st.session_state.splits[:3]):
                        st.markdown(f"**Chunk {i+1}**")
                        st.write(chunk.page_content)
                        st.markdown("---")
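
# Note: CharacterTextSplitter splits on a single separator ("\n\n" by default),
# so chunks can exceed chunk_size when a paragraph is long. Below is a minimal
# sketch of an alternative the app could use instead (not what the UI above
# calls): RecursiveCharacterTextSplitter falls back through finer separators
# until chunks fit. The function name and defaults are illustrative.
def recursive_split(documents, chunk_size=500, chunk_overlap=50):
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""],
    )
    return splitter.split_documents(documents)
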
with main_tab2:
    if 'splits' in st.session_state:
        col1, col2 = st.columns([1, 1])

        with col1:
            st.header("Vector Store Configuration")
            vectorstore_type = st.selectbox(
                "Select Vector Store",
                ["FAISS", "Chroma"],
                help="Choose the vector store implementation"
            )
            embedding_type = st.selectbox(
                "Select Embeddings",
                ["OpenAI", "HuggingFace"],
                help="Choose the embedding model"
            )

            # embeddings stays None until the user supplies what the model needs
            embeddings = None
            if embedding_type == "OpenAI":
                api_key = st.text_input("OpenAI API Key", type="password")
                if api_key:
                    os.environ["OPENAI_API_KEY"] = api_key
                    embeddings = OpenAIEmbeddings()
            else:
                model_name = st.selectbox(
                    "Select HuggingFace Model",
                    ["sentence-transformers/all-mpnet-base-v2",
                     "sentence-transformers/all-MiniLM-L6-v2"]
                )
                embeddings = HuggingFaceEmbeddings(model_name=model_name)

            if st.button("Create Vector Store"):
                if embeddings is None:
                    st.warning("Please enter your OpenAI API key first.")
                else:
                    try:
                        with st.spinner("Creating vector store..."):
                            if vectorstore_type == "FAISS":
                                st.session_state.vectorstore = FAISS.from_documents(
                                    st.session_state.splits, embeddings
                                )
                            else:
                                st.session_state.vectorstore = Chroma.from_documents(
                                    st.session_state.splits, embeddings
                                )
                        st.success("Vector store created successfully!")
                    except Exception as e:
                        st.error(f"Error creating vector store: {str(e)}")

        with col2:
            st.header("Semantic Search")
            if st.session_state.vectorstore:
                query = st.text_input("Enter your search query")
                k = st.slider("Number of results", 1, 10, 3)

                if query:
                    with st.spinner("Searching..."):
                        results = st.session_state.vectorstore.similarity_search(query, k=k)

                    st.subheader("Search Results")
                    for i, doc in enumerate(results):
                        with st.expander(f"Result {i+1}"):
                            st.write(doc.page_content)
                            st.markdown("**Metadata:**")
                            st.json(doc.metadata)
    else:
        st.info("Process a document in the Document Processing tab first.")

with main_tab3:
    learn_tab1, learn_tab2, learn_tab3 = st.tabs(["Vector Stores", "Embeddings", "Best Practices"])

    with learn_tab1:
        st.markdown("""
### What are Vector Stores?
Vector stores are specialized databases that store and retrieve vector embeddings efficiently. They enable:
- Semantic search capabilities
- Similarity matching
- Efficient nearest neighbor search

### Available Vector Stores
| Store | Description | Best For |
|-------|-------------|----------|
| FAISS | In-memory, efficient similarity search | Local development, small-medium datasets |
| Chroma | Simple, persistent vector store | Local development, getting started |
| Pinecone | Managed vector database service | Production, large-scale deployments |
""")

    with learn_tab2:
        st.markdown("""
### Understanding Embeddings
Embeddings are numerical representations of text that capture semantic meaning. They:
- Convert text to dense vectors
- Enable semantic similarity comparison
- Form the basis for vector search

### Embedding Models
- **OpenAI**: High quality, but requires an API key and costs money
- **HuggingFace**: Free, open-source alternatives
  - all-mpnet-base-v2: High quality, slower
  - all-MiniLM-L6-v2: Good quality, faster
""")

    with learn_tab3:
        st.markdown("""
### Vector Store Best Practices
1. **Chunk Size Selection**
   - Smaller chunks for precise retrieval
   - Larger chunks for more context
2. **Embedding Model Selection**
   - Consider the cost vs. quality tradeoff
   - Test different models for your use case
3. **Performance Optimization**
   - Use appropriate batch sizes
   - Consider hardware limitations
   - Monitor memory usage
4. **Search Optimization**
   - Experiment with different k values
   - Use metadata filtering when available
   - Consider hybrid search approaches
""")

# Sidebar
st.sidebar.header("📋 Instructions")
st.sidebar.markdown("""
1. **Upload Document**
   - Select file type
   - Upload your document
   - Process into chunks
2. **Create Vector Store**
   - Choose vector store type
   - Select embedding model
   - Configure settings
3. **Search**
   - Enter search query
   - Adjust number of results
   - Explore similar documents
""")
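
# --- Optional sketches (helpers only; nothing below is wired into the UI) ----

# What "dense vectors" means concretely: a sentence becomes a fixed-length list
# of floats, and semantically similar sentences get nearby vectors. A small
# sketch using the same HuggingFaceEmbeddings class as the app;
# all-MiniLM-L6-v2 produces 384-dimensional vectors. Function name and sample
# text are illustrative.
def inspect_embedding(text="Vector stores enable semantic search."):
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector = embeddings.embed_query(text)
    return len(vector), vector[:5]  # e.g. (384, [first five floats])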
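
# FAISS keeps its index in memory, so it is lost on every Streamlit rerun.
# A minimal persistence sketch using FAISS's save_local/load_local API; the
# "faiss_index" directory name is just an example, and recent LangChain
# versions require allow_dangerous_deserialization=True to load the pickled
# docstore you saved yourself.
def save_faiss_index(vectorstore, path="faiss_index"):
    vectorstore.save_local(path)

def load_faiss_index(embeddings, path="faiss_index"):
    return FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)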
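
# The Learning Center table lists Pinecone for production use, but the UI only
# offers FAISS and Chroma. A minimal sketch of swapping Pinecone in, assuming
# the pre-v3 pinecone-client API that langchain_community's Pinecone wrapper
# expects; the index name, environment, and env-var names are placeholders.
def create_pinecone_store(splits, embeddings, index_name="vector-store-explorer"):
    import pinecone

    pinecone.init(
        api_key=os.environ["PINECONE_API_KEY"],          # placeholder env var
        environment=os.environ["PINECONE_ENVIRONMENT"],  # placeholder env var
    )
    return Pinecone.from_documents(splits, embeddings, index_name=index_name)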
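
# "Use metadata filtering when available": Chroma's similarity_search accepts
# a filter dict over document metadata (recent LangChain FAISS versions accept
# one too). A sketch assuming the "page" metadata key that PyPDFLoader attaches
# to each document; the function name and default are illustrative.
def search_on_page(vectorstore, query, page=0, k=3):
    return vectorstore.similarity_search(query, k=k, filter={"page": page})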