import os
import tempfile

import streamlit as st
from langchain_community.vectorstores import FAISS, Chroma
from langchain_community.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader, PyPDFLoader

# set_page_config must be the first Streamlit command in the script
st.set_page_config(page_title="Vector Store Explorer", layout="wide")

# Initialize session state variables
if 'vectorstore' not in st.session_state:
    st.session_state.vectorstore = None
if 'documents' not in st.session_state:
    st.session_state.documents = None
st.title("ποΈ Vector Store Explorer")
st.markdown("""
Explore different vector stores and embeddings in LangChain. Upload documents, create embeddings,
and perform semantic search!
""")
# Main tabs
main_tab1, main_tab2, main_tab3 = st.tabs(["Document Processing", "Vector Store Operations", "Learning Center"])
with main_tab1:
    col1, col2 = st.columns([1, 1])

    with col1:
        st.header("Document Upload")
        file_type = st.selectbox("Select File Type", ["Text", "PDF"])
        uploaded_file = st.file_uploader(
            "Upload your document",
            type=["txt", "pdf"],
            help="Upload a document to create vector embeddings"
        )

        if uploaded_file:
            try:
                # Use the proper extension (".txt", not ".text") for the temp file
                suffix = ".txt" if file_type == "Text" else ".pdf"
                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                    tmp_file.write(uploaded_file.getvalue())
                    tmp_file_path = tmp_file.name

                loader = TextLoader(tmp_file_path) if file_type == "Text" else PyPDFLoader(tmp_file_path)
                st.session_state.documents = loader.load()
                st.success("Document loaded successfully!")

                # Clean up temp file
                os.unlink(tmp_file_path)
            except Exception as e:
                st.error(f"Error loading document: {str(e)}")
    with col2:
        st.header("Text Processing")
        if st.session_state.documents:
            chunk_size = st.slider("Chunk Size", 100, 2000, 500)
            chunk_overlap = st.slider("Chunk Overlap", 0, 200, 50)

            if st.button("Process Text"):
                text_splitter = CharacterTextSplitter(
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_overlap
                )
                st.session_state.splits = text_splitter.split_documents(st.session_state.documents)
                st.success(f"Created {len(st.session_state.splits)} text chunks!")

                with st.expander("Preview Chunks"):
                    for i, chunk in enumerate(st.session_state.splits[:3]):
                        st.markdown(f"**Chunk {i+1}**")
                        st.write(chunk.page_content)
                        st.markdown("---")
with main_tab2:
    if 'splits' in st.session_state:
        col1, col2 = st.columns([1, 1])

        with col1:
            st.header("Vector Store Configuration")
            vectorstore_type = st.selectbox(
                "Select Vector Store",
                ["FAISS", "Chroma"],
                help="Choose the vector store implementation"
            )
            embedding_type = st.selectbox(
                "Select Embeddings",
                ["OpenAI", "HuggingFace"],
                help="Choose the embedding model"
            )

            # embeddings stays None until the user has supplied enough configuration
            embeddings = None
            if embedding_type == "OpenAI":
                api_key = st.text_input("OpenAI API Key", type="password")
                if api_key:
                    os.environ["OPENAI_API_KEY"] = api_key
                    embeddings = OpenAIEmbeddings()
            else:
                model_name = st.selectbox(
                    "Select HuggingFace Model",
                    ["sentence-transformers/all-mpnet-base-v2",
                     "sentence-transformers/all-MiniLM-L6-v2"]
                )
                embeddings = HuggingFaceEmbeddings(model_name=model_name)

            if st.button("Create Vector Store"):
                if embeddings is None:
                    st.warning("Please configure an embedding model first (OpenAI requires an API key).")
                else:
                    try:
                        with st.spinner("Creating vector store..."):
                            if vectorstore_type == "FAISS":
                                st.session_state.vectorstore = FAISS.from_documents(
                                    st.session_state.splits,
                                    embeddings
                                )
                            else:
                                st.session_state.vectorstore = Chroma.from_documents(
                                    st.session_state.splits,
                                    embeddings
                                )
                        st.success("Vector store created successfully!")
                    except Exception as e:
                        st.error(f"Error creating vector store: {str(e)}")
        with col2:
            st.header("Semantic Search")
            if st.session_state.vectorstore:
                query = st.text_input("Enter your search query")
                k = st.slider("Number of results", 1, 10, 3)

                if query:
                    with st.spinner("Searching..."):
                        results = st.session_state.vectorstore.similarity_search(query, k=k)

                    st.subheader("Search Results")
                    for i, doc in enumerate(results):
                        with st.expander(f"Result {i+1}"):
                            st.write(doc.page_content)
                            st.markdown("**Metadata:**")
                            st.json(doc.metadata)
with main_tab3:
    learn_tab1, learn_tab2, learn_tab3 = st.tabs(["Vector Stores", "Embeddings", "Best Practices"])

    with learn_tab1:
        st.markdown("""
### What are Vector Stores?
Vector stores are specialized databases that store and retrieve vector embeddings efficiently. They enable:
- Semantic search capabilities
- Similarity matching
- Efficient nearest-neighbor search

### Available Vector Stores
| Store | Description | Best For |
|-------|-------------|----------|
| FAISS | In-memory, efficient similarity search | Local development, small-to-medium datasets |
| Chroma | Simple, persistent vector store | Local development, getting started |
| Pinecone | Managed vector database service | Production, large-scale deployments |
""")
    with learn_tab2:
        st.markdown("""
### Understanding Embeddings
Embeddings are numerical representations of text that capture semantic meaning. They:
- Convert text to dense vectors
- Enable semantic similarity comparison
- Form the basis for vector search

### Embedding Models
- **OpenAI**: High quality, but requires an API key and costs money
- **HuggingFace**: Free, open-source alternatives
  - all-mpnet-base-v2: higher quality, slower
  - all-MiniLM-L6-v2: good quality, faster
""")
    with learn_tab3:
        st.markdown("""
### Vector Store Best Practices
1. **Chunk Size Selection**
   - Smaller chunks for precise retrieval
   - Larger chunks for more context
2. **Embedding Model Selection**
   - Consider the cost vs. quality tradeoff
   - Test different models for your use case
3. **Performance Optimization**
   - Use appropriate batch sizes
   - Consider hardware limitations
   - Monitor memory usage
4. **Search Optimization**
   - Experiment with different k values
   - Use metadata filtering when available
   - Consider hybrid search approaches
""")
# Sidebar
st.sidebar.header("Instructions")
st.sidebar.markdown("""
1. **Upload Document**
   - Select file type
   - Upload your document
   - Process into chunks
2. **Create Vector Store**
   - Choose vector store type
   - Select embedding model
   - Configure settings
3. **Search**
   - Enter search query
   - Adjust number of results
   - Explore similar documents
""")