Spaces:

DexterSptizu
/

langchain-vector-stores

Running

App Files Files Community

langchain-vector-stores / app.py

DexterSptizu

Create app.py

aacc0d9 verified about 23 hours ago

raw

history blame

8.39 kB

	import streamlit as st
	from langchain_community.vectorstores import FAISS, Chroma, Pinecone
	from langchain_community.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
	from langchain.text_splitter import CharacterTextSplitter
	from langchain_community.document_loaders import TextLoader, PyPDFLoader
	import tempfile
	import os
	import torch

	# Seed the per-session slots this script reads later. Keys that already
	# exist (from an earlier rerun) are left untouched.
	for _state_key in ('vectorstore', 'documents'):
	    if _state_key not in st.session_state:
	        st.session_state[_state_key] = None

	# Page chrome: browser-tab title, wide layout, visible heading and intro text.
	st.set_page_config(
	    page_title="🗃️ Vector Store Explorer",
	    layout="wide",
	)
	st.title("🗃️ Vector Store Explorer")
	_intro_text = """
	Explore different vector stores and embeddings in LangChain. Upload documents, create embeddings,
	and perform semantic search!
	"""
	st.markdown(_intro_text)

	# Main tabs: one container per workflow stage, populated by the `with` blocks below.
	_tab_labels = ["📚 Document Processing", "🔍 Vector Store Operations", "📖 Learning Center"]
	main_tab1, main_tab2, main_tab3 = st.tabs(_tab_labels)

	with main_tab1:
	    # Document Processing tab: upload + load on the left, chunking on the right.
	    # Results are kept in st.session_state ("documents", "splits") so the
	    # other tabs can read them on later reruns.
	    col1, col2 = st.columns([1, 1])

	    with col1:
	        st.header("Document Upload")
	        file_type = st.selectbox("Select File Type", ["Text", "PDF"])
	        # Offer only the extension that matches the selected type, so the
	        # loader picked below always agrees with the uploaded file.
	        uploaded_file = st.file_uploader(
	            "Upload your document",
	            type=["txt"] if file_type == "Text" else ["pdf"],
	            help="Upload a document to create vector embeddings"
	        )

	        if uploaded_file:
	            tmp_file_path = None
	            try:
	                # The loaders are given a filesystem path, so the upload is
	                # written to a temp file that survives the `with` (delete=False).
	                with tempfile.NamedTemporaryFile(delete=False, suffix=f".{file_type.lower()}") as tmp_file:
	                    # Record the path first so cleanup still runs if the write fails.
	                    tmp_file_path = tmp_file.name
	                    tmp_file.write(uploaded_file.getvalue())

	                loader = TextLoader(tmp_file_path) if file_type == "Text" else PyPDFLoader(tmp_file_path)
	                st.session_state.documents = loader.load()
	                st.success("Document loaded successfully!")
	            except Exception as e:
	                # UI boundary: report the failure in the page instead of crashing the run.
	                st.error(f"Error loading document: {str(e)}")
	            finally:
	                # Clean up the temp file on success AND on failure.
	                if tmp_file_path is not None and os.path.exists(tmp_file_path):
	                    os.unlink(tmp_file_path)

	    with col2:
	        st.header("Text Processing")
	        if st.session_state.documents:
	            chunk_size = st.slider("Chunk Size", 100, 2000, 500)
	            # The splitter rejects an overlap larger than the chunk size, so
	            # cap the slider's maximum at the currently selected chunk size.
	            chunk_overlap = st.slider("Chunk Overlap", 0, min(200, chunk_size), 50)

	            if st.button("Process Text"):
	                text_splitter = CharacterTextSplitter(
	                    chunk_size=chunk_size,
	                    chunk_overlap=chunk_overlap
	                )
	                st.session_state.splits = text_splitter.split_documents(st.session_state.documents)
	                st.success(f"Created {len(st.session_state.splits)} text chunks!")

	                # Show at most the first three chunks as a quick sanity check.
	                with st.expander("Preview Chunks"):
	                    for i, chunk in enumerate(st.session_state.splits[:3]):
	                        st.markdown(f"Chunk {i+1}")
	                        st.write(chunk.page_content)
	                        st.markdown("---")

	with main_tab2:
	    # Vector Store Operations tab: configure + build the store on the left,
	    # run similarity searches against it on the right. Requires the chunks
	    # stored under st.session_state.splits.
	    if 'splits' in st.session_state:
	        col1, col2 = st.columns([1, 1])

	        with col1:
	            st.header("Vector Store Configuration")

	            vectorstore_type = st.selectbox(
	                "Select Vector Store",
	                ["FAISS", "Chroma"],
	                help="Choose the vector store implementation"
	            )

	            embedding_type = st.selectbox(
	                "Select Embeddings",
	                ["OpenAI", "HuggingFace"],
	                help="Choose the embedding model"
	            )

	            # Only collect the settings here; the embedding object itself is
	            # built when the button is pressed, not on every rerun.
	            api_key = None
	            model_name = None
	            if embedding_type == "OpenAI":
	                api_key = st.text_input("OpenAI API Key", type="password")
	            else:
	                model_name = st.selectbox(
	                    "Select HuggingFace Model",
	                    ["sentence-transformers/all-mpnet-base-v2",
	                     "sentence-transformers/all-MiniLM-L6-v2"]
	                )

	            if st.button("Create Vector Store"):
	                if embedding_type == "OpenAI" and not api_key:
	                    # Without this guard there is no embedding model to use.
	                    st.error("Please enter your OpenAI API key before creating the vector store.")
	                else:
	                    try:
	                        with st.spinner("Creating vector store..."):
	                            if embedding_type == "OpenAI":
	                                # Pass the key directly rather than via os.environ,
	                                # which is process-wide and shared by all sessions.
	                                embeddings = OpenAIEmbeddings(openai_api_key=api_key)
	                            else:
	                                embeddings = HuggingFaceEmbeddings(model_name=model_name)

	                            if vectorstore_type == "FAISS":
	                                st.session_state.vectorstore = FAISS.from_documents(
	                                    st.session_state.splits,
	                                    embeddings
	                                )
	                            else:
	                                st.session_state.vectorstore = Chroma.from_documents(
	                                    st.session_state.splits,
	                                    embeddings
	                                )
	                        st.success("Vector store created successfully!")
	                    except Exception as e:
	                        # UI boundary: show the failure in the page instead of crashing the run.
	                        st.error(f"Error creating vector store: {str(e)}")

	        with col2:
	            st.header("Semantic Search")
	            if st.session_state.vectorstore:
	                query = st.text_input("Enter your search query")
	                k = st.slider("Number of results", 1, 10, 3)

	                if query:
	                    with st.spinner("Searching..."):
	                        results = st.session_state.vectorstore.similarity_search(query, k=k)

	                    st.subheader("Search Results")
	                    for i, doc in enumerate(results):
	                        with st.expander(f"Result {i+1}"):
	                            st.write(doc.page_content)
	                            st.markdown("Metadata:")
	                            st.json(doc.metadata)
	    else:
	        # Nothing has been chunked yet; tell the user what to do instead of
	        # rendering an empty tab.
	        st.info("Upload a document and click \"Process Text\" in the Document Processing tab first.")

	with main_tab3:
	    # Learning Center tab: static reference text split over three sub-tabs.
	    learn_tab1, learn_tab2, learn_tab3 = st.tabs(["Vector Stores", "Embeddings", "Best Practices"])

	    with learn_tab1:
	        # Table rows use plain `|` delimiters: a backslash-escaped pipe is an
	        # invalid Python escape sequence and stops the Markdown table rendering.
	        st.markdown("""
	### What are Vector Stores?

	Vector stores are specialized databases that store and retrieve vector embeddings efficiently. They enable:
	- Semantic search capabilities
	- Similarity matching
	- Efficient nearest neighbor search

	### Available Vector Stores
	| Store | Description | Best For |
	|-------|-------------|----------|
	| FAISS | In-memory, efficient similarity search | Local development, small-medium datasets |
	| Chroma | Simple, persistent vector store | Local development, getting started |
	| Pinecone | Managed vector database service | Production, large-scale deployments |
	""")

	    with learn_tab2:
	        st.markdown("""
	### Understanding Embeddings

	Embeddings are numerical representations of text that capture semantic meaning. They:
	- Convert text to dense vectors
	- Enable semantic similarity comparison
	- Form the basis for vector search

	### Embedding Models
	- OpenAI: High quality, but requires API key and costs money
	- HuggingFace: Free, open-source alternatives
	- all-mpnet-base-v2: High quality, slower
	- all-MiniLM-L6-v2: Good quality, faster
	""")

	    with learn_tab3:
	        st.markdown("""
	### Vector Store Best Practices

	1. Chunk Size Selection
	- Smaller chunks for precise retrieval
	- Larger chunks for more context

	2. Embedding Model Selection
	- Consider cost vs. quality tradeoff
	- Test different models for your use case

	3. Performance Optimization
	- Use appropriate batch sizes
	- Consider hardware limitations
	- Monitor memory usage

	4. Search Optimization
	- Experiment with different k values
	- Use metadata filtering when available
	- Consider hybrid search approaches
	""")

	# Sidebar: a static three-step walkthrough of the app's workflow.
	_sidebar_steps = """
	1. Upload Document
	- Select file type
	- Upload your document
	- Process into chunks

	2. Create Vector Store
	- Choose vector store type
	- Select embedding model
	- Configure settings

	3. Search
	- Enter search query
	- Adjust number of results
	- Explore similar documents
	"""
	with st.sidebar:
	    st.header("📋 Instructions")
	    st.markdown(_sidebar_steps)