DexterSptizu committed
Commit aacc0d9 • 1 Parent(s): 5eadd9a

Create app.py

Files changed (1)
1. app.py +215 -0
app.py ADDED
@@ -0,0 +1,215 @@
+ import streamlit as st
+ from langchain_community.vectorstores import FAISS, Chroma
+ from langchain_community.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain_community.document_loaders import TextLoader, PyPDFLoader
+ import tempfile
+ import os
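+ 
+ # Likely runtime dependencies for the imports above (assumed, not pinned by this commit):
+ # streamlit, langchain, langchain-community, faiss-cpu, chromadb, sentence-transformers,
+ # pypdf, and openai when the OpenAI embeddings are selected.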
+ 
+ # Initialize session state variables
+ if 'vectorstore' not in st.session_state:
+     st.session_state.vectorstore = None
+ if 'documents' not in st.session_state:
+     st.session_state.documents = None
+ 
+ st.set_page_config(page_title="🗃️ Vector Store Explorer", layout="wide")
+ st.title("🗃️ Vector Store Explorer")
+ st.markdown("""
+ Explore different vector stores and embeddings in LangChain. Upload documents, create embeddings,
+ and perform semantic search!
+ """)
+ 
+ # Main tabs
+ main_tab1, main_tab2, main_tab3 = st.tabs(["📚 Document Processing", "🔍 Vector Store Operations", "📖 Learning Center"])
+ 
+ with main_tab1:
+     col1, col2 = st.columns([1, 1])
+ 
+     with col1:
+         st.header("Document Upload")
+         file_type = st.selectbox("Select File Type", ["Text", "PDF"])
+         uploaded_file = st.file_uploader(
+             "Upload your document",
+             type=["txt", "pdf"],
+             help="Upload a document to create vector embeddings"
+         )
+ 
+         if uploaded_file:
+             try:
+                 # Write the upload to a temporary file so the LangChain loaders can read it from disk
+                 suffix = ".txt" if file_type == "Text" else ".pdf"
+                 with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
+                     tmp_file.write(uploaded_file.getvalue())
+                     tmp_file_path = tmp_file.name
+ 
+                 loader = TextLoader(tmp_file_path) if file_type == "Text" else PyPDFLoader(tmp_file_path)
+                 st.session_state.documents = loader.load()
+                 st.success("Document loaded successfully!")
+ 
+                 # Clean up temp file
+                 os.unlink(tmp_file_path)
+             except Exception as e:
+                 st.error(f"Error loading document: {str(e)}")
+ 
+     with col2:
+         st.header("Text Processing")
+         if st.session_state.documents:
+             chunk_size = st.slider("Chunk Size", 100, 2000, 500)
+             chunk_overlap = st.slider("Chunk Overlap", 0, 200, 50)
+ 
+             if st.button("Process Text"):
+                 text_splitter = CharacterTextSplitter(
+                     chunk_size=chunk_size,
+                     chunk_overlap=chunk_overlap
+                 )
+                 st.session_state.splits = text_splitter.split_documents(st.session_state.documents)
+                 st.success(f"Created {len(st.session_state.splits)} text chunks!")
+ 
+                 with st.expander("Preview Chunks"):
+                     for i, chunk in enumerate(st.session_state.splits[:3]):
+                         st.markdown(f"**Chunk {i+1}**")
+                         st.write(chunk.page_content)
+                         st.markdown("---")
+ 
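+ # Note on CharacterTextSplitter (behavioral detail worth knowing when tuning the sliders
+ # above): it splits on a separator ("\n\n" by default) and measures chunk_size in
+ # characters, so a single paragraph longer than chunk_size is kept intact and the
+ # resulting chunk can exceed the configured size.
+ 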
+ with main_tab2:
+     if 'splits' in st.session_state:
+         col1, col2 = st.columns([1, 1])
+ 
+         with col1:
+             st.header("Vector Store Configuration")
+ 
+             vectorstore_type = st.selectbox(
+                 "Select Vector Store",
+                 ["FAISS", "Chroma"],
+                 help="Choose the vector store implementation"
+             )
+ 
+             embedding_type = st.selectbox(
+                 "Select Embeddings",
+                 ["OpenAI", "HuggingFace"],
+                 help="Choose the embedding model"
+             )
+ 
+             embeddings = None
+             if embedding_type == "OpenAI":
+                 api_key = st.text_input("OpenAI API Key", type="password")
+                 if api_key:
+                     os.environ["OPENAI_API_KEY"] = api_key
+                     embeddings = OpenAIEmbeddings()
+             else:
+                 model_name = st.selectbox(
+                     "Select HuggingFace Model",
+                     ["sentence-transformers/all-mpnet-base-v2",
+                      "sentence-transformers/all-MiniLM-L6-v2"]
+                 )
+                 embeddings = HuggingFaceEmbeddings(model_name=model_name)
+ 
+             if st.button("Create Vector Store"):
+                 if embeddings is None:
+                     st.warning("Enter an OpenAI API key before creating the vector store.")
+                 else:
+                     try:
+                         with st.spinner("Creating vector store..."):
+                             # Embed every chunk and index it for nearest-neighbor search
+                             if vectorstore_type == "FAISS":
+                                 st.session_state.vectorstore = FAISS.from_documents(
+                                     st.session_state.splits,
+                                     embeddings
+                                 )
+                             else:
+                                 st.session_state.vectorstore = Chroma.from_documents(
+                                     st.session_state.splits,
+                                     embeddings
+                                 )
+                         st.success("Vector store created successfully!")
+                     except Exception as e:
+                         st.error(f"Error creating vector store: {str(e)}")
+ 
+         with col2:
+             st.header("Semantic Search")
+             if st.session_state.vectorstore:
+                 query = st.text_input("Enter your search query")
+                 k = st.slider("Number of results", 1, 10, 3)
+ 
+                 if query:
+                     with st.spinner("Searching..."):
+                         # Return the k chunks whose embeddings are closest to the query embedding
+                         results = st.session_state.vectorstore.similarity_search(query, k=k)
+ 
+                     st.subheader("Search Results")
+                     for i, doc in enumerate(results):
+                         with st.expander(f"Result {i+1}"):
+                             st.write(doc.page_content)
+                             st.markdown("**Metadata:**")
+                             st.json(doc.metadata)
+ 
+ with main_tab3:
+     learn_tab1, learn_tab2, learn_tab3 = st.tabs(["Vector Stores", "Embeddings", "Best Practices"])
+ 
+     with learn_tab1:
+         st.markdown("""
+ ### What are Vector Stores?
+ 
+ Vector stores are specialized databases that store and retrieve vector embeddings efficiently. They enable:
+ - Semantic search capabilities
+ - Similarity matching
+ - Efficient nearest neighbor search
+ 
+ ### Available Vector Stores
+ | Store | Description | Best For |
+ |-------|-------------|----------|
+ | FAISS | In-memory, efficient similarity search | Local development, small-medium datasets |
+ | Chroma | Simple, persistent vector store | Local development, getting started |
+ | Pinecone | Managed vector database service | Production, large-scale deployments |
+ """)
+ 
+     with learn_tab2:
+         st.markdown("""
+ ### Understanding Embeddings
+ 
+ Embeddings are numerical representations of text that capture semantic meaning. They:
+ - Convert text to dense vectors
+ - Enable semantic similarity comparison
+ - Form the basis for vector search
+ 
+ ### Embedding Models
+ - **OpenAI**: High quality, but requires an API key and costs money
+ - **HuggingFace**: Free, open-source alternatives
+     - all-mpnet-base-v2: High quality, slower
+     - all-MiniLM-L6-v2: Good quality, faster
+ """)
+ 
+     with learn_tab3:
+         st.markdown("""
+ ### Vector Store Best Practices
+ 
+ 1. **Chunk Size Selection**
+     - Smaller chunks for precise retrieval
+     - Larger chunks for more context
+ 
+ 2. **Embedding Model Selection**
+     - Consider cost vs. quality tradeoff
+     - Test different models for your use case
+ 
+ 3. **Performance Optimization**
+     - Use appropriate batch sizes
+     - Consider hardware limitations
+     - Monitor memory usage
+ 
+ 4. **Search Optimization**
+     - Experiment with different k values
+     - Use metadata filtering when available
+     - Consider hybrid search approaches
+ """)
+ 
+ # Sidebar
+ st.sidebar.header("📋 Instructions")
+ st.sidebar.markdown("""
+ 1. **Upload Document**
+     - Select file type
+     - Upload your document
+     - Process into chunks
+ 
+ 2. **Create Vector Store**
+     - Choose vector store type
+     - Select embedding model
+     - Configure settings
+ 
+ 3. **Search**
+     - Enter search query
+     - Adjust number of results
+     - Explore similar documents
+ """)