import streamlit as st
from langchain_community.document_loaders import TextLoader, CSVLoader, PyPDFLoader, WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter
import tempfile
import pandas as pd
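# Note: PyPDFLoader relies on the `pypdf` package and WebBaseLoader on `beautifulsoup4`;
# both are assumed to be listed in the Space's requirements.txt alongside streamlit,
# langchain, langchain-community, and pandas.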
st.set_page_config(page_title="🦜 LangChain Document Loader Explorer", layout="wide")
st.title("🦜 LangChain Document Loader Explorer")
st.markdown("""
This interactive app demonstrates different document loaders in LangChain. Upload your documents and see how LangChain processes them!
""")
# Create main tabs
tab1, tab2, tab3 = st.tabs(["📄 Document Loading", "⚙️ Processing", "📚 Learning Center"])
with tab1:
    # Document Loading Section
    st.header("Document Loader")
    doc_type = st.selectbox(
        "Select Document Type",
        ["Text File", "CSV File", "PDF File", "Web Page"],
        help="Choose the type of document you want to load"
    )
    if doc_type == "Web Page":
        url = st.text_input("Enter webpage URL:", "https://python.langchain.com/docs/")
        if url:
            try:
                loader = WebBaseLoader(url)
                docs = loader.load()
                st.success("Webpage loaded successfully!")
            except Exception as e:
                st.error(f"Error loading webpage: {str(e)}")
    else:
        # Restrict the uploader to the extension matching the selected document type
        uploaded_file = st.file_uploader("Upload your document", type={
            "Text File": ["txt"],
            "CSV File": ["csv"],
            "PDF File": ["pdf"]
        }[doc_type])
        if uploaded_file:
            # Write the upload to a temporary file so the path-based loaders can read it
            with tempfile.NamedTemporaryFile(delete=False, suffix=f".{doc_type.lower().split()[0]}") as tmp_file:
                tmp_file.write(uploaded_file.getvalue())
                tmp_file_path = tmp_file.name
            try:
                if doc_type == "Text File":
                    loader = TextLoader(tmp_file_path)
                elif doc_type == "CSV File":
                    loader = CSVLoader(tmp_file_path)
                elif doc_type == "PDF File":
                    loader = PyPDFLoader(tmp_file_path)
                docs = loader.load()
                st.success("Document loaded successfully!")
            except Exception as e:
                st.error(f"Error loading document: {str(e)}")
with tab2:
    # Processing Section
    st.header("Document Processing")
    # `docs` is set at module scope by the loading tab during the same rerun
    if 'docs' in locals():
        # Create processing subtabs
        proc_tab1, proc_tab2 = st.tabs(["Settings", "Results"])
        with proc_tab1:
            st.subheader("Text Splitting Configuration")
            chunk_size = st.slider("Chunk Size", 100, 2000, 1000, 100)
            chunk_overlap = st.slider("Chunk Overlap", 0, 500, 200, 50)
            process_button = st.button("Process Document")
        with proc_tab2:
            if process_button:
                # CharacterTextSplitter splits on its default separator ("\n\n"),
                # so individual chunks can exceed chunk_size if no separator is found
                text_splitter = CharacterTextSplitter(
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_overlap
                )
                splits = text_splitter.split_documents(docs)
                st.subheader("Processing Summary")
                metrics_col1, metrics_col2 = st.columns(2)
                with metrics_col1:
                    st.metric("Number of Documents", len(docs))
                with metrics_col2:
                    st.metric("Number of Chunks", len(splits))
                st.subheader("Document Chunks Preview")
                for i, chunk in enumerate(splits[:3]):
                    with st.expander(f"Chunk {i+1}"):
                        st.write(chunk.page_content)
    else:
        st.info("Please load a document in the Document Loading tab first.")
with tab3:
    # Learning Center Section
    learning_tab1, learning_tab2, learning_tab3 = st.tabs(["Overview", "Loader Types", "Best Practices"])
    with learning_tab1:
        st.header("What are Document Loaders?")
        st.markdown("""
        Document loaders in LangChain are tools that help you load documents from various sources into a format that can be used by language models. They handle:
        - Reading different file formats (TXT, PDF, CSV, etc.)
        - Extracting text content
        - Maintaining metadata
        - Preprocessing for further operations
        """)
    with learning_tab2:
        st.header("Document Loader Types")
        st.markdown("""
        | Loader Type | Use Case | Supported Formats |
        |------------|-----------|-------------------|
        | TextLoader | Plain text files | .txt |
        | CSVLoader | Tabular data | .csv |
        | PyPDFLoader | PDF documents | .pdf |
        | WebBaseLoader | Web pages | URLs |
        """)
    with learning_tab3:
        st.header("Tips and Best Practices")
        st.markdown("""
        ### Chunk Size Optimization
        - **Large Chunks**: Better context but may hit token limits
        - **Small Chunks**: More manageable but may lose context

        ### Chunk Overlap Strategy
        - **Higher Overlap**: Better context preservation
        - **Lower Overlap**: Reduced redundancy

        ### Performance Considerations
        - Monitor memory usage with large files
        - Consider batch processing for multiple documents
        - Implement error handling for robust applications
        """)
# Sidebar with instructions
st.sidebar.markdown("""
### 🔍 Quick Start Guide
1. **Document Loading**
- Select document type
- Upload file or enter URL
2. **Processing**
- Configure chunk settings
- Process the document
- Review results
3. **Learning**
- Explore documentation
- Understand best practices
- Learn about different loaders
""")
# Footer
st.sidebar.markdown("---")
st.sidebar.markdown("Made with ❀️ using LangChain 0.3")