import os
import tempfile

import streamlit as st
from langchain_community.document_loaders import (
    CSVLoader,
    PyPDFLoader,
    TextLoader,
    WebBaseLoader,
)
from langchain_text_splitters import CharacterTextSplitter

st.set_page_config(page_title="🦜 LangChain Document Loader Explorer", layout="wide")

st.title("🦜 LangChain Document Loader Explorer")
st.markdown("""
This interactive app demonstrates different document loaders in LangChain.
Upload your documents and see how LangChain processes them!
""")

tab1, tab2, tab3 = st.tabs(["📄 Document Loading", "⚙️ Processing", "📚 Learning Center"])

with tab1:
    st.header("Document Loader")

    doc_type = st.selectbox(
        "Select Document Type",
        ["Text File", "CSV File", "PDF File", "Web Page"],
        help="Choose the type of document you want to load",
    )

    if doc_type == "Web Page":
        url = st.text_input("Enter webpage URL:", "https://python.langchain.com/docs/")
        if url:
            try:
                loader = WebBaseLoader(url)
                st.session_state["docs"] = loader.load()
                st.success("Webpage loaded successfully!")
            except Exception as e:
                st.error(f"Error loading webpage: {e}")
    else:
        extensions = {"Text File": ["txt"], "CSV File": ["csv"], "PDF File": ["pdf"]}
        uploaded_file = st.file_uploader("Upload your document", type=extensions[doc_type])

        if uploaded_file:
            # Loaders expect a file path, so persist the upload to a temporary file.
            with tempfile.NamedTemporaryFile(
                delete=False, suffix=f".{extensions[doc_type][0]}"
            ) as tmp_file:
                tmp_file.write(uploaded_file.getvalue())
                tmp_file_path = tmp_file.name

            try:
                if doc_type == "Text File":
                    loader = TextLoader(tmp_file_path)
                elif doc_type == "CSV File":
                    loader = CSVLoader(tmp_file_path)
                elif doc_type == "PDF File":
                    loader = PyPDFLoader(tmp_file_path)

                st.session_state["docs"] = loader.load()
                st.success("Document loaded successfully!")
            except Exception as e:
                st.error(f"Error loading document: {e}")
            finally:
                # Clean up the temporary file once the loader has read it.
                os.remove(tmp_file_path)

with tab2:
    st.header("Document Processing")

    # Loaded documents are kept in session_state so they survive Streamlit reruns
    # (e.g. the rerun triggered by clicking the "Process Document" button).
    if "docs" in st.session_state:
        docs = st.session_state["docs"]

        proc_tab1, proc_tab2 = st.tabs(["Settings", "Results"])

        with proc_tab1:
            st.subheader("Text Splitting Configuration")
            chunk_size = st.slider("Chunk Size", 100, 2000, 1000, 100)
            chunk_overlap = st.slider("Chunk Overlap", 0, 500, 200, 50)
            process_button = st.button("Process Document")

        with proc_tab2:
            if process_button:
                text_splitter = CharacterTextSplitter(
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_overlap,
                )
                splits = text_splitter.split_documents(docs)

                st.subheader("Processing Summary")
                metrics_col1, metrics_col2 = st.columns(2)
                with metrics_col1:
                    st.metric("Number of Documents", len(docs))
                with metrics_col2:
                    st.metric("Number of Chunks", len(splits))

                st.subheader("Document Chunks Preview")
                for i, chunk in enumerate(splits[:3]):
                    with st.expander(f"Chunk {i + 1}"):
                        st.write(chunk.page_content)
    else:
        st.info("Please load a document in the Document Loading tab first.")

with tab3:
    learning_tab1, learning_tab2, learning_tab3 = st.tabs(
        ["Overview", "Loader Types", "Best Practices"]
    )

    with learning_tab1:
        st.header("What are Document Loaders?")
        st.markdown("""
Document loaders in LangChain load documents from a variety of sources into a
format that language models can work with. They handle:

- Reading different file formats (TXT, PDF, CSV, etc.)
- Extracting text content
- Maintaining metadata
- Preprocessing for further operations
""")

    with learning_tab2:
        st.header("Document Loader Types")
        st.markdown("""
| Loader Type   | Use Case         | Supported Formats |
|---------------|------------------|-------------------|
| TextLoader    | Plain text files | .txt              |
| CSVLoader     | Tabular data     | .csv              |
| PyPDFLoader   | PDF documents    | .pdf              |
| WebBaseLoader | Web pages        | URLs              |
""")

    with learning_tab3:
        st.header("Tips and Best Practices")
        st.markdown("""
### Chunk Size Optimization
- **Large chunks**: more context per chunk, but risk exceeding model token limits
- **Small chunks**: easier to manage, but surrounding context may be lost

### Chunk Overlap Strategy
- **Higher overlap**: better context preservation across chunk boundaries
- **Lower overlap**: less redundancy between chunks

### Performance Considerations
- Monitor memory usage with large files
- Consider batch processing for multiple documents
- Implement error handling for robust applications
""")

st.sidebar.markdown("""
### 🚀 Quick Start Guide

1. **Document Loading**
   - Select document type
   - Upload file or enter URL

2. **Processing**
   - Configure chunk settings
   - Process the document
   - Review results

3. **Learning**
   - Explore documentation
   - Understand best practices
   - Learn about different loaders
""")

st.sidebar.markdown("---")
st.sidebar.markdown("Made with ❤️ using LangChain 0.3")