import streamlit as st
from langchain_community.document_loaders import TextLoader, CSVLoader, PyPDFLoader, WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter
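# Note: CharacterTextSplitter is also available from the standalone
# langchain_text_splitters package in recent LangChain releases.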
import tempfile
import pandas as pd

st.set_page_config(page_title="🦜 LangChain Document Loader Explorer", layout="wide")

st.title("🦜 LangChain Document Loader Explorer")
st.markdown("""
This interactive app demonstrates different document loaders in LangChain. Upload your documents and see how LangChain processes them!
""")

# Create main tabs
tab1, tab2, tab3 = st.tabs(["📄 Document Loading", "⚙️ Processing", "📚 Learning Center"])
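
# Each `with <tab>:` block below renders its widgets inside the corresponding tab container.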
with tab1:
    # Document Loading Section
    st.header("Document Loader")
    doc_type = st.selectbox(
        "Select Document Type",
        ["Text File", "CSV File", "PDF File", "Web Page"],
        help="Choose the type of document you want to load"
    )
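
    # WebBaseLoader fetches the page over HTTP and parses the HTML with BeautifulSoup,
    # so unreachable URLs and network errors surface in the except branch below.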
    if doc_type == "Web Page":
        url = st.text_input("Enter webpage URL:", "https://python.langchain.com/docs/")
        if url:
            try:
                loader = WebBaseLoader(url)
                docs = loader.load()
                st.success("Webpage loaded successfully!")
            except Exception as e:
                st.error(f"Error loading webpage: {str(e)}")
    else:
        uploaded_file = st.file_uploader("Upload your document", type={
            "Text File": ["txt"],
            "CSV File": ["csv"],
            "PDF File": ["pdf"]
        }[doc_type])

        if uploaded_file:
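            # TextLoader, CSVLoader and PyPDFLoader all take a filesystem path, while Streamlit
            # hands us an in-memory buffer, so persist the upload to a temporary file first.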
            with tempfile.NamedTemporaryFile(delete=False, suffix=f".{doc_type.lower().split()[0]}") as tmp_file:
                tmp_file.write(uploaded_file.getvalue())
                tmp_file_path = tmp_file.name

            try:
                if doc_type == "Text File":
                    loader = TextLoader(tmp_file_path)
                elif doc_type == "CSV File":
                    loader = CSVLoader(tmp_file_path)
                elif doc_type == "PDF File":
                    loader = PyPDFLoader(tmp_file_path)
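                # loader.load() returns a list of Documents: one per page for PDFs, one per
                # row for CSVs, and a single Document for a plain text file.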
                docs = loader.load()
                st.success("Document loaded successfully!")
            except Exception as e:
                st.error(f"Error loading document: {str(e)}")
with tab2:
    # Processing Section
    st.header("Document Processing")
    if 'docs' in locals():
        # Create processing subtabs
        proc_tab1, proc_tab2 = st.tabs(["Settings", "Results"])

        with proc_tab1:
            st.subheader("Text Splitting Configuration")
            chunk_size = st.slider("Chunk Size", 100, 2000, 1000, 100)
            chunk_overlap = st.slider("Chunk Overlap", 0, 500, 200, 50)
            process_button = st.button("Process Document")

        with proc_tab2:
            if process_button:
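                # CharacterTextSplitter cuts on a single separator ("\n\n" by default) and then
                # merges the pieces, so an individual chunk can exceed chunk_size when no
                # separator falls inside the window.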
                text_splitter = CharacterTextSplitter(
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_overlap
                )
                splits = text_splitter.split_documents(docs)

                st.subheader("Processing Summary")
                metrics_col1, metrics_col2 = st.columns(2)
                with metrics_col1:
                    st.metric("Number of Documents", len(docs))
                with metrics_col2:
                    st.metric("Number of Chunks", len(splits))

                st.subheader("Document Chunks Preview")
                for i, chunk in enumerate(splits[:3]):
                    with st.expander(f"Chunk {i+1}"):
                        st.write(chunk.page_content)
    else:
        st.info("Please load a document in the Document Loading tab first.")

with tab3:
    # Learning Center Section
    learning_tab1, learning_tab2, learning_tab3 = st.tabs(["Overview", "Loader Types", "Best Practices"])

    with learning_tab1:
        st.header("What are Document Loaders?")
        st.markdown("""
Document loaders in LangChain are tools that help you load documents from various sources into a format that can be used by language models. They handle:
- Reading different file formats (TXT, PDF, CSV, etc.)
- Extracting text content
- Maintaining metadata
- Preprocessing for further operations
""")

    with learning_tab2:
        st.header("Document Loader Types")
        st.markdown("""
| Loader Type | Use Case | Supported Formats |
|---------------|-------------------|-------------------|
| TextLoader | Plain text files | .txt |
| CSVLoader | Tabular data | .csv |
| PyPDFLoader | PDF documents | .pdf |
| WebBaseLoader | Web pages | URLs |
""")

    with learning_tab3:
        st.header("Tips and Best Practices")
        st.markdown("""
### Chunk Size Optimization
- **Large Chunks**: Better context but may hit token limits
- **Small Chunks**: More manageable but may lose context

### Chunk Overlap Strategy
- **Higher Overlap**: Better context preservation
- **Lower Overlap**: Reduced redundancy

### Performance Considerations
- Monitor memory usage with large files
- Consider batch processing for multiple documents
- Implement error handling for robust applications
""")

# Sidebar with instructions
st.sidebar.markdown("""
### 🚀 Quick Start Guide
1. **Document Loading**
   - Select document type
   - Upload file or enter URL
2. **Processing**
   - Configure chunk settings
   - Process the document
   - Review results
3. **Learning**
   - Explore documentation
   - Understand best practices
   - Learn about different loaders
""")
# Footer
st.sidebar.markdown("---")
st.sidebar.markdown("Made with β€οΈ using LangChain 0.3") |