# LangChain Document Loader Explorer.
#
# A Streamlit app with three tabs: load a document (uploaded text/CSV/PDF file
# or a web page URL), split the loaded documents into chunks with
# CharacterTextSplitter, and read static reference material about loaders.
#
# Streamlit re-executes this script from the top on every widget interaction,
# so everything below runs once per rerun.

import contextlib
import os
import tempfile

import pandas as pd
import streamlit as st
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import (
    CSVLoader,
    PyPDFLoader,
    TextLoader,
    WebBaseLoader,
)

# Option shown in the selectbox for URL-based loading (no file upload).
_WEB_PAGE = "Web Page"

# Uploadable document type -> (file extension, loader class).
# One table drives the upload filter, the temp-file suffix and loader choice,
# so the three can never drift apart.
_FILE_LOADERS = {
    "Text File": ("txt", TextLoader),
    "CSV File": ("csv", CSVLoader),
    "PDF File": ("pdf", PyPDFLoader),
}

_INTRO_MD = """
This interactive app demonstrates different document loaders in LangChain.
Upload your documents and see how LangChain processes them!
"""

_OVERVIEW_MD = """
Document loaders in LangChain are tools that help you load documents from various sources into a format
that can be used by language models. They handle:

- Reading different file formats (TXT, PDF, CSV, etc.)
- Extracting text content
- Maintaining metadata
- Preprocessing for further operations
"""

_LOADER_TYPES_MD = """
| Loader Type | Use Case | Supported Formats |
|------------|-----------|-------------------|
| TextLoader | Plain text files | .txt |
| CSVLoader | Tabular data | .csv |
| PyPDFLoader | PDF documents | .pdf |
| WebBaseLoader | Web pages | URLs |
"""

_BEST_PRACTICES_MD = """
### Chunk Size Optimization
- **Large Chunks**: Better context but may hit token limits
- **Small Chunks**: More manageable but may lose context

### Chunk Overlap Strategy
- **Higher Overlap**: Better context preservation
- **Lower Overlap**: Reduced redundancy

### Performance Considerations
- Monitor memory usage with large files
- Consider batch processing for multiple documents
- Implement error handling for robust applications
"""

_QUICK_START_MD = """
### 🔍 Quick Start Guide

1. **Document Loading**
   - Select document type
   - Upload file or enter URL

2. **Processing**
   - Configure chunk settings
   - Process the document
   - Review results

3. **Learning**
   - Explore documentation
   - Understand best practices
   - Learn about different loaders
"""


def _load_uploaded_file(uploaded_file, doc_type):
    """Load an uploaded file with the loader registered for ``doc_type``.

    The loaders are constructed from a filesystem path, so the upload is
    written to a temporary file first. That file is removed again once
    loading has finished (successfully or not) so reruns do not pile up
    copies of the upload on disk.

    Args:
        uploaded_file: Object returned by ``st.file_uploader``; only its
            ``getvalue()`` bytes are used.
        doc_type: One of the keys of ``_FILE_LOADERS``.

    Returns:
        The list of documents returned by the loader's ``load()``.

    Raises:
        KeyError: If ``doc_type`` is not an uploadable document type.
        Exception: Whatever the underlying loader raises for unreadable input.
    """
    extension, loader_cls = _FILE_LOADERS[doc_type]

    # delete=False so the file can be reopened by path after this handle is
    # closed; cleanup is done explicitly in the ``finally`` below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=f".{extension}") as tmp_file:
        tmp_file.write(uploaded_file.getvalue())
        tmp_file_path = tmp_file.name

    try:
        return loader_cls(tmp_file_path).load()
    finally:
        # Best-effort cleanup: a failure to delete must not mask the result
        # (or the real error) of loading.
        with contextlib.suppress(OSError):
            os.unlink(tmp_file_path)


st.set_page_config(page_title="🦜 LangChain Document Loader Explorer", layout="wide")

st.title("🦜 LangChain Document Loader Explorer")
st.markdown(_INTRO_MD)

# Create main tabs
tab1, tab2, tab3 = st.tabs(["📄 Document Loading", "⚙️ Processing", "📚 Learning Center"])

# Documents loaded during this rerun; stays None when nothing was loaded or
# loading failed, which the Processing tab uses to decide what to show.
docs = None

with tab1:
    # Document Loading Section
    st.header("Document Loader")
    doc_type = st.selectbox(
        "Select Document Type",
        [*_FILE_LOADERS, _WEB_PAGE],
        help="Choose the type of document you want to load",
    )

    if doc_type == _WEB_PAGE:
        url = st.text_input("Enter webpage URL:", "https://python.langchain.com/docs/")
        if url:
            # Broad catch is deliberate: this is the UI boundary and any
            # failure should be reported in the page, not crash the app.
            try:
                docs = WebBaseLoader(url).load()
                st.success("Webpage loaded successfully!")
            except Exception as e:
                st.error(f"Error loading webpage: {str(e)}")
    else:
        uploaded_file = st.file_uploader(
            "Upload your document",
            type=[_FILE_LOADERS[doc_type][0]],
        )
        if uploaded_file:
            try:
                docs = _load_uploaded_file(uploaded_file, doc_type)
                st.success("Document loaded successfully!")
            except Exception as e:
                st.error(f"Error loading document: {str(e)}")

with tab2:
    # Processing Section
    st.header("Document Processing")

    if docs is None:
        st.info("Please load a document in the Document Loading tab first.")
    else:
        # Create processing subtabs
        proc_tab1, proc_tab2 = st.tabs(["Settings", "Results"])

        with proc_tab1:
            st.subheader("Text Splitting Configuration")
            chunk_size = st.slider("Chunk Size", 100, 2000, 1000, 100)
            chunk_overlap = st.slider("Chunk Overlap", 0, 500, 200, 50)
            process_button = st.button("Process Document")

        with proc_tab2:
            if process_button:
                if chunk_overlap > chunk_size:
                    # The sliders allow this combination, but the splitter
                    # rejects it; report it instead of showing a traceback.
                    st.error(
                        "Chunk overlap cannot be larger than chunk size. "
                        "Please adjust the values in the Settings tab."
                    )
                else:
                    text_splitter = CharacterTextSplitter(
                        chunk_size=chunk_size,
                        chunk_overlap=chunk_overlap,
                    )
                    splits = text_splitter.split_documents(docs)

                    st.subheader("Processing Summary")
                    metrics_col1, metrics_col2 = st.columns(2)
                    with metrics_col1:
                        st.metric("Number of Documents", len(docs))
                    with metrics_col2:
                        st.metric("Number of Chunks", len(splits))

                    # Only the first three chunks are previewed.
                    st.subheader("Document Chunks Preview")
                    for i, chunk in enumerate(splits[:3]):
                        with st.expander(f"Chunk {i+1}"):
                            st.write(chunk.page_content)

with tab3:
    # Learning Center Section
    learning_tab1, learning_tab2, learning_tab3 = st.tabs(
        ["Overview", "Loader Types", "Best Practices"]
    )

    with learning_tab1:
        st.header("What are Document Loaders?")
        st.markdown(_OVERVIEW_MD)

    with learning_tab2:
        st.header("Document Loader Types")
        st.markdown(_LOADER_TYPES_MD)

    with learning_tab3:
        st.header("Tips and Best Practices")
        st.markdown(_BEST_PRACTICES_MD)

# Sidebar with instructions
st.sidebar.markdown(_QUICK_START_MD)

# Footer
st.sidebar.markdown("---")
st.sidebar.markdown("Made with ❤️ using LangChain 0.3")