import streamlit as st
from langchain_community.document_loaders import TextLoader, CSVLoader, PyPDFLoader, WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter
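# Note: CharacterTextSplitter is also available from the standalone
# langchain_text_splitters package in recent LangChain releases.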
import tempfile
import pandas as pd

st.set_page_config(page_title="🦜 LangChain Document Loader Explorer", layout="wide")

st.title("🦜 LangChain Document Loader Explorer")
st.markdown("""
This interactive app demonstrates different document loaders in LangChain. Upload your documents and see how LangChain processes them!
""")

# Create main tabs
tab1, tab2, tab3 = st.tabs(["📄 Document Loading", "⚙️ Processing", "📚 Learning Center"])
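
# Each `with <tab>:` block below renders its widgets inside the corresponding tab container.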
with tab1:
    # Document Loading Section
    st.header("Document Loader")
    doc_type = st.selectbox(
        "Select Document Type",
        ["Text File", "CSV File", "PDF File", "Web Page"],
        help="Choose the type of document you want to load"
    )
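
    # WebBaseLoader fetches the page over HTTP and parses the HTML with BeautifulSoup,
    # so unreachable URLs and network errors surface in the except branch below.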
    if doc_type == "Web Page":
        url = st.text_input("Enter webpage URL:", "https://python.langchain.com/docs/")
        if url:
            try:
                loader = WebBaseLoader(url)
                docs = loader.load()
                st.success("Webpage loaded successfully!")
            except Exception as e:
                st.error(f"Error loading webpage: {str(e)}")
    else:
        uploaded_file = st.file_uploader("Upload your document", type={
            "Text File": ["txt"],
            "CSV File": ["csv"],
            "PDF File": ["pdf"]
        }[doc_type])

        if uploaded_file:
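            # TextLoader, CSVLoader and PyPDFLoader all take a filesystem path, while Streamlit
            # hands us an in-memory buffer, so persist the upload to a temporary file first.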
            with tempfile.NamedTemporaryFile(delete=False, suffix=f".{doc_type.lower().split()[0]}") as tmp_file:
                tmp_file.write(uploaded_file.getvalue())
                tmp_file_path = tmp_file.name

            try:
                if doc_type == "Text File":
                    loader = TextLoader(tmp_file_path)
                elif doc_type == "CSV File":
                    loader = CSVLoader(tmp_file_path)
                elif doc_type == "PDF File":
                    loader = PyPDFLoader(tmp_file_path)
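                # loader.load() returns a list of Documents: one per page for PDFs, one per
                # row for CSVs, and a single Document for a plain text file.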
                docs = loader.load()
                st.success("Document loaded successfully!")
            except Exception as e:
                st.error(f"Error loading document: {str(e)}")
with tab2:
    # Processing Section
    st.header("Document Processing")
    if 'docs' in locals():
        # Create processing subtabs
        proc_tab1, proc_tab2 = st.tabs(["Settings", "Results"])

        with proc_tab1:
            st.subheader("Text Splitting Configuration")
            chunk_size = st.slider("Chunk Size", 100, 2000, 1000, 100)
            chunk_overlap = st.slider("Chunk Overlap", 0, 500, 200, 50)
            process_button = st.button("Process Document")

        with proc_tab2:
            if process_button:
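                # CharacterTextSplitter cuts on a single separator ("\n\n" by default) and then
                # merges the pieces, so an individual chunk can exceed chunk_size when no
                # separator falls inside the window.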
                text_splitter = CharacterTextSplitter(
                    chunk_size=chunk_size,
                    chunk_overlap=chunk_overlap
                )
                splits = text_splitter.split_documents(docs)

                st.subheader("Processing Summary")
                metrics_col1, metrics_col2 = st.columns(2)
                with metrics_col1:
                    st.metric("Number of Documents", len(docs))
                with metrics_col2:
                    st.metric("Number of Chunks", len(splits))

                st.subheader("Document Chunks Preview")
                for i, chunk in enumerate(splits[:3]):
                    with st.expander(f"Chunk {i+1}"):
                        st.write(chunk.page_content)
    else:
        st.info("Please load a document in the Document Loading tab first.")

with tab3:
    # Learning Center Section
    learning_tab1, learning_tab2, learning_tab3 = st.tabs(["Overview", "Loader Types", "Best Practices"])

    with learning_tab1:
        st.header("What are Document Loaders?")
        st.markdown("""
Document loaders in LangChain are tools that help you load documents from various sources into a format that can be used by language models. They handle:
- Reading different file formats (TXT, PDF, CSV, etc.)
- Extracting text content
- Maintaining metadata
- Preprocessing for further operations
""")

    with learning_tab2:
        st.header("Document Loader Types")
        st.markdown("""
| Loader Type | Use Case | Supported Formats |
|---------------|-------------------|-------------------|
| TextLoader | Plain text files | .txt |
| CSVLoader | Tabular data | .csv |
| PyPDFLoader | PDF documents | .pdf |
| WebBaseLoader | Web pages | URLs |
""")

    with learning_tab3:
        st.header("Tips and Best Practices")
        st.markdown("""
### Chunk Size Optimization
- **Large Chunks**: Better context but may hit token limits
- **Small Chunks**: More manageable but may lose context

### Chunk Overlap Strategy
- **Higher Overlap**: Better context preservation
- **Lower Overlap**: Reduced redundancy

### Performance Considerations
- Monitor memory usage with large files
- Consider batch processing for multiple documents
- Implement error handling for robust applications
""")

# Sidebar with instructions
st.sidebar.markdown("""
### 🚀 Quick Start Guide
1. **Document Loading**
   - Select document type
   - Upload file or enter URL
2. **Processing**
   - Configure chunk settings
   - Process the document
   - Review results
3. **Learning**
   - Explore documentation
   - Understand best practices
   - Learn about different loaders
""")
# Footer
st.sidebar.markdown("---")
st.sidebar.markdown("Made with β€οΈ using LangChain 0.3") |