DexterSptizu committed
Commit df9559b · verified
1 Parent(s): de35f7c

Create app.py

Files changed (1)
  1. app.py +161 -0
app.py ADDED
@@ -0,0 +1,161 @@
+ import streamlit as st
+ from langchain_community.document_loaders import TextLoader, CSVLoader, PyPDFLoader, WebBaseLoader
+ from langchain.text_splitter import CharacterTextSplitter
+ import tempfile
+ import pandas as pd
+
+ st.set_page_config(page_title="🦜 LangChain Document Loader Explorer", layout="wide")
+
+ st.title("🦜 LangChain Document Loader Explorer")
+ st.markdown("""
+ This interactive app demonstrates different document loaders in LangChain. Upload your documents and see how LangChain processes them!
+ """)
+
+ # Create main tabs
+ tab1, tab2, tab3 = st.tabs(["📄 Document Loading", "⚙️ Processing", "📚 Learning Center"])
+
+ with tab1:
+     # Document Loading Section
+     st.header("Document Loader")
+
+     doc_type = st.selectbox(
+         "Select Document Type",
+         ["Text File", "CSV File", "PDF File", "Web Page"],
+         help="Choose the type of document you want to load"
+     )
+
+     if doc_type == "Web Page":
+         url = st.text_input("Enter webpage URL:", "https://python.langchain.com/docs/")
+         if url:
+             try:
+                 loader = WebBaseLoader(url)
+                 docs = loader.load()
+                 st.success("Webpage loaded successfully!")
+             except Exception as e:
+                 st.error(f"Error loading webpage: {str(e)}")
+     else:
+         uploaded_file = st.file_uploader("Upload your document", type={
+             "Text File": ["txt"],
+             "CSV File": ["csv"],
+             "PDF File": ["pdf"]
+         }[doc_type])
+
+         if uploaded_file:
+             with tempfile.NamedTemporaryFile(delete=False, suffix=f".{doc_type.lower().split()[0]}") as tmp_file:
+                 tmp_file.write(uploaded_file.getvalue())
+                 tmp_file_path = tmp_file.name
+
+             try:
+                 if doc_type == "Text File":
+                     loader = TextLoader(tmp_file_path)
+                 elif doc_type == "CSV File":
+                     loader = CSVLoader(tmp_file_path)
+                 elif doc_type == "PDF File":
+                     loader = PyPDFLoader(tmp_file_path)
+
+                 docs = loader.load()
+                 st.success("Document loaded successfully!")
+             except Exception as e:
+                 st.error(f"Error loading document: {str(e)}")
+
+ with tab2:
+     # Processing Section
+     st.header("Document Processing")
+
+     if 'docs' in locals():
+         # Create processing subtabs
+         proc_tab1, proc_tab2 = st.tabs(["Settings", "Results"])
+
+         with proc_tab1:
+             st.subheader("Text Splitting Configuration")
+             chunk_size = st.slider("Chunk Size", 100, 2000, 1000, 100)
+             chunk_overlap = st.slider("Chunk Overlap", 0, 500, 200, 50)
+             process_button = st.button("Process Document")
+
+         with proc_tab2:
+             if process_button:
+                 text_splitter = CharacterTextSplitter(
+                     chunk_size=chunk_size,
+                     chunk_overlap=chunk_overlap
+                 )
+
+                 splits = text_splitter.split_documents(docs)
+
+                 st.subheader("Processing Summary")
+                 metrics_col1, metrics_col2 = st.columns(2)
+                 with metrics_col1:
+                     st.metric("Number of Documents", len(docs))
+                 with metrics_col2:
+                     st.metric("Number of Chunks", len(splits))
+
+                 st.subheader("Document Chunks Preview")
+                 for i, chunk in enumerate(splits[:3]):
+                     with st.expander(f"Chunk {i+1}"):
+                         st.write(chunk.page_content)
+     else:
+         st.info("Please load a document in the Document Loading tab first.")
+
+ with tab3:
+     # Learning Center Section
+     learning_tab1, learning_tab2, learning_tab3 = st.tabs(["Overview", "Loader Types", "Best Practices"])
+
+     with learning_tab1:
+         st.header("What are Document Loaders?")
+         st.markdown("""
+         Document loaders in LangChain are tools that help you load documents from various sources into a format that can be used by language models. They handle:
+         - Reading different file formats (TXT, PDF, CSV, etc.)
+         - Extracting text content
+         - Maintaining metadata
+         - Preprocessing for further operations
+         """)
+
+     with learning_tab2:
+         st.header("Document Loader Types")
+         st.markdown("""
+         | Loader Type | Use Case | Supported Formats |
+         |------------|-----------|-------------------|
+         | TextLoader | Plain text files | .txt |
+         | CSVLoader | Tabular data | .csv |
+         | PyPDFLoader | PDF documents | .pdf |
+         | WebBaseLoader | Web pages | URLs |
+         """)
+
+     with learning_tab3:
+         st.header("Tips and Best Practices")
+         st.markdown("""
+         ### Chunk Size Optimization
+         - **Large Chunks**: Better context but may hit token limits
+         - **Small Chunks**: More manageable but may lose context
+
+         ### Chunk Overlap Strategy
+         - **Higher Overlap**: Better context preservation
+         - **Lower Overlap**: Reduced redundancy
+
+         ### Performance Considerations
+         - Monitor memory usage with large files
+         - Consider batch processing for multiple documents
+         - Implement error handling for robust applications
+         """)
+
+ # Sidebar with instructions
+ st.sidebar.markdown("""
+ ### 🔍 Quick Start Guide
+
+ 1. **Document Loading**
+    - Select document type
+    - Upload file or enter URL
+
+ 2. **Processing**
+    - Configure chunk settings
+    - Process the document
+    - Review results
+
+ 3. **Learning**
+    - Explore documentation
+    - Understand best practices
+    - Learn about different loaders
+ """)
+
+ # Footer
+ st.sidebar.markdown("---")
+ st.sidebar.markdown("Made with ❤️ using LangChain 0.3")
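
For readers who want the load-and-split pattern without the Streamlit UI, here is a minimal standalone sketch built from the same imports the app relies on. It assumes langchain, langchain-community, and pypdf are installed, and "sample.pdf" is a placeholder path, not a file from this repo; the app itself is started with `streamlit run app.py`.

# Minimal sketch: load a PDF and split it, mirroring what the app does.
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter

loader = PyPDFLoader("sample.pdf")   # placeholder path; PyPDFLoader yields one Document per page
docs = loader.load()

splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)  # same values as the app's slider defaults
splits = splitter.split_documents(docs)

for chunk in splits[:3]:             # preview, like the app's "Document Chunks Preview"
    print(len(chunk.page_content), chunk.metadata)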
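The Best Practices tab describes chunk size and overlap only qualitatively. The toy sketch below (illustrative values and made-up input, same CharacterTextSplitter class the app uses) makes the trade-off visible: consecutive chunks repeat a short tail of words, and that repetition is the overlap.

# Toy demonstration of chunk_size / chunk_overlap on synthetic text.
from langchain.text_splitter import CharacterTextSplitter

text = " ".join(f"w{i:03d}" for i in range(100))   # "w000 w001 ... w099"

splitter = CharacterTextSplitter(separator=" ", chunk_size=50, chunk_overlap=10)
chunks = splitter.split_text(text)

for c in chunks[:3]:
    print(repr(c))   # each chunk starts with the last few words of the previous one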
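One design note: the app detects a loaded document with `'docs' in locals()`, which works only because Streamlit re-executes the whole script (and therefore re-runs the loader) on every interaction. A common alternative, sketched below as a suggestion rather than a required change, is to persist the loaded documents in st.session_state.

# Sketch of an alternative to the 'docs' in locals() check.
# In the loading tab, after a successful load, the app would store:
#     st.session_state["docs"] = docs
# Then any later tab can read the cached value without re-loading:
import streamlit as st

docs = st.session_state.get("docs")   # None until a document has been loaded
if docs is None:
    st.info("Please load a document in the Document Loading tab first.")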