DexterSptizu committed
Commit df9559b · verified
1 Parent(s): de35f7c

Create app.py

Files changed (1)
  1. app.py +161 -0
app.py ADDED
@@ -0,0 +1,161 @@
+ import streamlit as st
+ from langchain_community.document_loaders import TextLoader, CSVLoader, PyPDFLoader, WebBaseLoader
+ from langchain.text_splitter import CharacterTextSplitter
+ import tempfile
+ import pandas as pd
+
+ st.set_page_config(page_title="🦜 LangChain Document Loader Explorer", layout="wide")
+
+ st.title("🦜 LangChain Document Loader Explorer")
+ st.markdown("""
+ This interactive app demonstrates different document loaders in LangChain. Upload your documents and see how LangChain processes them!
+ """)
+
+ # Create main tabs
+ tab1, tab2, tab3 = st.tabs(["📄 Document Loading", "⚙️ Processing", "📚 Learning Center"])
+
+ with tab1:
+     # Document Loading Section
+     st.header("Document Loader")
+
+     doc_type = st.selectbox(
+         "Select Document Type",
+         ["Text File", "CSV File", "PDF File", "Web Page"],
+         help="Choose the type of document you want to load"
+     )
+
+     if doc_type == "Web Page":
+         url = st.text_input("Enter webpage URL:", "https://python.langchain.com/docs/")
+         if url:
+             try:
+                 loader = WebBaseLoader(url)
+                 docs = loader.load()
+                 st.success("Webpage loaded successfully!")
+             except Exception as e:
+                 st.error(f"Error loading webpage: {str(e)}")
+     else:
+         uploaded_file = st.file_uploader("Upload your document", type={
+             "Text File": ["txt"],
+             "CSV File": ["csv"],
+             "PDF File": ["pdf"]
+         }[doc_type])
+
+         if uploaded_file:
+             with tempfile.NamedTemporaryFile(delete=False, suffix=f".{doc_type.lower().split()[0]}") as tmp_file:
+                 tmp_file.write(uploaded_file.getvalue())
+                 tmp_file_path = tmp_file.name
+
+             try:
+                 if doc_type == "Text File":
+                     loader = TextLoader(tmp_file_path)
+                 elif doc_type == "CSV File":
+                     loader = CSVLoader(tmp_file_path)
+                 elif doc_type == "PDF File":
+                     loader = PyPDFLoader(tmp_file_path)
+
+                 docs = loader.load()
+                 st.success("Document loaded successfully!")
+             except Exception as e:
+                 st.error(f"Error loading document: {str(e)}")
+
+ with tab2:
+     # Processing Section
+     st.header("Document Processing")
+
+     if 'docs' in locals():
+         # Create processing subtabs
+         proc_tab1, proc_tab2 = st.tabs(["Settings", "Results"])
+
+         with proc_tab1:
+             st.subheader("Text Splitting Configuration")
+             chunk_size = st.slider("Chunk Size", 100, 2000, 1000, 100)
+             chunk_overlap = st.slider("Chunk Overlap", 0, 500, 200, 50)
+             process_button = st.button("Process Document")
+
+         with proc_tab2:
+             if process_button:
+                 text_splitter = CharacterTextSplitter(
+                     chunk_size=chunk_size,
+                     chunk_overlap=chunk_overlap
+                 )
+
+                 splits = text_splitter.split_documents(docs)
+
+                 st.subheader("Processing Summary")
+                 metrics_col1, metrics_col2 = st.columns(2)
+                 with metrics_col1:
+                     st.metric("Number of Documents", len(docs))
+                 with metrics_col2:
+                     st.metric("Number of Chunks", len(splits))
+
+                 st.subheader("Document Chunks Preview")
+                 for i, chunk in enumerate(splits[:3]):
+                     with st.expander(f"Chunk {i+1}"):
+                         st.write(chunk.page_content)
+     else:
+         st.info("Please load a document in the Document Loading tab first.")
+
+ with tab3:
+     # Learning Center Section
+     learning_tab1, learning_tab2, learning_tab3 = st.tabs(["Overview", "Loader Types", "Best Practices"])
+
+     with learning_tab1:
+         st.header("What are Document Loaders?")
+         st.markdown("""
+         Document loaders in LangChain are tools that help you load documents from various sources into a format that can be used by language models. They handle:
+         - Reading different file formats (TXT, PDF, CSV, etc.)
+         - Extracting text content
+         - Maintaining metadata
+         - Preprocessing for further operations
+         """)
+
+     with learning_tab2:
+         st.header("Document Loader Types")
+         st.markdown("""
+         | Loader Type | Use Case | Supported Formats |
+         |------------|-----------|-------------------|
+         | TextLoader | Plain text files | .txt |
+         | CSVLoader | Tabular data | .csv |
+         | PyPDFLoader | PDF documents | .pdf |
+         | WebBaseLoader | Web pages | URLs |
+         """)
+
+     with learning_tab3:
+         st.header("Tips and Best Practices")
+         st.markdown("""
+         ### Chunk Size Optimization
+         - **Large Chunks**: Better context but may hit token limits
+         - **Small Chunks**: More manageable but may lose context
+
+         ### Chunk Overlap Strategy
+         - **Higher Overlap**: Better context preservation
+         - **Lower Overlap**: Reduced redundancy
+
+         ### Performance Considerations
+         - Monitor memory usage with large files
+         - Consider batch processing for multiple documents
+         - Implement error handling for robust applications
+         """)
+
+ # Sidebar with instructions
+ st.sidebar.markdown("""
+ ### 🔍 Quick Start Guide
+
+ 1. **Document Loading**
+    - Select document type
+    - Upload file or enter URL
+
+ 2. **Processing**
+    - Configure chunk settings
+    - Process the document
+    - Review results
+
+ 3. **Learning**
+    - Explore documentation
+    - Understand best practices
+    - Learn about different loaders
+ """)
+
+ # Footer
+ st.sidebar.markdown("---")
+ st.sidebar.markdown("Made with ❤️ using LangChain 0.3")
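
For readers who want the load-and-split pattern without the Streamlit UI, here is a minimal standalone sketch built from the same imports the app relies on. It assumes langchain, langchain-community, and pypdf are installed, and "sample.pdf" is a placeholder path, not a file from this repo; the app itself is started with `streamlit run app.py`.

# Minimal sketch: load a PDF and split it, mirroring what the app does.
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter

loader = PyPDFLoader("sample.pdf")   # placeholder path; PyPDFLoader yields one Document per page
docs = loader.load()

splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)  # same values as the app's slider defaults
splits = splitter.split_documents(docs)

for chunk in splits[:3]:             # preview, like the app's "Document Chunks Preview"
    print(len(chunk.page_content), chunk.metadata)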
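The Best Practices tab describes chunk size and overlap only qualitatively. The toy sketch below (illustrative values and made-up input, same CharacterTextSplitter class the app uses) makes the trade-off visible: consecutive chunks repeat a short tail of words, and that repetition is the overlap.

# Toy demonstration of chunk_size / chunk_overlap on synthetic text.
from langchain.text_splitter import CharacterTextSplitter

text = " ".join(f"w{i:03d}" for i in range(100))   # "w000 w001 ... w099"

splitter = CharacterTextSplitter(separator=" ", chunk_size=50, chunk_overlap=10)
chunks = splitter.split_text(text)

for c in chunks[:3]:
    print(repr(c))   # each chunk starts with the last few words of the previous one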
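One design note: the app detects a loaded document with `'docs' in locals()`, which works only because Streamlit re-executes the whole script (and therefore re-runs the loader) on every interaction. A common alternative, sketched below as a suggestion rather than a required change, is to persist the loaded documents in st.session_state.

# Sketch of an alternative to the 'docs' in locals() check.
# In the loading tab, after a successful load, the app would store:
#     st.session_state["docs"] = docs
# Then any later tab can read the cached value without re-loading:
import streamlit as st

docs = st.session_state.get("docs")   # None until a document has been loaded
if docs is None:
    st.info("Please load a document in the Document Loading tab first.")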