shwetashweta05 committed on
Commit
597e65b
·
verified ·
1 Parent(s): ffe927c

Update pages/6.Data Collection.py

Browse files
Files changed (1) hide show
  1. pages/6.Data Collection.py +0 -85
pages/6.Data Collection.py CHANGED
@@ -73,88 +73,3 @@ if data_type == "Structured":
73
  mime="application/octet-stream",
74
  )
75
 
76
- # CSV Format Section
77
- elif format_selected == "CSV":
78
- st.write("#### CSV Format")
79
-
80
- # Part (a) What it is
81
- st.subheader("What is CSV?")
82
- st.write("""
83
- CSV (Comma-Separated Values) is a lightweight text file format for structured data,
84
- where values are separated by commas. It is widely used for data exchange between systems.
85
- """)
86
-
87
- # Part (b) How to read these files
88
- st.subheader("How to Read CSV Files?")
89
- st.code("""
90
- import pandas as pd
91
- # Read a CSV file
92
- df = pd.read_csv("file.csv")
93
- print(df.head())
94
- """)
95
-
96
- # Part (c) Issues encountered
97
- st.subheader("Common Issues Encountered When Handling CSV Files")
98
- st.write("""
99
- - **Misaligned Rows**: Extra or missing delimiters can lead to misaligned rows.
100
- - **Encoding Problems**: Non-standard characters may cause encoding errors.
101
- - **Large Files**: Processing large CSV files can be resource-intensive.
102
- """)
103
-
104
- # Part (d) How to overcome these errors/issues
105
- st.subheader("How to Overcome These Issues?")
106
- st.write("""
107
- - **Misaligned Rows**: Use a consistent delimiter and validate the file before processing.
108
- - **Encoding Problems**: Explicitly specify the encoding format, e.g., `encoding='utf-8'`.
109
- - **Large Files**: Process the file in chunks using `pandas` (`chunksize` parameter).
110
- """)
111
-
112
- # Downloadable Guide Button
113
- st.markdown("### Download Coding Guide:")
114
- if st.button("Download CSV Guide"):
115
- # Provide a downloadable file
116
- file_path = "CSV_guide.ipynb" # Ensure this file exists in the app directory
117
- with open(file_path, "rb") as file:
118
- st.download_button(
119
- label="Download CSV Guide",
120
- data=file,
121
- file_name="CSV_guide.ipynb",
122
- mime="application/octet-stream",
123
- )
124
-
125
- # Add similar sections for "Unstructured" and "Semi-Structured" data types as needed.
126
-
127
- {
128
- "cells": [
129
- {
130
- "cell_type": "markdown",
131
- "metadata": {},
132
- "source": "## Excel Data Format\n\n### What is Excel?\nExcel is a tabular data format commonly used in business and analytics, with extensions `.xls` and `.xlsx`."
133
- },
134
- {
135
- "cell_type": "markdown",
136
- "metadata": {},
137
- "source": "### How to Read Excel Files\nUse the `pandas` library to read Excel files:"
138
- },
139
- {
140
- "cell_type": "code",
141
- "execution_count": null,
142
- "metadata": {},
143
- "outputs": [],
144
- "source": "import pandas as pd\n\ndf = pd.read_excel(\"example.xlsx\")\nprint(df.head())"
145
- },
146
- {
147
- "cell_type": "markdown",
148
- "metadata": {},
149
- "source": "### Common Issues\n1. Missing Data\n2. Encoding Problems\n3. File Corruption\n4. Large Files"
150
- },
151
- {
152
- "cell_type": "markdown",
153
- "metadata": {},
154
- "source": "### How to Overcome Issues\n1. Use data imputation methods for missing data.\n2. Specify encoding when reading files (`encoding='utf-8'`).\n3. Repair or convert corrupted files.\n4. Process large files in chunks with `pandas`."
155
- }
156
- ],
157
- "metadata": {},
158
- "nbformat": 4,
159
- "nbformat_minor": 2
160
- }
 
73
  mime="application/octet-stream",
74
  )
75