shwetashweta05
committed on
Update pages/6.Data Collection.py
Browse files - pages/6.Data Collection.py +0 -85
pages/6.Data Collection.py
CHANGED
@@ -73,88 +73,3 @@ if data_type == "Structured":
|
|
73 |
mime="application/octet-stream",
|
74 |
)
|
75 |
|
76 |
-
# CSV Format Section
|
77 |
-
elif format_selected == "CSV":
|
78 |
-
st.write("#### CSV Format")
|
79 |
-
|
80 |
-
# Part (a) What it is
|
81 |
-
st.subheader("What is CSV?")
|
82 |
-
st.write("""
|
83 |
-
CSV (Comma-Separated Values) is a lightweight text file format for structured data,
|
84 |
-
where values are separated by commas. It is widely used for data exchange between systems.
|
85 |
-
""")
|
86 |
-
|
87 |
-
# Part (b) How to read these files
|
88 |
-
st.subheader("How to Read CSV Files?")
|
89 |
-
st.code("""
|
90 |
-
import pandas as pd
|
91 |
-
# Read a CSV file
|
92 |
-
df = pd.read_csv("file.csv")
|
93 |
-
print(df.head())
|
94 |
-
""")
|
95 |
-
|
96 |
-
# Part (c) Issues encountered
|
97 |
-
st.subheader("Common Issues Encountered When Handling CSV Files")
|
98 |
-
st.write("""
|
99 |
-
- **Misaligned Rows**: Extra or missing delimiters can lead to misaligned rows.
|
100 |
-
- **Encoding Problems**: Non-standard characters may cause encoding errors.
|
101 |
-
- **Large Files**: Processing large CSV files can be resource-intensive.
|
102 |
-
""")
|
103 |
-
|
104 |
-
# Part (d) How to overcome these errors/issues
|
105 |
-
st.subheader("How to Overcome These Issues?")
|
106 |
-
st.write("""
|
107 |
-
- **Misaligned Rows**: Use a consistent delimiter and validate the file before processing.
|
108 |
-
- **Encoding Problems**: Explicitly specify the encoding format, e.g., `encoding='utf-8'`.
|
109 |
-
- **Large Files**: Process the file in chunks using `pandas` (`chunksize` parameter).
|
110 |
-
""")
|
111 |
-
|
112 |
-
# Downloadable Guide Button
|
113 |
-
st.markdown("### Download Coding Guide:")
|
114 |
-
if st.button("Download CSV Guide"):
|
115 |
-
# Provide a downloadable file
|
116 |
-
file_path = "CSV_guide.ipynb" # Ensure this file exists in the app directory
|
117 |
-
with open(file_path, "rb") as file:
|
118 |
-
st.download_button(
|
119 |
-
label="Download CSV Guide",
|
120 |
-
data=file,
|
121 |
-
file_name="CSV_guide.ipynb",
|
122 |
-
mime="application/octet-stream",
|
123 |
-
)
|
124 |
-
|
125 |
-
# Add similar sections for "Unstructured" and "Semi-Structured" data types as needed.
|
126 |
-
|
127 |
-
{
|
128 |
-
"cells": [
|
129 |
-
{
|
130 |
-
"cell_type": "markdown",
|
131 |
-
"metadata": {},
|
132 |
-
"source": "## Excel Data Format\n\n### What is Excel?\nExcel is a tabular data format commonly used in business and analytics, with extensions `.xls` and `.xlsx`."
|
133 |
-
},
|
134 |
-
{
|
135 |
-
"cell_type": "markdown",
|
136 |
-
"metadata": {},
|
137 |
-
"source": "### How to Read Excel Files\nUse the `pandas` library to read Excel files:"
|
138 |
-
},
|
139 |
-
{
|
140 |
-
"cell_type": "code",
|
141 |
-
"execution_count": null,
|
142 |
-
"metadata": {},
|
143 |
-
"outputs": [],
|
144 |
-
"source": "import pandas as pd\n\ndf = pd.read_excel(\"example.xlsx\")\nprint(df.head())"
|
145 |
-
},
|
146 |
-
{
|
147 |
-
"cell_type": "markdown",
|
148 |
-
"metadata": {},
|
149 |
-
"source": "### Common Issues\n1. Missing Data\n2. Encoding Problems\n3. File Corruption\n4. Large Files"
|
150 |
-
},
|
151 |
-
{
|
152 |
-
"cell_type": "markdown",
|
153 |
-
"metadata": {},
|
154 |
-
"source": "### How to Overcome Issues\n1. Use data imputation methods for missing data.\n2. Specify encoding when reading files (`encoding='utf-8'`).\n3. Repair or convert corrupted files.\n4. Process large files in chunks with `pandas`."
|
155 |
-
}
|
156 |
-
],
|
157 |
-
"metadata": {},
|
158 |
-
"nbformat": 4,
|
159 |
-
"nbformat_minor": 2
|
160 |
-
}
|
|
|
73 |
mime="application/octet-stream",
|
74 |
)
|
75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|