shwetashweta05
committed on
Update pages/6.Data Collection.py
Browse files- pages/6.Data Collection.py +75 -0
pages/6.Data Collection.py
CHANGED
@@ -75,3 +75,78 @@ if data_type == "Structured":
|
|
75 |
mime="application/octet-stream",
|
76 |
)
|
77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
mime="application/octet-stream",
|
76 |
)
|
77 |
|
78 |
+
# CSV Format Content
# Renders the CSV tutorial section: what the format is, how to read it with
# pandas, common pitfalls and their fixes, and a download link for a
# companion notebook.
if format_selected == "CSV":
    st.write("#### CSV Format")

    # Part (a) What it is
    st.subheader("What is CSV?")
    st.write("""
    CSV (Comma-Separated Values) is a plain-text file format used to store tabular data,
    where each row corresponds to a record, and fields are separated by commas.
    It is widely used for data exchange due to its simplicity and compatibility across systems.
    Common file extensions include `.csv`.
    """)

    # Part (b) How to Read These Files
    st.subheader("How to Read CSV Files?")
    st.code("""
    import pandas as pd
    # Reading a CSV file
    df = pd.read_csv("file.csv")
    print(df.head())

    # Reading a CSV file with custom delimiter
    df = pd.read_csv("file.csv", sep=";")
    """)

    # Part (c) Issues Encountered
    st.subheader("Common Issues Encountered When Handling CSV Files")
    st.write("""
    - **Incorrect Delimiters**: Files may use delimiters other than commas, e.g., semicolons or tabs.
    - **Encoding Problems**: Files with different encodings (e.g., UTF-8, ISO-8859-1) may cause errors.
    - **Missing or Corrupted Data**: Blank fields or inconsistencies in data.
    - **Header Issues**: Missing headers or extra/unexpected columns.
    - **Large File Sizes**: Memory limitations when processing large datasets.
    """)

    # Part (d) How to Overcome These Issues
    # Fenced examples are indented under their bullet so they stay attached
    # to the list item they illustrate.
    st.subheader("How to Overcome These Issues?")
    st.write("""
    - **Incorrect Delimiters**: Specify the correct delimiter when reading:
      ```python
      df = pd.read_csv("file.csv", sep=";")
      ```
    - **Encoding Problems**: Specify the encoding explicitly:
      ```python
      df = pd.read_csv("file.csv", encoding="utf-8")
      ```
    - **Missing or Corrupted Data**: Handle missing values using pandas:
      ```python
      df.fillna("NA", inplace=True)
      ```
    - **Header Issues**: Assign custom headers or skip problematic rows:
      ```python
      df = pd.read_csv("file.csv", header=None)
      df.columns = ["Column1", "Column2", "Column3"]
      ```
    - **Large Files**: Use chunk processing for large files:
      ```python
      chunks = pd.read_csv("file.csv", chunksize=1000)
      for chunk in chunks:
          process(chunk)
      ```
    """)

    # Downloadable Guide Button
    st.markdown("### Download Coding Guide:")
    # Provide a downloadable Jupyter Notebook file
    file_path = "CSV_guide.ipynb"  # Replace with the actual file path

    # The download button is rendered directly rather than behind a regular
    # st.button: st.button is only True for the single rerun after its click,
    # so a widget nested under it disappears on the next interaction.
    try:
        with open(file_path, "rb") as guide_file:
            guide_bytes = guide_file.read()
    except OSError:
        # A missing or unreadable guide should not take the whole page down.
        st.warning(f"The CSV guide ({file_path}) is currently unavailable.")
    else:
        st.download_button(
            label="Download CSV Guide",
            data=guide_bytes,
            file_name="CSV_guide.ipynb",
            mime="application/octet-stream",
        )
|