awacke1 committed
Commit dcaedc3 · Parent(s): 3b59fe8

Update app.py

Files changed (1):
  1. app.py +216 -28
app.py CHANGED
@@ -4,12 +4,73 @@ import os
 import urllib
 import base64
 from bs4 import BeautifulSoup
+import hashlib
+import json
+import uuid
+import glob
+import zipfile
 
 EXCLUDED_FILES = ['app.py', 'requirements.txt', 'pre-requirements.txt', 'packages.txt', 'README.md','.gitattributes', "backup.py","Dockerfile"]
-
-# Create a history.txt file if it doesn't exist yet
-with open("history.txt", "a+") as f:
-    f.close()
+URLS = {
+    "Chordify - Play Along Chords": "https://chordify.net/",
+    "National Guitar Academy - Guitar Learning": "https://www.guitaracademy.com/",
+    "Ultimate Guitar - Massive Song Database": "https://www.ultimate-guitar.com/",
+    "Wolf Alice": "https://www.chordie.com/song.php/songartist/Wolf+Alice/index.html",
+    "Everclear": "https://www.chordie.com/song.php/songartist/Everclear/index.html",
+    "Jungle": "https://www.ultimate-guitar.com/artist/jungle_47745",
+    "Mylie Cyrus": "https://www.ultimate-guitar.com/search.php?title=mile+cyrus&spelling=Mylie+cyrus",
+    "Kanye": "https://www.ultimate-guitar.com/search.php?search_type=title&value=Kanye%20west",
+    "Cat Stevens": "https://www.ultimate-guitar.com/search.php?search_type=title&value=cat%20stevens",
+    "Metric": "https://www.ultimate-guitar.com/search.php?search_type=title&value=Metric",
+    "John Lennon": "https://www.ultimate-guitar.com/search.php?search_type=title&value=John%20Lennon",
+}
+
+if not os.path.exists("history.json"):
+    with open("history.json", "w") as f:
+        json.dump({}, f)
+
+import os
+import base64
+import zipfile
+import streamlit as st
+
+def zip_subdirs(start_dir):
+    for subdir, dirs, files in os.walk(start_dir):
+        if subdir != start_dir:  # Skip the root directory
+            zip_filename = os.path.join(start_dir, subdir.split(os.sep)[-1] + '.zip')
+            with zipfile.ZipFile(zip_filename, 'w') as zipf:
+                for file in files:
+                    file_path = os.path.join(subdir, file)
+                    zipf.write(file_path, os.path.relpath(file_path, start_dir))
+                    st.write(f"Added: {file_path}")
+            yield zip_filename
+
+def get_zip_download_link(zip_file):
+    with open(zip_file, 'rb') as f:
+        bytes = f.read()
+    b64 = base64.b64encode(bytes).decode()
+    link_name = os.path.basename(zip_file)
+    href = f'<a href="data:file/zip;base64,{b64}" download="{link_name}">Download: {link_name}</a>'
+    return href
+
+
+@st.cache_resource
+def create_zip_of_files(files):
+    zip_name = "all_files.zip"
+    with zipfile.ZipFile(zip_name, 'w') as zipf:
+        for file in files:
+            zipf.write(file)
+    return zip_name
+
+@st.cache_resource
+def get_zip_download_link(zip_file):
+    with open(zip_file, 'rb') as f:
+        data = f.read()
+    b64 = base64.b64encode(data).decode()
+    href = f'<a href="data:application/zip;base64,{b64}" download="{zip_file}">Download All</a>'
+    return href
+
+
 
 def download_file(url, local_filename):
     if url.startswith('http://') or url.startswith('https://'):
@@ -23,54 +84,181 @@ def download_file(url, local_filename):
     except requests.exceptions.HTTPError as err:
         print(f"HTTP error occurred: {err}")
 
-def download_html_and_files(url):
+def download_html_and_files(url, subdir):
     html_content = requests.get(url).text
     soup = BeautifulSoup(html_content, 'html.parser')
     base_url = urllib.parse.urlunparse(urllib.parse.urlparse(url)._replace(path='', params='', query='', fragment=''))
+
     for link in soup.find_all('a'):
         file_url = urllib.parse.urljoin(base_url, link.get('href'))
-        local_filename = urllib.parse.urlparse(file_url).path.split('/')[-1]
-        if local_filename:
+        local_filename = os.path.join(subdir, urllib.parse.urlparse(file_url).path.split('/')[-1])
+
+        if not local_filename.endswith('/') and local_filename != subdir:
             link['href'] = local_filename
             download_file(file_url, local_filename)
-    with open("index.html", "w") as file:
+
+    with open(os.path.join(subdir, "index.html"), "w") as file:
         file.write(str(soup))
 
 def list_files(directory_path='.'):
     files = [f for f in os.listdir(directory_path) if os.path.isfile(os.path.join(directory_path, f))]
     return [f for f in files if f not in EXCLUDED_FILES]
 
+def file_editor(file_path):
+    st.write(f"Editing File: {os.path.basename(file_path)}")
+    file_content = ""
+
+    with open(file_path, "r") as f:
+        file_content = f.read()
+
+    file_content = st.text_area("Edit the file content:", value=file_content, height=250)
+
+    if st.button("💾 Save"):
+        with open(file_path, "w") as f:
+            f.write(file_content)
+        st.success(f"File '{os.path.basename(file_path)}' saved!")
+
+
+def show_file_operations(file_path, sequence_number):
+    #st.write(f"File: {os.path.basename(file_path)}")
+    unique_key = hashlib.md5(file_path.encode()).hexdigest()
+    file_content = ""
+
+    col01, col02, col1, col2, col3 = st.columns(5)
+    with col01:
+        st.write(os.path.basename(file_path))
+    #with col02:
+    #    st.write(file_path)
+    with col1:
+        edit_key = f"edit_{unique_key}_{sequence_number}"
+        if st.button(f"✏️ Edit", key=edit_key):
+            with open(file_path, "r") as f:
+                file_content = f.read()
+            text_area_key = f"text_area_{unique_key}_{sequence_number}"
+            file_content = st.text_area("Edit the file content:", value=file_content, height=250, key=text_area_key)
+
+    with col2:
+        save_key = f"save_{unique_key}_{sequence_number}"
+        if st.button(f"💾 Save", key=save_key):
+            if file_content:  # Ensure file_content is not empty
+                with open(file_path, "w") as f:
+                    f.write(file_content)
+                st.success(f"File saved!")
+
+    with col3:
+        delete_key = f"delete_{unique_key}_{sequence_number}"
+        if st.button(f"🗑️ Delete", key=delete_key):
+            os.remove(file_path)
+            st.markdown(f"File deleted!")
+
+
+file_sequence_numbers = {}
+
+def show_download_links(subdir):
+    global file_sequence_numbers
+    for file in list_files(subdir):
+        file_path = os.path.join(subdir, file)
+        if file_path not in file_sequence_numbers:
+            file_sequence_numbers[file_path] = 1
+        else:
+            file_sequence_numbers[file_path] += 1
+        sequence_number = file_sequence_numbers[file_path]
+
+        if os.path.isfile(file_path):
+            st.markdown(get_download_link(file_path), unsafe_allow_html=True)
+            show_file_operations(file_path, sequence_number)
+        else:
+            st.write(f"File not found: {file}")
+
 def get_download_link(file):
     with open(file, "rb") as f:
         bytes = f.read()
     b64 = base64.b64encode(bytes).decode()
-    href = f'<a href="data:file/octet-stream;base64,{b64}" download=\'{file}\'>Click to download {file}</a>'
+    href = f'<a href="data:file/octet-stream;base64,{b64}" download=\'{os.path.basename(file)}\'>Download: {os.path.basename(file)}</a>'
     return href
-
-def show_download_links():
-    st.sidebar.write('Here are the files you can download:')
-    for file in list_files():
-        st.sidebar.markdown(get_download_link(file), unsafe_allow_html=True)
-
+
 def main():
     st.sidebar.title('Web Datasets Bulk Downloader')
-    url = st.sidebar.text_input('Please enter a Web URL to bulk download text and files')
 
-    # Save the history of URL entered as a text file
-    if url:
-        with open("history.txt", "a") as f:
-            f.write(url + "\n")
+    # Check for query parameters for file editing
+    query_params = st.experimental_get_query_params()
+    file_to_edit = query_params.get('file_to_edit', [None])[0]
+
+    if file_to_edit and os.path.exists(file_to_edit):
+        file_editor(file_to_edit)
+    else:
+        # Selecting URL input method
+        url_input_method = st.sidebar.radio("Choose URL Input Method", ["Enter URL", "Select from List"])
+        url = ""
+        if url_input_method == "Enter URL":
+            url = st.sidebar.text_input('Please enter a Web URL to bulk download text and files')
+        else:
+            selected_site = st.sidebar.selectbox("Select a Website", list(URLS.keys()))
+            url = URLS[selected_site]
+
+        # Reading or creating history.json
+        if not os.path.exists("history.json"):
+            with open("history.json", "w") as f:
+                json.dump({}, f)
+
+        with open("history.json", "r") as f:
+            try:
+                history = json.load(f)
+            except:
+                print('error')
+
+        # Handling URL submission
+        if url:
+            subdir = hashlib.md5(url.encode()).hexdigest()
+            if not os.path.exists(subdir):
+                os.makedirs(subdir)
+            if url not in history:
+                history[url] = subdir
+                with open("history.json", "w") as f:
+                    json.dump(history, f)
 
-    if st.sidebar.button('📥 Get All the Content'):
-        download_html_and_files(url)
-        show_download_links()
-    if st.sidebar.button('📂 Show Download Links'):
-        show_download_links()
+        # Button for downloading content
+        if st.sidebar.button('📥 Get All the Content'):
+            download_html_and_files(url, history[url])
+            show_download_links(history[url])
+
+        # Button for showing download links
+        if st.sidebar.button('📂 Show Download Links'):
+            for subdir in history.values():
+                show_download_links(subdir)
 
-    # Display history as markdown
-    with open("history.txt", "r") as f:
-        history = f.read()
-        st.markdown(f"### History\n\n{history}")
+        if st.sidebar.button("🗑 Delete All"):
+            # Clear history file
+            with open("history.json", "w") as f:
+                json.dump({}, f)
+
+            # Delete all files in subdirectories
+            for subdir in glob.glob('*'):
+                if os.path.isdir(subdir) and subdir not in EXCLUDED_FILES:
+                    for file in os.listdir(subdir):
+                        file_path = os.path.join(subdir, file)
+                        os.remove(file_path)
+                        st.write(f"Deleted: {file_path}")
+                    os.rmdir(subdir)  # Remove the empty directory
+
+            st.experimental_rerun()
+
+        if st.sidebar.button("⬇️ Download All"):
+            start_directory = '.'  # Current directory
+            for zip_file in zip_subdirs(start_directory):
+                st.sidebar.markdown(get_zip_download_link(zip_file), unsafe_allow_html=True)
+
+        # Expander for showing URL history and download links
+        with st.expander("URL History and Downloaded Files"):
+            try:
+                for url, subdir in history.items():
+                    st.markdown(f"#### {url}")
+                    show_download_links(subdir)
+            except:
+                print('url history is empty')
+        # Update each time to show files we have
+        for subdir in history.values():
+            show_download_links(subdir)
 
 if __name__ == "__main__":
     main()
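The heart of this change: each URL now gets a stable per-site folder named by the MD5 hex digest of the URL, and the url → subdir mapping is persisted in history.json (replacing the old flat history.txt). A minimal standalone sketch of that scheme, standard library only; the function name and example URL are illustrative, not part of the commit:

```python
import hashlib
import json
import os

def subdir_for(url, history_path="history.json"):
    # One stable directory per URL: the MD5 hex digest of the URL string.
    subdir = hashlib.md5(url.encode()).hexdigest()
    os.makedirs(subdir, exist_ok=True)

    # Persist the url -> subdir mapping so later runs can re-list downloads.
    history = {}
    if os.path.exists(history_path):
        with open(history_path) as f:
            history = json.load(f)
    if url not in history:
        history[url] = subdir
        with open(history_path, "w") as f:
            json.dump(history, f)
    return subdir

# Illustrative call: returns a 32-character hex directory name.
# subdir_for("https://chordify.net/")
```

One caveat visible in the diff: main() loads history.json inside a bare try/except that only prints 'error', so a corrupt file would leave history undefined for the rest of the run.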
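download_html_and_files() now saves everything under the per-URL subdir and rewrites each anchor's href to the local path before writing index.html, so the saved page points at the downloaded copies. A sketch of just the rewrite-and-save step, assuming requests and beautifulsoup4 are installed; the per-file download is elided, and the function name and URL are illustrative:

```python
import os
import urllib.parse

import requests
from bs4 import BeautifulSoup

def rewrite_links(url, subdir):
    soup = BeautifulSoup(requests.get(url).text, "html.parser")
    # Strip path/query/fragment so relative hrefs resolve against the site root.
    base_url = urllib.parse.urlunparse(
        urllib.parse.urlparse(url)._replace(path="", params="", query="", fragment="")
    )
    for link in soup.find_all("a"):
        href = link.get("href")
        if not href:  # skip anchors without an href attribute
            continue
        file_url = urllib.parse.urljoin(base_url, href)
        name = urllib.parse.urlparse(file_url).path.split("/")[-1]
        if name:  # skip directory-style links that end in "/"
            # Point the saved page at the local copy (fetched separately).
            link["href"] = os.path.join(subdir, name)
    os.makedirs(subdir, exist_ok=True)
    with open(os.path.join(subdir, "index.html"), "w") as f:
        f.write(str(soup))

# Illustrative usage:
# rewrite_links("https://example.com/tabs", "mirror")
```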
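show_file_operations() derives its Streamlit widget keys from the MD5 of the file path plus a per-path sequence number; without unique keys, listing the same file twice (once after a button press, once in the expander) would raise Streamlit's DuplicateWidgetID error. A minimal sketch of that key scheme; the helper name and paths are illustrative:

```python
import hashlib

file_sequence_numbers = {}

def widget_key(prefix, file_path):
    # Count renders of this path within one script run so the same file
    # shown twice still yields distinct widget keys.
    file_sequence_numbers[file_path] = file_sequence_numbers.get(file_path, 0) + 1
    digest = hashlib.md5(file_path.encode()).hexdigest()
    return f"{prefix}_{digest}_{file_sequence_numbers[file_path]}"

# Illustrative usage:
# widget_key("edit", "somesubdir/index.html")  # -> "edit_<md5>_1"
# widget_key("edit", "somesubdir/index.html")  # -> "edit_<md5>_2"
```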