WGlint
/

Sfya_test

Sfya_test / test3 /kohya_ss /tools /convert_html_to_md.py

Upload folder using huggingface_hub

86f155b verified 11 months ago

1.35 kB

	import os
	from urllib.parse import urljoin, urlsplit

	import requests
	from bs4 import BeautifulSoup
	from html2text import html2text

	# Specify the URL of the webpage you want to scrape
	url = "https://hoshikat-hatenablog-com.translate.goog/entry/2023/05/26/223229?_x_tr_sl=auto&_x_tr_tl=en&_x_tr_hl=en-US&_x_tr_pto=wapp"

	# Send the HTTP request and keep the server's response in `r`.
	# requests applies no timeout by default, so set one explicitly to avoid
	# hanging forever on an unresponsive server.
	r = requests.get(url, timeout=30)

	# Stop on HTTP errors (4xx/5xx) instead of silently parsing and converting
	# an error page.
	r.raise_for_status()

	# Create a BeautifulSoup object and specify the parser
	soup = BeautifulSoup(r.text, 'html.parser')

	# Find all image tags
	images = soup.find_all('img')

	# Directory the downloaded images are written to; create it up front so
	# that open() below does not fail on a fresh checkout.
	image_dir = "./logs"
	os.makedirs(image_dir, exist_ok=True)

	for image in images:
	    # Skip <img> tags that carry no src attribute (or an empty one)
	    src = image.get('src')
	    if not src:
	        continue

	    # Resolve the (possibly relative) source against the page URL
	    image_url = urljoin(url, src)
	    parsed_url = urlsplit(image_url)

	    # Only http(s) sources can be downloaded; this skips inline data: URIs
	    # and similar schemes.
	    if parsed_url.scheme not in ("http", "https"):
	        continue

	    # Build the file name from the URL path only, so the query string and
	    # fragment do not end up in the name. Skip URLs whose path has no final
	    # component (e.g. it ends with "/").
	    image_filename = os.path.basename(parsed_url.path)
	    if not image_filename:
	        continue
	    image_name = os.path.join(image_dir, image_filename)

	    try:
	        # Stream the body to disk in chunks; the context manager releases
	        # the connection even if the download fails part-way.
	        with requests.get(image_url, stream=True, timeout=30) as image_response:
	            image_response.raise_for_status()

	            # Open the image file in write binary mode
	            with open(image_name, 'wb') as file:
	                # Write the image data to the file
	                for chunk in image_response.iter_content(chunk_size=8192):
	                    file.write(chunk)
	    except requests.RequestException as exc:
	        # One broken image should not abort the whole conversion
	        print(f"Skipping image {image_url}: {exc}")

	# Convert the HTML content of the fetched page to markdown
	markdown_content = html2text(r.text)

	# Save the markdown content to a file in the current working directory.
	# The encoding is given explicitly so the output does not depend on the
	# platform's default encoding.
	with open("converted_markdown.md", "w", encoding="utf8") as file:
	    file.write(markdown_content)