"""Scrape book metadata from biblio-globus.ru into per-category CSV files.

For every category on the catalogue page, walks each product preview's
paginated book list, extracts author/title/image/annotation from each
book page's ld+json block, and writes one ``data<n>.csv`` per category.
"""
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd

MAIN_PATH = 'https://www.biblio-globus.ru'
COLUMNS = ['page_url', 'image_url', 'author', 'title', 'annotation', 'category']
TIMEOUT = 30  # seconds; without a timeout one stalled server hangs the crawl

# Layout characters stripped from raw annotation text (newlines, tabs, NBSP).
# NOTE(review): the original also stripped the letter 'm', which mangled every
# word containing it — that looked like a typo and was dropped.
_STRIP_CHARS = {'\n', '\r', '\t', '\xa0'}


def _clean_annotation(raw):
    """Normalise annotation text: drop layout characters and cut the trailing
    'Характеристики' (specifications) section, then strip edge whitespace."""
    text = ''.join(ch for ch in raw if ch not in _STRIP_CHARS)
    return text.split('Характеристики', 1)[0].strip()


def _parse_book(session, book_url, category_name):
    """Fetch one book page and return a row matching COLUMNS.

    Returns None when the page's structured data is missing or malformed,
    so a single bad page cannot abort the whole crawl.
    """
    book_soup = BeautifulSoup(session.get(book_url, timeout=TIMEOUT).text, 'lxml')

    annotation_div = book_soup.find('div', id='collapseExample')
    annotation = _clean_annotation(annotation_div.text) if annotation_div else None

    try:
        script_tag = book_soup.find('script', attrs={'type': 'application/ld+json'})
        ld_json = json.loads(script_tag.text)
        # KeyError/TypeError added to the original AttributeError/JSONDecodeError:
        # a book page missing 'author'/'name'/'image' used to crash the run with
        # an uncaught KeyError; now that book is simply skipped.
        return [book_url, ld_json['image'], ld_json['author']['name'],
                ld_json['name'], annotation, category_name]
    except (AttributeError, json.JSONDecodeError, KeyError, TypeError):
        return None


def main():
    """Crawl every category and write data0.csv, data1.csv, ... one per category."""
    session = requests.Session()  # reuse connections across hundreds of requests

    listing = session.get(f'{MAIN_PATH}/catalog/categories', timeout=TIMEOUT)
    soup = BeautifulSoup(listing.text, 'lxml')
    categories = soup.find_all('li', attrs={'class': 'list-group-item'})

    for n, category in enumerate(categories):
        category_url = MAIN_PATH + category.find('a').get('href')
        category_soup = BeautifulSoup(
            session.get(category_url, timeout=TIMEOUT).text, 'lxml')
        previews = category_soup.find_all(
            'a', attrs={'class': 'product-preview-title'})

        rows = []  # accumulated rows for this category's CSV
        for product in previews:
            idd = product.get('href').split('/')[-1]
            page = 1
            while True:
                book_list_url = (f'https://www.biblio-globus.ru/catalog/category'
                                 f'?id={idd}&page={page}&sort=0')
                book_list_soup = BeautifulSoup(
                    session.get(book_list_url, timeout=TIMEOUT).text, 'lxml')
                book_list_links = book_list_soup.find_all(
                    'div', attrs={'class': 'text'})
                if not book_list_links:
                    break  # ran past the last page of this product listing

                for book_link in book_list_links:
                    book_url = MAIN_PATH + book_link.find('a').get('href')
                    row = _parse_book(session, book_url, category.text)
                    if row is None:
                        continue
                    rows.append(row)
                    title = row[3]
                    print(f'Parsed book: {title} (Category: {category.text})')
                page += 1

        # Save one CSV per category so partial progress survives a crash.
        pd.DataFrame(rows, columns=COLUMNS).to_csv(f'data{n}.csv', index=False)


if __name__ == '__main__':
    main()