"""Scrape book metadata from biblio-globus.ru into per-category CSV files.

For every category on the catalogue page, walks each product preview's
paginated book list, extracts author/title/image/annotation from each
book page's ld+json block, and writes one ``data<n>.csv`` per category.
"""
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd

MAIN_PATH = 'https://www.biblio-globus.ru'
COLUMNS = ['page_url', 'image_url', 'author', 'title', 'annotation', 'category']
TIMEOUT = 30  # seconds; without a timeout one stalled server hangs the crawl

# Layout characters stripped from raw annotation text (newlines, tabs, NBSP).
# NOTE(review): the original also stripped the letter 'm', which mangled every
# word containing it — that looked like a typo and was dropped.
_STRIP_CHARS = {'\n', '\r', '\t', '\xa0'}


def _clean_annotation(raw):
    """Normalise annotation text: drop layout characters and cut the trailing
    'Характеристики' (specifications) section, then strip edge whitespace."""
    text = ''.join(ch for ch in raw if ch not in _STRIP_CHARS)
    return text.split('Характеристики', 1)[0].strip()


def _parse_book(session, book_url, category_name):
    """Fetch one book page and return a row matching COLUMNS.

    Returns None when the page's structured data is missing or malformed,
    so a single bad page cannot abort the whole crawl.
    """
    book_soup = BeautifulSoup(session.get(book_url, timeout=TIMEOUT).text, 'lxml')

    annotation_div = book_soup.find('div', id='collapseExample')
    annotation = _clean_annotation(annotation_div.text) if annotation_div else None

    try:
        script_tag = book_soup.find('script', attrs={'type': 'application/ld+json'})
        ld_json = json.loads(script_tag.text)
        # KeyError/TypeError added to the original AttributeError/JSONDecodeError:
        # a book page missing 'author'/'name'/'image' used to crash the run with
        # an uncaught KeyError; now that book is simply skipped.
        return [book_url, ld_json['image'], ld_json['author']['name'],
                ld_json['name'], annotation, category_name]
    except (AttributeError, json.JSONDecodeError, KeyError, TypeError):
        return None


def main():
    """Crawl every category and write data0.csv, data1.csv, ... one per category."""
    session = requests.Session()  # reuse connections across hundreds of requests

    listing = session.get(f'{MAIN_PATH}/catalog/categories', timeout=TIMEOUT)
    soup = BeautifulSoup(listing.text, 'lxml')
    categories = soup.find_all('li', attrs={'class': 'list-group-item'})

    for n, category in enumerate(categories):
        category_url = MAIN_PATH + category.find('a').get('href')
        category_soup = BeautifulSoup(
            session.get(category_url, timeout=TIMEOUT).text, 'lxml')
        previews = category_soup.find_all(
            'a', attrs={'class': 'product-preview-title'})

        rows = []  # accumulated rows for this category's CSV
        for product in previews:
            idd = product.get('href').split('/')[-1]
            page = 1
            while True:
                book_list_url = (f'https://www.biblio-globus.ru/catalog/category'
                                 f'?id={idd}&page={page}&sort=0')
                book_list_soup = BeautifulSoup(
                    session.get(book_list_url, timeout=TIMEOUT).text, 'lxml')
                book_list_links = book_list_soup.find_all(
                    'div', attrs={'class': 'text'})
                if not book_list_links:
                    break  # ran past the last page of this product listing

                for book_link in book_list_links:
                    book_url = MAIN_PATH + book_link.find('a').get('href')
                    row = _parse_book(session, book_url, category.text)
                    if row is None:
                        continue
                    rows.append(row)
                    title = row[3]
                    print(f'Parsed book: {title} (Category: {category.text})')
                page += 1

        # Save one CSV per category so partial progress survives a crash.
        pd.DataFrame(rows, columns=COLUMNS).to_csv(f'data{n}.csv', index=False)


if __name__ == '__main__':
    main()