Commit f57ed01
Parent(s): 7c84120

init

parsing.py +73 -0
parsing.py ADDED
@@ -0,0 +1,73 @@
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd

booklist = requests.get('https://www.biblio-globus.ru/catalog/categories')
main_path = 'https://www.biblio-globus.ru'

soup = BeautifulSoup(booklist.text, 'lxml')
categories = soup.find_all('li', attrs={'class': 'list-group-item'})

columns = ['page_url', 'image_url', 'author', 'title', 'annotation', 'category']
n = 0
total_categories = len(categories)
start_index = 0

for i in range(start_index, total_categories):
    category = categories[i]
    category_url = main_path + category.find('a').get('href')
    category_page = requests.get(category_url)
    category_soup = BeautifulSoup(category_page.text, 'lxml')
    product_preview = category_soup.find_all('a', attrs={'class': 'product-preview-title'})

    df = []  # Collect the rows for this category; converted to a DataFrame below

    for product in product_preview:
        idd = product.get('href').split('/')[-1]
        page = 1

        while True:
            book_list_url = f'https://www.biblio-globus.ru/catalog/category?id={idd}&page={page}&sort=0'
            book_list_page = requests.get(book_list_url)
            book_list_soup = BeautifulSoup(book_list_page.text, 'lxml')
            book_list_links = book_list_soup.find_all('div', attrs={'class': 'text'})

            # An empty page means we have walked past the last page of the category
            if not book_list_links:
                break

            for book_link in book_list_links:
                book_url = main_path + book_link.find('a').get('href')
                book_page = requests.get(book_url)
                book_soup = BeautifulSoup(book_page.text, 'lxml')
                book_annotation = book_soup.find('div', id='collapseExample')

                if book_annotation:
                    # Strip whitespace-control characters, then cut off the trailing
                    # "Характеристики" (specifications) block
                    annotation = ''.join(symbol for symbol in book_annotation.text
                                         if symbol not in ('\n', '\r', '\t', '\xa0'))
                    annotation = annotation.split('Характеристики', 1)[0]
                    annotation = annotation.strip()
                else:
                    annotation = None

                # The book page embeds schema.org metadata as JSON-LD with
                # author.name, name, and image fields; skip books where it is
                # missing or malformed
                try:
                    book_json = book_soup.find('script', attrs={'type': 'application/ld+json'})
                    dict_json = json.loads(book_json.text)
                except (AttributeError, json.JSONDecodeError):
                    continue

                author = dict_json['author']['name']
                title = dict_json['name']
                image = dict_json['image']

                # Append this book's record
                df.append([book_url, image, author, title, annotation, category.text])

                # Report progress
                print(f'Parsed book: {title} (Category: {category.text})')

            page += 1

    # Save a CSV once every book in the current category has been processed
    data = pd.DataFrame(df, columns=columns)
    data.to_csv(f'data{n}.csv', index=False)
    n += 1
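
Since the script writes one data{n}.csv per category, a small follow-up step can merge them into a single dataset. A minimal sketch, assuming the per-category CSVs sit in the working directory and that books.csv is a hypothetical output name not used by the script itself:

import glob

import pandas as pd

# Merge the per-category CSVs produced by parsing.py into one file.
# Assumes they all share the column set defined in parsing.py.
parts = [pd.read_csv(path) for path in sorted(glob.glob('data*.csv'))]
books = pd.concat(parts, ignore_index=True)
books.to_csv('books.csv', index=False)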