Commit f57ed01
Parent(s): 7c84120

init

parsing.py +73 -0
parsing.py ADDED
@@ -0,0 +1,73 @@
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd

booklist = requests.get('https://www.biblio-globus.ru/catalog/categories')
main_path = 'https://www.biblio-globus.ru'

soup = BeautifulSoup(booklist.text, 'lxml')
categories = soup.find_all('li', attrs={'class': 'list-group-item'})

columns = ['page_url', 'image_url', 'author', 'title', 'annotation', 'category']
n = 0
total_categories = len(categories)
start_index = 0

for i in range(start_index, total_categories):
    category = categories[i]
    category_url = main_path + category.find('a').get('href')
    category_page = requests.get(category_url)
    category_soup = BeautifulSoup(category_page.text, 'lxml')
    product_preview = category_soup.find_all('a', attrs={'class': 'product-preview-title'})

    df = []  # Collect the rows for this category; converted to a DataFrame below

    for product in product_preview:
        idd = product.get('href').split('/')[-1]
        page = 1

        while True:
            book_list_url = f'https://www.biblio-globus.ru/catalog/category?id={idd}&page={page}&sort=0'
            book_list_page = requests.get(book_list_url)
            book_list_soup = BeautifulSoup(book_list_page.text, 'lxml')
            book_list_links = book_list_soup.find_all('div', attrs={'class': 'text'})

            # An empty page means we have walked past the last page of the category
            if not book_list_links:
                break

            for book_link in book_list_links:
                book_url = main_path + book_link.find('a').get('href')
                book_page = requests.get(book_url)
                book_soup = BeautifulSoup(book_page.text, 'lxml')
                book_annotation = book_soup.find('div', id='collapseExample')

                if book_annotation:
                    # Strip whitespace-control characters, then cut off the trailing
                    # "Характеристики" (specifications) block
                    annotation = ''.join(symbol for symbol in book_annotation.text
                                         if symbol not in ('\n', '\r', '\t', '\xa0'))
                    annotation = annotation.split('Характеристики', 1)[0]
                    annotation = annotation.strip()
                else:
                    annotation = None

                # The book page embeds schema.org metadata as JSON-LD with
                # author.name, name, and image fields; skip books where it is
                # missing or malformed
                try:
                    book_json = book_soup.find('script', attrs={'type': 'application/ld+json'})
                    dict_json = json.loads(book_json.text)
                except (AttributeError, json.JSONDecodeError):
                    continue

                author = dict_json['author']['name']
                title = dict_json['name']
                image = dict_json['image']

                # Append this book's record
                df.append([book_url, image, author, title, annotation, category.text])

                # Report progress
                print(f'Parsed book: {title} (Category: {category.text})')

            page += 1

    # Save a CSV once every book in the current category has been processed
    data = pd.DataFrame(df, columns=columns)
    data.to_csv(f'data{n}.csv', index=False)
    n += 1
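
Since the script writes one data{n}.csv per category, a small follow-up step can merge them into a single dataset. A minimal sketch, assuming the per-category CSVs sit in the working directory and that books.csv is a hypothetical output name not used by the script itself:

import glob

import pandas as pd

# Merge the per-category CSVs produced by parsing.py into one file.
# Assumes they all share the column set defined in parsing.py.
parts = [pd.read_csv(path) for path in sorted(glob.glob('data*.csv'))]
books = pd.concat(parts, ignore_index=True)
books.to_csv('books.csv', index=False)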