valeriedaash committed on
Commit
f57ed01
·
1 Parent(s): 7c84120
Files changed (1) hide show
  1. parsing.py +73 -0
parsing.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import json
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Root of the site; hrefs found in the scraped pages are resolved against it.
MAIN_PATH = 'https://www.biblio-globus.ru'
# Page that lists the top-level categories.
CATEGORIES_URL = MAIN_PATH + '/catalog/categories'
# Column order of every CSV written by this script.
COLUMNS = ['page_url', 'image_url', 'author', 'title', 'annotation', 'category']
# Index of the first category to process; raise it to resume an interrupted run.
START_INDEX = 0
# Seconds to wait for the server before giving up on a request. Without a
# timeout a single stalled connection would block the scraper forever.
REQUEST_TIMEOUT = 30

# Characters deleted from annotation text. Only whitespace-like characters are
# listed: the earlier version also deleted the Latin letter "m", which
# corrupted every annotation containing Latin-script words.
_ANNOTATION_JUNK = str.maketrans('', '', '\n\r\t\xa0')


def fetch_soup(url):
    """Download *url* and return its body parsed with the lxml parser.

    The HTTP status code is not checked, so an error page is parsed like any
    other page. Raises ``requests.RequestException`` (including ``Timeout``
    after ``REQUEST_TIMEOUT`` seconds) when the request itself fails.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(response.text, 'lxml')


def clean_annotation(raw_text):
    """Return annotation text without layout whitespace or the trailing spec table.

    Newlines, carriage returns, tabs and non-breaking spaces are deleted,
    everything from the first 'Характеристики' heading onwards is dropped,
    and the remainder is stripped of surrounding whitespace.
    """
    flattened = raw_text.translate(_ANNOTATION_JUNK)
    return flattened.split('Характеристики', 1)[0].strip()


def _author_name(author):
    """Return a printable author name from a JSON-LD ``author`` value.

    Handles a single object (``{'name': ...}``), a list of such values
    (names joined with ', '), a bare string, or a missing value (``None``).
    """
    if isinstance(author, dict):
        return author.get('name')
    if isinstance(author, list):
        names = [_author_name(item) for item in author]
        return ', '.join(name for name in names if name) or None
    return author


def extract_book_fields(book_data):
    """Return ``(author, title, image)`` from a decoded JSON-LD dict.

    Missing keys yield ``None`` instead of raising, so one incomplete book
    page cannot abort the whole scrape.
    """
    author = _author_name(book_data.get('author'))
    return author, book_data.get('name'), book_data.get('image')


def parse_book(book_url, category_name):
    """Scrape one book page and return its row as a dict keyed by ``COLUMNS``.

    Returns ``None`` when the page has no usable ``application/ld+json``
    script (absent, not valid JSON, or not a JSON object); such books are
    skipped, as before.
    """
    book_soup = fetch_soup(book_url)

    annotation_block = book_soup.find('div', id='collapseExample')
    annotation = None
    if annotation_block is not None:
        annotation = clean_annotation(annotation_block.text)

    json_tag = book_soup.find('script', attrs={'type': 'application/ld+json'})
    if json_tag is None:
        return None
    try:
        book_data = json.loads(json_tag.text)
    except json.JSONDecodeError:
        return None
    if not isinstance(book_data, dict):
        return None

    author, title, image = extract_book_fields(book_data)
    return {
        'page_url': book_url,
        'image_url': image,
        'author': author,
        'title': title,
        'annotation': annotation,
        'category': category_name,
    }


def iter_book_urls(section_id):
    """Yield the absolute URL of every book listed under catalogue id *section_id*.

    Listing pages are requested one after another starting at page 1 until a
    page contains no ``div.text`` blocks.
    """
    previous_urls = None
    page = 1
    while True:
        listing_soup = fetch_soup(
            f'{MAIN_PATH}/catalog/category?id={section_id}&page={page}&sort=0'
        )
        book_blocks = listing_soup.find_all('div', attrs={'class': 'text'})
        if not book_blocks:
            return

        page_urls = []
        for block in book_blocks:
            link = block.find('a')
            href = link.get('href') if link is not None else None
            if href:
                page_urls.append(urljoin(MAIN_PATH, href))

        # Stop as well when a page has no links or merely repeats the previous
        # page; otherwise a site that keeps serving its last page for
        # out-of-range page numbers would make this loop endless.
        if not page_urls or page_urls == previous_urls:
            return

        yield from page_urls
        previous_urls = page_urls
        page += 1


def scrape_category(category):
    """Return the rows for every book reachable from one category ``<li>`` tag.

    The category page is fetched, and each ``a.product-preview-title`` link on
    it contributes the last path segment of its href as a catalogue id whose
    paginated listing is then walked.
    """
    category_link = category.find('a')
    category_href = category_link.get('href') if category_link is not None else None
    if not category_href:
        return []

    category_soup = fetch_soup(urljoin(MAIN_PATH, category_href))
    previews = category_soup.find_all('a', attrs={'class': 'product-preview-title'})

    rows = []
    for preview in previews:
        preview_href = preview.get('href')
        if not preview_href:
            continue
        section_id = preview_href.split('/')[-1]

        for book_url in iter_book_urls(section_id):
            row = parse_book(book_url, category.text)
            if row is None:
                continue
            rows.append(row)
            # Report progress after every stored book.
            title = row['title']
            print(f'Parsed book: {title} (Category: {category.text})')
    return rows


def main():
    """Scrape every category from ``START_INDEX`` on, writing one CSV per category.

    Files are named ``data<index>.csv`` after the category's position in the
    category list, so resuming with a non-zero ``START_INDEX`` does not
    overwrite the files of categories that were already scraped.
    """
    categories_soup = fetch_soup(CATEGORIES_URL)
    categories = categories_soup.find_all('li', attrs={'class': 'list-group-item'})

    for index in range(START_INDEX, len(categories)):
        rows = scrape_category(categories[index])
        # Save the CSV once all books of the current category are processed.
        data = pd.DataFrame(rows, columns=COLUMNS)
        data.to_csv(f'data{index}.csv', index=False)


if __name__ == '__main__':
    main()