Spaces:
Sleeping
Sleeping
Commit
·
eb0c31c
1
Parent(s):
f57ed01
app added
Browse files- .gitattributes +3 -0
- app.py +47 -13
- data.csv +0 -0
- data_final.csv +3 -0
- data_prev.csv +3 -0
- embeddings.txt +3 -0
- pages/main.py +19 -0
- project.ipynb +453 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
embeddings.txt filter=lfs diff=lfs merge=lfs -text
|
37 |
+
data_final.csv filter=lfs diff=lfs merge=lfs -text
|
38 |
+
data_prev.csv filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
@@ -1,19 +1,53 @@
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
3 |
-
import
|
|
|
|
|
|
|
|
|
4 |
|
5 |
-
|
6 |
-
|
7 |
-
df = pd.read_csv('data.csv')
|
8 |
|
9 |
-
|
10 |
-
def show_random_rows():
|
11 |
-
random_rows = df.sample(10)[['author', 'title']]
|
12 |
-
st.table(random_rows)
|
13 |
|
14 |
-
|
15 |
-
st.title('Book recommender app')
|
16 |
|
17 |
-
|
18 |
-
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import torch
|
5 |
+
from transformers import AutoTokenizer, AutoModel
|
6 |
+
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity
|
7 |
+
import faiss
|
8 |
|
9 |
+
# Hugging Face checkpoint used to embed user queries.
MODEL_NAME = "cointegrated/rubert-tiny2"


@st.cache_resource
def _load_encoder(name=MODEL_NAME):
    """Load the tokenizer and encoder model once per server process.

    Streamlit re-executes this script on every widget interaction; caching
    keeps the model from being re-instantiated on each rerun.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    return AutoTokenizer.from_pretrained(name), AutoModel.from_pretrained(name)


@st.cache_data
def _load_books(path='data_final.csv'):
    """Read the book catalogue CSV, cached across script reruns."""
    return pd.read_csv(path)


tokenizer, model = _load_encoder()

df = _load_books()
|
|
|
|
|
|
|
13 |
|
14 |
+
# Default maximum number of tokens passed to the model; longer inputs are truncated.
MAX_LEN = 300


def embed_bert_cls(text, model, tokenizer, max_length=MAX_LEN):
    """Embed ``text`` as the normalized first-token hidden state of ``model``.

    Args:
        text: Input passed straight to ``tokenizer``.
        model: Callable encoder exposing ``.device`` and returning an object
            with a ``last_hidden_state`` tensor.
        tokenizer: Callable returning a mapping of tensors (``return_tensors='pt'``).
        max_length: Truncation length forwarded to the tokenizer.
            Defaults to ``MAX_LEN``.

    Returns:
        A NumPy array holding the embedding of the first item in the batch.
    """
    encoded = tokenizer(
        text,
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=max_length,
    )
    # Move every tokenizer output to the model's device before the forward pass.
    inputs = {key: tensor.to(model.device) for key, tensor in encoded.items()}
    with torch.no_grad():
        model_output = model(**inputs)
    # Position 0 along the sequence axis is taken as the sentence representation.
    embeddings = model_output.last_hidden_state[:, 0, :]
    # Scale each row to unit length (torch's default: p=2 along dim=1).
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()
|
23 |
+
|
24 |
+
@st.cache_resource
def _load_index(path='embeddings.txt'):
    """Load the precomputed book embeddings and build the inner-product index.

    Cached because Streamlit re-executes the script on every interaction and
    parsing the text file is expensive.

    Returns:
        A ``(embeddings, index)`` tuple.
    """
    # faiss stores float32 vectors; np.loadtxt would otherwise yield float64.
    embeddings = np.loadtxt(path, dtype=np.float32)
    flat_index = faiss.IndexFlatIP(embeddings.shape[1])
    flat_index.add(embeddings)
    return embeddings, flat_index


books_embs, index = _load_index()

st.title('Приложение для рекомендации книг')

text = st.text_input('Введите запрос:')
top_n = st.number_input('Введите количество рекомендаций:', min_value=1, max_value=50, value=1)

recommend_button = st.button('Найти')

if text and recommend_button:
    query_emb = embed_bert_cls(text, model, tokenizer)
    # D holds inner-product scores, I the matching row ids in the index.
    D, I = index.search(query_emb.reshape(1, -1), int(top_n))

    st.subheader('Топ рекомендуемых книг:')

    for i, j in zip(I[0], D[0]):
        # faiss pads with -1 when fewer than top_n vectors are available.
        if i < 0:
            continue

        # Index ids are positional.
        # NOTE(review): assumes row k of embeddings.txt describes row k of
        # data_final.csv -- confirm against the embedding script.
        book = df.iloc[int(i)]

        col_1, col_2 = st.columns([1, 3])

        with col_1:
            st.image(book['image_url'], use_column_width=True)
            # Score shown as a percentage-style number with two decimals.
            st.write(round(j * 100, 2))
        with col_2:
            st.write(f'Название книги: {book["title"]}')
            st.write(f'Автор: {book["author"]}')
            st.write(f'Ссылка: {book["page_url"]}')
            st.write(f'Аннотация: {book["annotation"]}')
|
53 |
+
|
data.csv
DELETED
The diff for this file is too large to render.
See raw diff
|
|
data_final.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6656361b1e4b9e9a9200fe464ca227c0ce285af2150975072f48e548a0fbd1b7
|
3 |
+
size 32055320
|
data_prev.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:aeb6a01f238ec78db9bf76ec1a12356eaececd9162bd38d91d838934b9a908aa
|
3 |
+
size 38974957
|
embeddings.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:631d7269f12874f12a6e7a732f431f75caeb00c16421234c5b07b7a814be113b
|
3 |
+
size 110210970
|
pages/main.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import random
|
4 |
+
|
5 |
+
# Load the book catalogue.
# 'data.csv' is deleted by the same commit that adds this page, so read the
# cleaned catalogue instead.
# NOTE(review): assumes 'data_final.csv' is the intended replacement; it
# carries the 'author' and 'title' columns used below -- confirm.
DATA_PATH = 'data_final.csv'
df = pd.read_csv(DATA_PATH)


def show_random_rows(n_rows=10):
    """Render a table with the author and title of randomly sampled books.

    Args:
        n_rows: Number of rows to sample; clamped to the size of the
            catalogue so a small dataset does not raise.
    """
    sample_size = min(n_rows, len(df))
    random_rows = df.sample(sample_size)[['author', 'title']]
    st.table(random_rows)


# Streamlit app
st.title('Book recommender app')

# Button to trigger displaying random rows
if st.button('Show some books'):
    show_random_rows()
|
project.ipynb
ADDED
@@ -0,0 +1,453 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 43,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"import pandas as pd"
|
10 |
+
]
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"cell_type": "code",
|
14 |
+
"execution_count": 63,
|
15 |
+
"metadata": {},
|
16 |
+
"outputs": [],
|
17 |
+
"source": [
|
18 |
+
"df0 = pd.read_csv('data_prev.csv')\n",
|
19 |
+
"df1 = pd.read_csv('data7.csv')\n",
|
20 |
+
"df2 = pd.read_csv('data6.csv')"
|
21 |
+
]
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"cell_type": "code",
|
25 |
+
"execution_count": 64,
|
26 |
+
"metadata": {},
|
27 |
+
"outputs": [],
|
28 |
+
"source": [
|
29 |
+
"df = pd.concat([df0, df1, df2], ignore_index=True)"
|
30 |
+
]
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"cell_type": "code",
|
34 |
+
"execution_count": 65,
|
35 |
+
"metadata": {},
|
36 |
+
"outputs": [
|
37 |
+
{
|
38 |
+
"data": {
|
39 |
+
"text/plain": [
|
40 |
+
"page_url 0\n",
|
41 |
+
"image_url 0\n",
|
42 |
+
"author 2\n",
|
43 |
+
"title 0\n",
|
44 |
+
"annotation 0\n",
|
45 |
+
"category 0\n",
|
46 |
+
"dtype: int64"
|
47 |
+
]
|
48 |
+
},
|
49 |
+
"execution_count": 65,
|
50 |
+
"metadata": {},
|
51 |
+
"output_type": "execute_result"
|
52 |
+
}
|
53 |
+
],
|
54 |
+
"source": [
|
55 |
+
"df.isna().sum()"
|
56 |
+
]
|
57 |
+
},
|
58 |
+
{
|
59 |
+
"cell_type": "code",
|
60 |
+
"execution_count": 66,
|
61 |
+
"metadata": {},
|
62 |
+
"outputs": [],
|
63 |
+
"source": [
|
64 |
+
"df.dropna(subset=['annotation', 'author'], inplace=True)"
|
65 |
+
]
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"cell_type": "code",
|
69 |
+
"execution_count": 67,
|
70 |
+
"metadata": {},
|
71 |
+
"outputs": [
|
72 |
+
{
|
73 |
+
"name": "stdout",
|
74 |
+
"output_type": "stream",
|
75 |
+
"text": [
|
76 |
+
"<class 'pandas.core.frame.DataFrame'>\n",
|
77 |
+
"Index: 24385 entries, 0 to 24386\n",
|
78 |
+
"Data columns (total 6 columns):\n",
|
79 |
+
" # Column Non-Null Count Dtype \n",
|
80 |
+
"--- ------ -------------- ----- \n",
|
81 |
+
" 0 page_url 24385 non-null object\n",
|
82 |
+
" 1 image_url 24385 non-null object\n",
|
83 |
+
" 2 author 24385 non-null object\n",
|
84 |
+
" 3 title 24385 non-null object\n",
|
85 |
+
" 4 annotation 24385 non-null object\n",
|
86 |
+
" 5 category 24385 non-null object\n",
|
87 |
+
"dtypes: object(6)\n",
|
88 |
+
"memory usage: 1.3+ MB\n"
|
89 |
+
]
|
90 |
+
}
|
91 |
+
],
|
92 |
+
"source": [
|
93 |
+
"df.info()"
|
94 |
+
]
|
95 |
+
},
|
96 |
+
{
|
97 |
+
"cell_type": "code",
|
98 |
+
"execution_count": 68,
|
99 |
+
"metadata": {},
|
100 |
+
"outputs": [],
|
101 |
+
"source": [
|
102 |
+
"df.drop_duplicates(subset=['author', 'title'], inplace=True)"
|
103 |
+
]
|
104 |
+
},
|
105 |
+
{
|
106 |
+
"cell_type": "code",
|
107 |
+
"execution_count": 69,
|
108 |
+
"metadata": {},
|
109 |
+
"outputs": [
|
110 |
+
{
|
111 |
+
"name": "stdout",
|
112 |
+
"output_type": "stream",
|
113 |
+
"text": [
|
114 |
+
"<class 'pandas.core.frame.DataFrame'>\n",
|
115 |
+
"Index: 24235 entries, 0 to 24385\n",
|
116 |
+
"Data columns (total 6 columns):\n",
|
117 |
+
" # Column Non-Null Count Dtype \n",
|
118 |
+
"--- ------ -------------- ----- \n",
|
119 |
+
" 0 page_url 24235 non-null object\n",
|
120 |
+
" 1 image_url 24235 non-null object\n",
|
121 |
+
" 2 author 24235 non-null object\n",
|
122 |
+
" 3 title 24235 non-null object\n",
|
123 |
+
" 4 annotation 24235 non-null object\n",
|
124 |
+
" 5 category 24235 non-null object\n",
|
125 |
+
"dtypes: object(6)\n",
|
126 |
+
"memory usage: 1.3+ MB\n"
|
127 |
+
]
|
128 |
+
}
|
129 |
+
],
|
130 |
+
"source": [
|
131 |
+
"df.info()"
|
132 |
+
]
|
133 |
+
},
|
134 |
+
{
|
135 |
+
"cell_type": "code",
|
136 |
+
"execution_count": 71,
|
137 |
+
"metadata": {},
|
138 |
+
"outputs": [
|
139 |
+
{
|
140 |
+
"data": {
|
141 |
+
"text/plain": [
|
142 |
+
"0 Евгений Онегин\n",
|
143 |
+
"1 Мастер и Маргарита (с иллюстрациями балета Сер...\n",
|
144 |
+
"2 Сто лет одиночества : [роман]\n",
|
145 |
+
"3 О дивный новый мир (замена картинки)\n",
|
146 |
+
"4 Дюна: [фантастический роман].\n",
|
147 |
+
" ... \n",
|
148 |
+
"24381 Игра в бисер\n",
|
149 |
+
"24382 Ромео и Джульетта\n",
|
150 |
+
"24383 Опасные связи : роман\n",
|
151 |
+
"24384 Тартарен из Тараскона\n",
|
152 |
+
"24385 Станция на горизонте\n",
|
153 |
+
"Name: title, Length: 24235, dtype: object"
|
154 |
+
]
|
155 |
+
},
|
156 |
+
"execution_count": 71,
|
157 |
+
"metadata": {},
|
158 |
+
"output_type": "execute_result"
|
159 |
+
}
|
160 |
+
],
|
161 |
+
"source": [
|
162 |
+
"df['title']"
|
163 |
+
]
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"cell_type": "code",
|
167 |
+
"execution_count": 72,
|
168 |
+
"metadata": {},
|
169 |
+
"outputs": [],
|
170 |
+
"source": [
|
171 |
+
"def has_partial_start(row, col_name):\n",
|
172 |
+
" name = row[col_name]\n",
|
173 |
+
" for other_name in df[col_name]:\n",
|
174 |
+
" if other_name != name and other_name.startswith(name):\n",
|
175 |
+
" return True\n",
|
176 |
+
" return False\n",
|
177 |
+
"\n",
|
178 |
+
"# def filter_partial_start(row, col_name):\n",
|
179 |
+
"# name = row[col_name]\n",
|
180 |
+
"# for other_name in df[col_name]:\n",
|
181 |
+
"# if other_name != name and other_name.startswith(name):\n",
|
182 |
+
"# return False\n",
|
183 |
+
"# return True\n",
|
184 |
+
"\n",
|
185 |
+
"# Применение функции для удаления строк с частичным совпадением в начале\n",
|
186 |
+
"filt_df = df[~df.apply(lambda x: has_partial_start(x, 'title'), axis=1)]"
|
187 |
+
]
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"cell_type": "code",
|
191 |
+
"execution_count": 80,
|
192 |
+
"metadata": {},
|
193 |
+
"outputs": [],
|
194 |
+
"source": [
|
195 |
+
"pd.set_option('display.max_rows', None)"
|
196 |
+
]
|
197 |
+
},
|
198 |
+
{
|
199 |
+
"cell_type": "code",
|
200 |
+
"execution_count": 81,
|
201 |
+
"metadata": {},
|
202 |
+
"outputs": [
|
203 |
+
{
|
204 |
+
"data": {
|
205 |
+
"text/plain": [
|
206 |
+
"category\n",
|
207 |
+
"Социология 3802\n",
|
208 |
+
"Фантастика. Фэнтези 2731\n",
|
209 |
+
"Классическая и современная проза 2265\n",
|
210 |
+
"Детективы 1834\n",
|
211 |
+
"Эзотерическая литература 1133\n",
|
212 |
+
"Клиническая медицина. Внутренние болезни 640\n",
|
213 |
+
"Популярная психология 603\n",
|
214 |
+
"Бизнес. Торговля 574\n",
|
215 |
+
"Фольклор 542\n",
|
216 |
+
"Философия 520\n",
|
217 |
+
"Филология 485\n",
|
218 |
+
"Испанский, португальский, итальянский языки 465\n",
|
219 |
+
"Религия 461\n",
|
220 |
+
"Маркетинг 441\n",
|
221 |
+
"Публицистика. Биографии. Мемуары 440\n",
|
222 |
+
"Науки о Земле 330\n",
|
223 |
+
"Биологические науки 322\n",
|
224 |
+
"прочие языки 322\n",
|
225 |
+
"Книги по программированию 319\n",
|
226 |
+
"Политика. Партии и движения 277\n",
|
227 |
+
"Любовные романы. Книги о любви 275\n",
|
228 |
+
"Исторические романы 267\n",
|
229 |
+
"Поэзия. Драматургия 245\n",
|
230 |
+
"Естественные науки в целом. Науковедение 219\n",
|
231 |
+
"Народная медицина. Нетрадиционные методы лечения 200\n",
|
232 |
+
"Математика 196\n",
|
233 |
+
"Автоматика. Радиоэлектроника. Связь 188\n",
|
234 |
+
"Энергетика. Машиностроение. Приборостроение 179\n",
|
235 |
+
"Физика 147\n",
|
236 |
+
"Транспорт 138\n",
|
237 |
+
"Военная наука. Военное дело 133\n",
|
238 |
+
"Детская психология 116\n",
|
239 |
+
"Приключения 110\n",
|
240 |
+
"Защита информации. Компьютерная безопасность 104\n",
|
241 |
+
"Общая патология. Общая терапия 103\n",
|
242 |
+
"Домашние животные 101\n",
|
243 |
+
"Хирургия. Онкология. Прикладные отрасли медицины 101\n",
|
244 |
+
"Здравоохранение. Гигиена. Эпидемиология 98\n",
|
245 |
+
"Химические науки 97\n",
|
246 |
+
"Прикладное программное обеспечение 96\n",
|
247 |
+
"Педиатрия 90\n",
|
248 |
+
"Астрономия 87\n",
|
249 |
+
"Строительство 76\n",
|
250 |
+
"Горная промышленность. Металлургия 73\n",
|
251 |
+
"Психологические школы и направления. Психоанализ 72\n",
|
252 |
+
"Ветеринария 71\n",
|
253 |
+
"Курортология. Физиотерапия. Лечебная физкультура 66\n",
|
254 |
+
"Химическая промышленность. Пищевая промышленность. Легкая промышленность. Деревообрабатывающая и целлюлозно-бумажная промышленность. Полиграфия 59\n",
|
255 |
+
"Экология 58\n",
|
256 |
+
"Психология общения 55\n",
|
257 |
+
"Базы и банки данных. СУБД 39\n",
|
258 |
+
"Общая психология. История психологии 37\n",
|
259 |
+
"Технические науки и промышленность в целом 37\n",
|
260 |
+
"Операционные системы и программы-оболочки 33\n",
|
261 |
+
"Отдельные философские учения 30\n",
|
262 |
+
"Компьютерные сети. Интернет 29\n",
|
263 |
+
"Механика 27\n",
|
264 |
+
"Фармакология. Лекарствоведение. Токсикология 24\n",
|
265 |
+
"Прикладная психология. Соционика. Педагогическая психология 11\n",
|
266 |
+
"Статистика. Демография 9\n",
|
267 |
+
"Мобильные устройства 6\n",
|
268 |
+
"Name: count, dtype: int64"
|
269 |
+
]
|
270 |
+
},
|
271 |
+
"execution_count": 81,
|
272 |
+
"metadata": {},
|
273 |
+
"output_type": "execute_result"
|
274 |
+
}
|
275 |
+
],
|
276 |
+
"source": [
|
277 |
+
"filt_df['category'].value_counts()"
|
278 |
+
]
|
279 |
+
},
|
280 |
+
{
|
281 |
+
"cell_type": "code",
|
282 |
+
"execution_count": 61,
|
283 |
+
"metadata": {},
|
284 |
+
"outputs": [
|
285 |
+
{
|
286 |
+
"name": "stdout",
|
287 |
+
"output_type": "stream",
|
288 |
+
"text": [
|
289 |
+
"<class 'pandas.core.frame.DataFrame'>\n",
|
290 |
+
"Index: 1627 entries, 0 to 24382\n",
|
291 |
+
"Data columns (total 6 columns):\n",
|
292 |
+
" # Column Non-Null Count Dtype \n",
|
293 |
+
"--- ------ -------------- ----- \n",
|
294 |
+
" 0 page_url 1627 non-null object\n",
|
295 |
+
" 1 image_url 1627 non-null object\n",
|
296 |
+
" 2 author 1627 non-null object\n",
|
297 |
+
" 3 title 1627 non-null object\n",
|
298 |
+
" 4 annotation 1627 non-null object\n",
|
299 |
+
" 5 category 1627 non-null object\n",
|
300 |
+
"dtypes: object(6)\n",
|
301 |
+
"memory usage: 89.0+ KB\n"
|
302 |
+
]
|
303 |
+
}
|
304 |
+
],
|
305 |
+
"source": [
|
306 |
+
"df.info()"
|
307 |
+
]
|
308 |
+
},
|
309 |
+
{
|
310 |
+
"cell_type": "code",
|
311 |
+
"execution_count": 62,
|
312 |
+
"metadata": {},
|
313 |
+
"outputs": [
|
314 |
+
{
|
315 |
+
"data": {
|
316 |
+
"text/plain": [
|
317 |
+
"0 Евгений Онегин\n",
|
318 |
+
"8 Дюна\n",
|
319 |
+
"9 Малое собрание сочинений\n",
|
320 |
+
"13 Мастер и Маргарита\n",
|
321 |
+
"22 Бедная Лиза\n",
|
322 |
+
" ... \n",
|
323 |
+
"24339 Остров сокровищ\n",
|
324 |
+
"24348 Дети капитана Гранта\n",
|
325 |
+
"24379 Милый друг\n",
|
326 |
+
"24380 Саломея\n",
|
327 |
+
"24382 Ромео и Джульетта\n",
|
328 |
+
"Name: title, Length: 1627, dtype: object"
|
329 |
+
]
|
330 |
+
},
|
331 |
+
"execution_count": 62,
|
332 |
+
"metadata": {},
|
333 |
+
"output_type": "execute_result"
|
334 |
+
}
|
335 |
+
],
|
336 |
+
"source": [
|
337 |
+
"df['title']"
|
338 |
+
]
|
339 |
+
},
|
340 |
+
{
|
341 |
+
"cell_type": "markdown",
|
342 |
+
"metadata": {},
|
343 |
+
"source": [
|
344 |
+
"### Фильтрация категорий\n"
|
345 |
+
]
|
346 |
+
},
|
347 |
+
{
|
348 |
+
"cell_type": "code",
|
349 |
+
"execution_count": 35,
|
350 |
+
"metadata": {},
|
351 |
+
"outputs": [],
|
352 |
+
"source": [
|
353 |
+
"category_counts = df['category'].value_counts()\n",
|
354 |
+
"\n",
|
355 |
+
"# Фильтрация категорий с количеством значений больше 500\n",
|
356 |
+
"filtered_categories = category_counts[category_counts > 500].index\n",
|
357 |
+
"\n",
|
358 |
+
"# Фильтрация DataFrame по отфильтрованным категориям\n",
|
359 |
+
"filtered_df = df[df['category'].isin(filtered_categories)]"
|
360 |
+
]
|
361 |
+
},
|
362 |
+
{
|
363 |
+
"cell_type": "code",
|
364 |
+
"execution_count": 36,
|
365 |
+
"metadata": {},
|
366 |
+
"outputs": [
|
367 |
+
{
|
368 |
+
"data": {
|
369 |
+
"text/plain": [
|
370 |
+
"category\n",
|
371 |
+
"Социология 3954\n",
|
372 |
+
"Фантастика. Фэнтези 2916\n",
|
373 |
+
"Классическая и современная проза 2716\n",
|
374 |
+
"Детективы 1979\n",
|
375 |
+
"Эзотерическая литература 1175\n",
|
376 |
+
"Клиническая медицина. Внутренние болезни 647\n",
|
377 |
+
"Популярная психология 629\n",
|
378 |
+
"Фольклор 613\n",
|
379 |
+
"Философия 598\n",
|
380 |
+
"Бизнес. Торговля 588\n",
|
381 |
+
"Филология 503\n",
|
382 |
+
"Name: count, dtype: int64"
|
383 |
+
]
|
384 |
+
},
|
385 |
+
"execution_count": 36,
|
386 |
+
"metadata": {},
|
387 |
+
"output_type": "execute_result"
|
388 |
+
}
|
389 |
+
],
|
390 |
+
"source": [
|
391 |
+
"filtered_df['category'].value_counts()"
|
392 |
+
]
|
393 |
+
},
|
394 |
+
{
|
395 |
+
"cell_type": "code",
|
396 |
+
"execution_count": 82,
|
397 |
+
"metadata": {},
|
398 |
+
"outputs": [],
|
399 |
+
"source": [
|
400 |
+
"categories_list = ['Классическая и современная проза', 'Детективы', 'Фантастика. Фэнтези', 'Любовные романы. Книги о любви', 'Поэзия. Драматургия', 'Общая патология. Общая терапия', 'Народная медицина. Нетрадиционные методы лечения', 'Клиническая медицина. Внутренние болезни', 'Ветеринария', 'Домашние животные', 'Публицистика. Биографии. Мемуары', 'Эзотерическая литература', 'Философия', 'Религия', 'Социология', 'Фольклор', 'Бизнес. Торговля', 'Маркетинг', 'Книги по программированию', 'Политика. Партии и движения', 'Филология', 'Математика', 'Физика', 'Астрономия', 'Науки о Земле', 'Экология', 'Биологические науки', 'Популярная психология', 'Психологические школы и направления. Психоанализ', 'Детская психология', 'Приключения', 'Исторические романы']\n",
|
401 |
+
"filtered_df = filt_df[filt_df['category'].isin(categories_list)]"
|
402 |
+
]
|
403 |
+
},
|
404 |
+
{
|
405 |
+
"cell_type": "code",
|
406 |
+
"execution_count": 84,
|
407 |
+
"metadata": {},
|
408 |
+
"outputs": [],
|
409 |
+
"source": [
|
410 |
+
"filtered_df.to_csv('data_final.csv', index=False)"
|
411 |
+
]
|
412 |
+
},
|
413 |
+
{
|
414 |
+
"cell_type": "code",
|
415 |
+
"execution_count": 29,
|
416 |
+
"metadata": {},
|
417 |
+
"outputs": [],
|
418 |
+
"source": [
|
419 |
+
"# df.to_csv('data_final.csv', index=False)"
|
420 |
+
]
|
421 |
+
},
|
422 |
+
{
|
423 |
+
"cell_type": "code",
|
424 |
+
"execution_count": 83,
|
425 |
+
"metadata": {},
|
426 |
+
"outputs": [],
|
427 |
+
"source": [
|
428 |
+
"# filtered_df.to_csv('filtered_data_without_dubs.csv', index=False) "
|
429 |
+
]
|
430 |
+
}
|
431 |
+
],
|
432 |
+
"metadata": {
|
433 |
+
"kernelspec": {
|
434 |
+
"display_name": "faiss_env",
|
435 |
+
"language": "python",
|
436 |
+
"name": "python3"
|
437 |
+
},
|
438 |
+
"language_info": {
|
439 |
+
"codemirror_mode": {
|
440 |
+
"name": "ipython",
|
441 |
+
"version": 3
|
442 |
+
},
|
443 |
+
"file_extension": ".py",
|
444 |
+
"mimetype": "text/x-python",
|
445 |
+
"name": "python",
|
446 |
+
"nbconvert_exporter": "python",
|
447 |
+
"pygments_lexer": "ipython3",
|
448 |
+
"version": "3.10.12"
|
449 |
+
}
|
450 |
+
},
|
451 |
+
"nbformat": 4,
|
452 |
+
"nbformat_minor": 2
|
453 |
+
}
|