Spaces:
Sleeping
Sleeping
Commit
·
99c7207
1
Parent(s):
edf802b
filtering added
Browse files- README.md +14 -0
- app.py +213 -19
- pages/main.py +0 -19
README.md
CHANGED
@@ -10,3 +10,17 @@ pinned: false
|
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
---
|
11 |
|
12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
13 |
+
## Умный поиск книг
|
14 |
+
|
15 |
+
## 🦸‍♂️ Команда
|
16 |
+
1. [Валерия Дашиева](https://github.com/valeriedaash)
|
17 |
+
2. [Марина Кочетова](https://github.com/neonanet)
|
18 |
+
|
19 |
+
## 🎯 Задача
|
20 |
+
Разработать систему поиска книги по пользовательскому запросу. Сервис должен принимать на вход описание книги от пользователя и возвращать заданное количество подходящих вариантов.
|
21 |
+
|
22 |
+
## Как пользоваться
|
23 |
+
для локального запуска:
|
24 |
+
1) скопируйте репозиторий на свой компьютер
|
25 |
+
2) установите необходимые библиотеки из requirements.txt
|
26 |
+
3) запустите app.py
|
app.py
CHANGED
@@ -1,9 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
import torch
|
5 |
from transformers import AutoTokenizer, AutoModel
|
6 |
-
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity
|
7 |
import faiss
|
8 |
|
9 |
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
|
@@ -12,6 +182,12 @@ model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
|
|
12 |
df = pd.read_csv('data_final.csv')
|
13 |
|
14 |
MAX_LEN = 300
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
def embed_bert_cls(text, model, tokenizer):
|
17 |
t = tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=MAX_LEN)
|
@@ -21,13 +197,28 @@ def embed_bert_cls(text, model, tokenizer):
|
|
21 |
embeddings = torch.nn.functional.normalize(embeddings)
|
22 |
return embeddings[0].cpu().numpy()
|
23 |
|
24 |
-
|
25 |
-
|
26 |
-
index =
|
27 |
-
index.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
st.title('Приложение для рекомендации книг')
|
30 |
|
|
|
|
|
31 |
text = st.text_input('Введите запрос:')
|
32 |
top_n = st.number_input('Введите количество рекомендаций:', min_value=1, max_value=50, value=1)
|
33 |
|
@@ -35,19 +226,22 @@ recommend_button = st.button('Найти')
|
|
35 |
|
36 |
if text and recommend_button:
|
37 |
query_emb = embed_bert_cls(text, model, tokenizer)
|
38 |
-
|
39 |
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
-
for i, j in zip(I[0], D[0]):
|
43 |
-
col_1, col_2 = st.columns([1, 3])
|
44 |
-
|
45 |
-
with col_1:
|
46 |
-
st.image(df['image_url'][i], use_column_width=True)
|
47 |
-
st.write(round(j* 100, 2))
|
48 |
-
with col_2:
|
49 |
-
st.write(f'Название книги: **{df["title"][i]}**')
|
50 |
-
st.write(f'Автор: {df["author"][i]}')
|
51 |
-
st.write(f'Ссылка: {df["page_url"][i]}')
|
52 |
-
st.write(f'Аннотация: {df["annotation"][i]}')
|
53 |
-
|
|
|
1 |
+
# БЕЗ ФИЛЬТРА КАТЕГОРИЙ
|
2 |
+
# import streamlit as st
|
3 |
+
# import pandas as pd
|
4 |
+
# import numpy as np
|
5 |
+
# import torch
|
6 |
+
# from transformers import AutoTokenizer, AutoModel
|
7 |
+
# from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity
|
8 |
+
# import faiss
|
9 |
+
|
10 |
+
# tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
|
11 |
+
# model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
|
12 |
+
|
13 |
+
# df = pd.read_csv('data_final.csv')
|
14 |
+
|
15 |
+
# MAX_LEN = 300
|
16 |
+
|
17 |
+
# def embed_bert_cls(text, model, tokenizer):
|
18 |
+
# t = tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=MAX_LEN)
|
19 |
+
# with torch.no_grad():
|
20 |
+
# model_output = model(**{k: v.to(model.device) for k, v in t.items()})
|
21 |
+
# embeddings = model_output.last_hidden_state[:, 0, :]
|
22 |
+
# embeddings = torch.nn.functional.normalize(embeddings)
|
23 |
+
# return embeddings[0].cpu().numpy()
|
24 |
+
|
25 |
+
# books_embs = np.loadtxt('vectors.txt')
|
26 |
+
|
27 |
+
# index = faiss.IndexFlatIP(books_embs.shape[1])
|
28 |
+
# index.add(books_embs)
|
29 |
+
|
30 |
+
# st.title('Приложение для рекомендации книг')
|
31 |
+
|
32 |
+
# text = st.text_input('Введите запрос:')
|
33 |
+
# top_n = st.number_input('Введите количество рекомендаций:', min_value=1, max_value=50, value=1)
|
34 |
+
|
35 |
+
# recommend_button = st.button('Найти')
|
36 |
+
|
37 |
+
# if text and recommend_button:
|
38 |
+
# query_emb = embed_bert_cls(text, model, tokenizer)
|
39 |
+
# D, I = index.search(query_emb.reshape(1, -1), top_n)
|
40 |
+
|
41 |
+
# st.subheader('Топ рекомендуемых книг:')
|
42 |
+
|
43 |
+
# for i, j in zip(I[0], D[0]):
|
44 |
+
# col_1, col_2 = st.columns([1, 3])
|
45 |
+
|
46 |
+
# with col_1:
|
47 |
+
# st.image(df['image_url'][i], use_column_width=True)
|
48 |
+
# st.write(round(j* 100, 2))
|
49 |
+
# with col_2:
|
50 |
+
# st.write(f'Название книги: **{df["title"][i]}**')
|
51 |
+
# st.write(f'Автор: {df["author"][i]}')
|
52 |
+
# st.write(f'Ссылка: {df["page_url"][i]}')
|
53 |
+
# st.write(f'Аннотация: {df["annotation"][i]}')
|
54 |
+
# БЕЗ КЭШИРОВАНИЯ
|
55 |
+
# import streamlit as st
|
56 |
+
# import pandas as pd
|
57 |
+
# import numpy as np
|
58 |
+
# import torch
|
59 |
+
# from transformers import AutoTokenizer, AutoModel
|
60 |
+
# from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity
|
61 |
+
# import faiss
|
62 |
+
|
63 |
+
# tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
|
64 |
+
# model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
|
65 |
+
|
66 |
+
# df = pd.read_csv('data_final.csv')
|
67 |
+
|
68 |
+
# MAX_LEN = 300
|
69 |
+
|
70 |
+
# def embed_bert_cls(text, model, tokenizer):
|
71 |
+
# t = tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=MAX_LEN)
|
72 |
+
# with torch.no_grad():
|
73 |
+
# model_output = model(**{k: v.to(model.device) for k, v in t.items()})
|
74 |
+
# embeddings = model_output.last_hidden_state[:, 0, :]
|
75 |
+
# embeddings = torch.nn.functional.normalize(embeddings)
|
76 |
+
# return embeddings[0].cpu().numpy()
|
77 |
+
|
78 |
+
# books_embs = np.loadtxt('vectors.txt')
|
79 |
+
|
80 |
+
# index = faiss.IndexFlatIP(books_embs.shape[1])
|
81 |
+
# index.add(books_embs)
|
82 |
+
|
83 |
+
# st.title('Приложение для рекомендации книг')
|
84 |
+
|
85 |
+
# # Добавляем опциональный фильтр для выбора категории книги
|
86 |
+
# category_filter = st.selectbox('Выберите категорию книги (необязательно)', ['Все'] + list(df['category'].unique()))
|
87 |
+
|
88 |
+
# text = st.text_input('Введите запрос:')
|
89 |
+
# top_n = st.number_input('Введите количество рекомендаций:', min_value=1, max_value=50, value=1)
|
90 |
+
|
91 |
+
# recommend_button = st.button('Найти')
|
92 |
+
|
93 |
+
# if text and recommend_button:
|
94 |
+
# query_emb = embed_bert_cls(text, model, tokenizer)
|
95 |
+
# D, I = index.search(query_emb.reshape(1, -1), top_n)
|
96 |
+
|
97 |
+
# st.subheader('Топ рекомендуемых книг:')
|
98 |
+
|
99 |
+
# for i, j in zip(I[0], D[0]):
|
100 |
+
# # Добавляем фильтрацию по выбранной категории книги, если выбрана конкретная категория
|
101 |
+
# if category_filter == 'Все' or df['category'][i] == category_filter:
|
102 |
+
# col_1, col_2 = st.columns([1, 3])
|
103 |
+
|
104 |
+
# with col_1:
|
105 |
+
# st.image(df['image_url'][i], use_column_width=True)
|
106 |
+
# st.write(round(j* 100, 2))
|
107 |
+
# with col_2:
|
108 |
+
# st.write(f'Название книги: **{df["title"][i]}**')
|
109 |
+
# st.write(f'Автор: {df["author"][i]}')
|
110 |
+
# st.write(f'Ссылка: {df["page_url"][i]}')
|
111 |
+
# st.write(f'Аннотация: {df["annotation"][i]}')
|
112 |
+
# import streamlit as st
|
113 |
+
# import pandas as pd
|
114 |
+
# import numpy as np
|
115 |
+
# import torch
|
116 |
+
# from transformers import AutoTokenizer, AutoModel
|
117 |
+
# from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity
|
118 |
+
# import faiss
|
119 |
+
|
120 |
+
# tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
|
121 |
+
# model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
|
122 |
+
|
123 |
+
# df = pd.read_csv('data_final.csv')
|
124 |
+
|
125 |
+
# MAX_LEN = 300
|
126 |
+
|
127 |
+
# # @st.cache(hash_funcs={tokenizers.Tokenizer: my_hash_func})
|
128 |
+
# def embed_bert_cls(text, model, tokenizer):
|
129 |
+
# t = tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=MAX_LEN)
|
130 |
+
# with torch.no_grad():
|
131 |
+
# model_output = model(**{k: v.to(model.device) for k, v in t.items()})
|
132 |
+
# embeddings = model_output.last_hidden_state[:, 0, :]
|
133 |
+
# embeddings = torch.nn.functional.normalize(embeddings)
|
134 |
+
# return embeddings[0].cpu().numpy()
|
135 |
+
|
136 |
+
# @st.cache_data
|
137 |
+
# def load_faiss_index():
|
138 |
+
# books_embs = np.loadtxt('vectors.txt')
|
139 |
+
# index = faiss.IndexFlatIP(books_embs.shape[1])
|
140 |
+
# index.add(books_embs)
|
141 |
+
# return index
|
142 |
+
|
143 |
+
# st.title('Приложение для рекомендации книг')
|
144 |
+
|
145 |
+
# category_filter = st.selectbox('Выберите категорию книги (необязательно)', ['Все'] + list(df['category'].unique()))
|
146 |
+
|
147 |
+
# text = st.text_input('Введите запрос:')
|
148 |
+
# top_n = st.number_input('Введите количество рекомендаций:', min_value=1, max_value=50, value=1)
|
149 |
+
|
150 |
+
# recommend_button = st.button('Найти')
|
151 |
+
|
152 |
+
# if text and recommend_button:
|
153 |
+
# query_emb = embed_bert_cls(text, model, tokenizer)
|
154 |
+
# index = load_faiss_index()
|
155 |
+
# D, I = index.search(query_emb.reshape(1, -1), top_n)
|
156 |
+
|
157 |
+
# st.subheader('Топ рекомендуемых книг:')
|
158 |
+
|
159 |
+
# for i, j in zip(I[0], D[0]):
|
160 |
+
# if category_filter == 'Все' or df['category'][i] == category_filter:
|
161 |
+
# col_1, col_2 = st.columns([1, 3])
|
162 |
+
|
163 |
+
# with col_1:
|
164 |
+
# st.image(df['image_url'][i], use_column_width=True)
|
165 |
+
# st.write(round(j, 2))
|
166 |
+
# with col_2:
|
167 |
+
# st.write(f'Название книги: **{df["title"][i]}**')
|
168 |
+
# st.write(f'Автор: {df["author"][i]}')
|
169 |
+
# st.write(f'Ссылка: {df["page_url"][i]}')
|
170 |
+
# st.write(f'Аннотация: {df["annotation"][i]}')
|
171 |
+
|
172 |
import streamlit as st
|
173 |
import pandas as pd
|
174 |
import numpy as np
|
175 |
import torch
|
176 |
from transformers import AutoTokenizer, AutoModel
|
|
|
177 |
import faiss
|
178 |
|
179 |
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
|
|
|
182 |
df = pd.read_csv('data_final.csv')
|
183 |
|
184 |
MAX_LEN = 300
|
185 |
+
@st.cache_data
|
186 |
+
def load_faiss_index():
|
187 |
+
books_embs = np.loadtxt('vectors.txt')
|
188 |
+
index = faiss.IndexFlatIP(books_embs.shape[1])
|
189 |
+
index.add(books_embs)
|
190 |
+
return index
|
191 |
|
192 |
def embed_bert_cls(text, model, tokenizer):
|
193 |
t = tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=MAX_LEN)
|
|
|
197 |
embeddings = torch.nn.functional.normalize(embeddings)
|
198 |
return embeddings[0].cpu().numpy()
|
199 |
|
200 |
+
@st.cache_data()
|
201 |
+
def get_recommendations(query_emb, top_n):
|
202 |
+
index = load_faiss_index()
|
203 |
+
D, I = index.search(query_emb.reshape(1, -1), top_n)
|
204 |
+
recommendations = []
|
205 |
+
for i, j in zip(I[0], D[0]):
|
206 |
+
recommendation = {
|
207 |
+
'image_url': df['image_url'][i],
|
208 |
+
'title': df['title'][i],
|
209 |
+
'author': df['author'][i],
|
210 |
+
'page_url': df['page_url'][i],
|
211 |
+
'annotation': df['annotation'][i],
|
212 |
+
'category': df['category'][i],
|
213 |
+
'similarity_score': round(j, 2)
|
214 |
+
}
|
215 |
+
recommendations.append(recommendation)
|
216 |
+
return recommendations
|
217 |
|
218 |
st.title('Приложение для рекомендации книг')
|
219 |
|
220 |
+
category_filter = st.selectbox('Выберите категорию книги (необязательно)', ['Все'] + list(df['category'].unique()))
|
221 |
+
|
222 |
text = st.text_input('Введите запрос:')
|
223 |
top_n = st.number_input('Введите количество рекомендаций:', min_value=1, max_value=50, value=1)
|
224 |
|
|
|
226 |
|
227 |
if text and recommend_button:
|
228 |
query_emb = embed_bert_cls(text, model, tokenizer)
|
229 |
+
recommendations = get_recommendations(query_emb, top_n)
|
230 |
|
231 |
+
if not recommendations: # Если рекомендации не найдены
|
232 |
+
st.write('По вашему запросу ничего не найдено.')
|
233 |
+
else:
|
234 |
+
st.subheader('Топ рекомендуемых книг:')
|
235 |
+
for recommendation in recommendations:
|
236 |
+
if category_filter == 'Все' or recommendation['category'] == category_filter:
|
237 |
+
col_1, col_2 = st.columns([1, 3])
|
238 |
+
with col_1:
|
239 |
+
st.image(recommendation['image_url'], use_column_width=True)
|
240 |
+
st.write(recommendation['similarity_score'])
|
241 |
+
with col_2:
|
242 |
+
st.write(f'Название книги: **{recommendation["title"]}**')
|
243 |
+
st.write(f'Автор: {recommendation["author"]}')
|
244 |
+
st.write(f'Ссылка: {recommendation["page_url"]}')
|
245 |
+
st.write(f'Аннотация: {recommendation["annotation"]}')
|
246 |
+
st.write(f'Категория: {recommendation["category"]}')
|
247 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pages/main.py
DELETED
@@ -1,19 +0,0 @@
|
|
1 |
-
import streamlit as st
|
2 |
-
import pandas as pd
|
3 |
-
import random
|
4 |
-
|
5 |
-
# Load your dataset
|
6 |
-
# Replace 'your_dataset.csv' with the actual filename or path
|
7 |
-
df = pd.read_csv('data.csv')
|
8 |
-
|
9 |
-
# Function to display 10 random rows on button click
|
10 |
-
def show_random_rows():
|
11 |
-
random_rows = df.sample(10)[['author', 'title']]
|
12 |
-
st.table(random_rows)
|
13 |
-
|
14 |
-
# Streamlit app
|
15 |
-
st.title('Book recommender app')
|
16 |
-
|
17 |
-
# Button to trigger displaying random rows
|
18 |
-
if st.button('Show some books'):
|
19 |
-
show_random_rows()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|