Spaces:
Sleeping
Sleeping
Commit
·
a4cd013
1
Parent(s):
2d3924f
sentence updated
Browse files- .gitattributes +1 -0
- annotation_embeddings.txt +2 -2
- data_prev.csv → data_sent.csv +2 -2
- pages/sentence.py +50 -0
- project.ipynb +31 -2
.gitattributes
CHANGED
@@ -38,3 +38,4 @@ data_final.csv filter=lfs diff=lfs merge=lfs -text
|
|
38 |
data_prev.csv filter=lfs diff=lfs merge=lfs -text
|
39 |
vectors.txt filter=lfs diff=lfs merge=lfs -text
|
40 |
annotation_embeddings.txt filter=lfs diff=lfs merge=lfs -text
|
|
|
|
38 |
data_prev.csv filter=lfs diff=lfs merge=lfs -text
|
39 |
vectors.txt filter=lfs diff=lfs merge=lfs -text
|
40 |
annotation_embeddings.txt filter=lfs diff=lfs merge=lfs -text
|
41 |
+
data_sent.csv filter=lfs diff=lfs merge=lfs -text
|
annotation_embeddings.txt
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cb639307c4b84cd1f790e3bc872cbd78c7248df04deeb73bbe2969206170f406
|
3 |
+
size 153369022
|
data_prev.csv → data_sent.csv
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7212f78b7f7cbdb168779a05e4ee7e522093f00fb8994d9d4689b2e11cc73f77
|
3 |
+
size 30694340
|
pages/sentence.py
CHANGED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
from sentence_transformers import SentenceTransformer
|
5 |
+
import faiss
|
6 |
+
from sklearn.preprocessing import normalize
|
7 |
+
# Page header: a centred 24px title injected as raw HTML.
PAGE_TITLE_HTML = (
    '<div style="text-align: center; font-size: 24px;">'
    'Умный поиск книг с использованием SentenceTransformer (msmarco-distilbert-base-v4)'
    '</div>'
)
st.markdown(PAGE_TITLE_HTML, unsafe_allow_html=True)

# Book metadata table. search_similar_books() addresses its rows positionally
# (df.iloc) with the indices returned by the FAISS index.
# NOTE(review): presumably row i must correspond to line i of
# annotation_embeddings.txt -- confirm that data_final.csv (rather than
# data_sent.csv) is the table those embeddings were generated from.
df = pd.read_csv("data_final.csv")
|
10 |
+
|
11 |
+
def load_embeddings_from_file(embedding_file):
    """Load embeddings from a text file and L2-normalise every row.

    Each non-blank line of the file is parsed as one vector of
    whitespace-separated floats.

    Args:
        embedding_file: Path to the text file holding one embedding per line.

    Returns:
        A C-contiguous 2-D ``float32`` array with one row per embedding,
        each row scaled by ``sklearn.preprocessing.normalize`` (L2 norm).

    Raises:
        ValueError: If a token cannot be parsed as a float, or if the rows
            do not form a regular 2-D array.
    """
    rows = []
    with open(embedding_file, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                # A blank (e.g. trailing) line would otherwise add an empty
                # row and make the resulting array ragged.
                continue
            rows.append([float(x) for x in values])
    # FAISS operates on float32 vectors; building the array in that dtype
    # also halves the memory needed compared to the float64 default.
    embeddings = np.array(rows, dtype=np.float32)
    embeddings = normalize(embeddings)  # unit L2 norm per row
    return np.ascontiguousarray(embeddings)
|
20 |
+
|
21 |
+
|
22 |
+
def search_similar_books(query, index, model, df, k):
    """Render the ``k`` books whose annotations are closest to ``query``.

    The query is embedded with ``model``, L2-normalised and looked up in
    ``index``. Hits are written to the Streamlit page from most to least
    similar, each with a linked title, cover image, annotation and score.

    Args:
        query: Free-text search string.
        index: FAISS index searched with the query embedding. The score
            conversion below assumes its vectors are L2-normalised, as
            ``load_embeddings_from_file`` produces them.
        model: Encoder exposing ``encode(list_of_texts)``.
        df: DataFrame addressed positionally by the FAISS result indices;
            must provide the columns ``annotation``, ``title``,
            ``image_url`` and ``page_url``.
        k: Number of nearest neighbours to request.

    Returns:
        None. All output is written through ``st``.
    """
    query_embedding = model.encode([query])[0]
    query_embedding = np.array(query_embedding, dtype=np.float32).reshape(1, -1)
    faiss.normalize_L2(query_embedding)

    scores, indices = index.search(query_embedding, k)
    # FAISS already returns hits best-first (ascending L2 distance or
    # descending inner product), so the order must not be reversed.
    scores, indices = scores[0], indices[0]

    if getattr(index, "metric_type", None) == faiss.METRIC_L2:
        # An L2 index reports squared Euclidean distances. For unit vectors
        # d^2 = 2 - 2*cos, so convert to the cosine similarity that the
        # label below promises.
        scores = 1.0 - scores / 2.0

    st.write("Результаты поиска для запроса '{}':".format(query))
    rank = 0
    for annotation_index, similarity_score in zip(indices, scores):
        if annotation_index < 0:
            # FAISS pads with -1 when fewer than k vectors are available;
            # df.iloc[-1] would silently show the last row instead.
            continue
        rank += 1
        row = df.iloc[annotation_index]
        annotation = row['annotation']
        book_title = row['title']
        image_url = row['image_url']
        page_url = row['page_url']

        st.write("{}. Название книги: [{}]({})".format(rank, book_title, page_url))
        st.image(image_url, caption='Обложка книги {}'.format(book_title))
        st.write(" Аннотация: {}".format(annotation))
        st.write(" Косинусное сходство: {:.4f}".format(similarity_score))
|
40 |
+
|
41 |
+
|
42 |
+
# Build the search resources and wire up the page widgets.
embedding_file = "annotation_embeddings.txt"
model = SentenceTransformer('msmarco-distilbert-base-v4')
embeddings = load_embeddings_from_file(embedding_file)
# FAISS expects C-contiguous float32 input.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
# The stored vectors are L2-normalised, so the inner product equals cosine
# similarity. IndexFlatL2 would return squared Euclidean distances instead.
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

user_query = st.text_input("Введите ваш запрос:")
num_books = st.slider("Выберите количество книг для рекомендации", 1, 10, 5)
if st.button('Подобрать'):
    search_similar_books(user_query, index, model, df, k=num_books)
|
project.ipynb
CHANGED
@@ -421,12 +421,41 @@
|
|
421 |
},
|
422 |
{
|
423 |
"cell_type": "code",
|
424 |
-
"execution_count":
|
425 |
"metadata": {},
|
426 |
"outputs": [],
|
427 |
"source": [
|
428 |
"import pandas as pd\n",
|
429 |
-
"df = pd.read_csv('
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
430 |
]
|
431 |
},
|
432 |
{
|
|
|
421 |
},
|
422 |
{
|
423 |
"cell_type": "code",
|
424 |
+
"execution_count": 1,
|
425 |
"metadata": {},
|
426 |
"outputs": [],
|
427 |
"source": [
|
428 |
"import pandas as pd\n",
|
429 |
+
"df = pd.read_csv('data_sent.csv')"
|
430 |
+
]
|
431 |
+
},
|
432 |
+
{
|
433 |
+
"cell_type": "code",
|
434 |
+
"execution_count": 2,
|
435 |
+
"metadata": {},
|
436 |
+
"outputs": [
|
437 |
+
{
|
438 |
+
"name": "stdout",
|
439 |
+
"output_type": "stream",
|
440 |
+
"text": [
|
441 |
+
"<class 'pandas.core.frame.DataFrame'>\n",
|
442 |
+
"RangeIndex: 17774 entries, 0 to 17773\n",
|
443 |
+
"Data columns (total 6 columns):\n",
|
444 |
+
" # Column Non-Null Count Dtype \n",
|
445 |
+
"--- ------ -------------- ----- \n",
|
446 |
+
" 0 page_url 17774 non-null object\n",
|
447 |
+
" 1 image_url 17774 non-null object\n",
|
448 |
+
" 2 author 17774 non-null object\n",
|
449 |
+
" 3 title 17774 non-null object\n",
|
450 |
+
" 4 annotation 17774 non-null object\n",
|
451 |
+
" 5 category 17774 non-null object\n",
|
452 |
+
"dtypes: object(6)\n",
|
453 |
+
"memory usage: 833.3+ KB\n"
|
454 |
+
]
|
455 |
+
}
|
456 |
+
],
|
457 |
+
"source": [
|
458 |
+
"df.info()"
|
459 |
]
|
460 |
},
|
461 |
{
|