valeriedaash committed
Commit a4cd013 · 1 Parent(s): 2d3924f

sentence updated

.gitattributes CHANGED
@@ -38,3 +38,4 @@ data_final.csv filter=lfs diff=lfs merge=lfs -text
 data_prev.csv filter=lfs diff=lfs merge=lfs -text
 vectors.txt filter=lfs diff=lfs merge=lfs -text
 annotation_embeddings.txt filter=lfs diff=lfs merge=lfs -text
+data_sent.csv filter=lfs diff=lfs merge=lfs -text
annotation_embeddings.txt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c4e37ceafa4cd0505acb8549a022a00a5ab4554ca9a28d7b22daa5e4dc2faead
-size 298357487
+oid sha256:cb639307c4b84cd1f790e3bc872cbd78c7248df04deeb73bbe2969206170f406
+size 153369022
data_prev.csv → data_sent.csv RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:aeb6a01f238ec78db9bf76ec1a12356eaececd9162bd38d91d838934b9a908aa
-size 38974957
+oid sha256:7212f78b7f7cbdb168779a05e4ee7e522093f00fb8994d9d4689b2e11cc73f77
+size 30694340
pages/sentence.py CHANGED
@@ -0,0 +1,53 @@
+import streamlit as st
+import pandas as pd
+import numpy as np
+from sentence_transformers import SentenceTransformer
+import faiss
+from sklearn.preprocessing import normalize
+
+st.markdown('<div style="text-align: center; font-size: 24px;">Умный поиск книг с использованием SentenceTransformer (msmarco-distilbert-base-v4)</div>', unsafe_allow_html=True)
+
+df = pd.read_csv("data_final.csv")
+
+
+def load_embeddings_from_file(embedding_file):
+    # Read one whitespace-separated embedding vector per line of the text file
+    embeddings = []
+    with open(embedding_file, "r") as f:
+        for line in f:
+            embedding = [float(x) for x in line.strip().split()]
+            embeddings.append(embedding)
+    embeddings = np.array(embeddings, dtype=np.float32)  # faiss expects float32
+    embeddings = normalize(embeddings)  # L2-normalize so inner product equals cosine similarity
+    return embeddings
+
+
+def search_similar_books(query, index, model, df, k):
+    # Encode and normalize the query, then fetch the k most similar annotations (best first)
+    query_embedding = model.encode([query])[0]
+    query_embedding = np.array(query_embedding, dtype=np.float32).reshape(1, -1)
+    faiss.normalize_L2(query_embedding)
+    similarities, indices = index.search(query_embedding, k)
+    st.write("Результаты поиска для запроса '{}':".format(query))
+    for i, (annotation_index, similarity_score) in enumerate(zip(indices[0], similarities[0])):
+        annotation = df.iloc[annotation_index]['annotation']
+        book_title = df.iloc[annotation_index]['title']
+        image_url = df.iloc[annotation_index]['image_url']
+        page_url = df.iloc[annotation_index]['page_url']
+
+        st.write("{}. Название книги: [{}]({})".format(i + 1, book_title, page_url))
+        st.image(image_url, caption='Обложка книги {}'.format(book_title))
+        st.write(" Аннотация: {}".format(annotation))
+        st.write(" Косинусное сходство: {:.4f}".format(similarity_score))
+
+
+embedding_file = "annotation_embeddings.txt"
+model = SentenceTransformer('msmarco-distilbert-base-v4')
+embeddings = load_embeddings_from_file(embedding_file)
+index = faiss.IndexFlatIP(embeddings.shape[1])  # inner product on normalized vectors = cosine similarity
+index.add(embeddings)
+
+user_query = st.text_input("Введите ваш запрос:")
+num_books = st.slider("Выберите количество книг для рекомендации", 1, 10, 5)
+if st.button('Подобрать'):
+    search_similar_books(user_query, index, model, df, k=num_books)
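Note: pages/sentence.py expects annotation_embeddings.txt to contain one whitespace-separated vector per line, row-aligned with data_final.csv. The commit does not include the script that produced that file; the following is only a minimal sketch of how it could be regenerated with the same model (the file names and the show_progress_bar flag are assumptions, not part of this commit).

# Hypothetical helper, not part of this commit: encode every annotation with the
# same SentenceTransformer model and write one whitespace-separated vector per line,
# preserving the row order of data_final.csv so row i in the file matches df.iloc[i].
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

df = pd.read_csv("data_final.csv")
model = SentenceTransformer("msmarco-distilbert-base-v4")
vectors = model.encode(df["annotation"].tolist(), show_progress_bar=True)
np.savetxt("annotation_embeddings.txt", np.asarray(vectors))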
project.ipynb CHANGED
@@ -421,12 +421,41 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
     "import pandas as pd\n",
-    "df = pd.read_csv('data_final.csv')"
+    "df = pd.read_csv('data_sent.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 17774 entries, 0 to 17773\n",
+      "Data columns (total 6 columns):\n",
+      " #   Column      Non-Null Count  Dtype \n",
+      "---  ------      --------------  ----- \n",
+      " 0   page_url    17774 non-null  object\n",
+      " 1   image_url   17774 non-null  object\n",
+      " 2   author      17774 non-null  object\n",
+      " 3   title       17774 non-null  object\n",
+      " 4   annotation  17774 non-null  object\n",
+      " 5   category    17774 non-null  object\n",
+      "dtypes: object(6)\n",
+      "memory usage: 833.3+ KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "df.info()"
    ]
   },
   {
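Note: search_similar_books resolves every FAISS hit through df.iloc[row] on data_final.csv, so annotation_embeddings.txt must stay row-aligned with that CSV even as this commit swaps in new embeddings and renames data_prev.csv to data_sent.csv. A small sanity check along those lines (a sketch only; it assumes the files sit next to the app) is:

# Hypothetical check, not part of this commit: the number of embedding lines must
# equal the number of metadata rows, otherwise the row-index lookup in the app breaks.
import pandas as pd

df = pd.read_csv("data_final.csv")
with open("annotation_embeddings.txt") as f:
    n_embeddings = sum(1 for _ in f)
assert n_embeddings == len(df), f"{n_embeddings} embeddings vs {len(df)} metadata rows"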