Spaces:

valeriedaash
/

find_my_book

Sleeping

App Files Files Community

valeriedaash committed on Mar 15, 2024

Commit

a4cd013

1 Parent(s): 2d3924f

sentence updated

Browse files

Files changed (5) hide show

.gitattributes +1 -0
annotation_embeddings.txt +2 -2
data_prev.csv → data_sent.csv +2 -2
pages/sentence.py +50 -0
project.ipynb +31 -2

.gitattributes CHANGED Viewed

@@ -38,3 +38,4 @@ data_final.csv filter=lfs diff=lfs merge=lfs -text
 data_prev.csv filter=lfs diff=lfs merge=lfs -text
 vectors.txt filter=lfs diff=lfs merge=lfs -text
 annotation_embeddings.txt filter=lfs diff=lfs merge=lfs -text

 data_prev.csv filter=lfs diff=lfs merge=lfs -text
 vectors.txt filter=lfs diff=lfs merge=lfs -text
 annotation_embeddings.txt filter=lfs diff=lfs merge=lfs -text
+data_sent.csv filter=lfs diff=lfs merge=lfs -text

annotation_embeddings.txt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c4e37ceafa4cd0505acb8549a022a00a5ab4554ca9a28d7b22daa5e4dc2faead
-size 298357487

 version https://git-lfs.github.com/spec/v1
+oid sha256:cb639307c4b84cd1f790e3bc872cbd78c7248df04deeb73bbe2969206170f406
+size 153369022

data_prev.csv → data_sent.csv RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:aeb6a01f238ec78db9bf76ec1a12356eaececd9162bd38d91d838934b9a908aa
-size 38974957

 version https://git-lfs.github.com/spec/v1
+oid sha256:7212f78b7f7cbdb168779a05e4ee7e522093f00fb8994d9d4689b2e11cc73f77
+size 30694340

pages/sentence.py CHANGED Viewed

	@@ -0,0 +1,50 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+from sentence_transformers import SentenceTransformer
+import faiss
+from sklearn.preprocessing import normalize
+# Page header, rendered as centered raw HTML (hence unsafe_allow_html=True).
+st.markdown('<div style="text-align: center; font-size: 24px;">Умный поиск книг с использованием SentenceTransformer (msmarco-distilbert-base-v4)</div>', unsafe_allow_html=True)
+# Book metadata table; search_similar_books looks rows up by position (df.iloc),
+# so row i here must describe the same book as line i of the embeddings file.
+# NOTE(review): this commit renames data_prev.csv to data_sent.csv and regenerates
+# annotation_embeddings.txt, while this page still reads data_final.csv - confirm
+# that the rows of this CSV are still aligned with the new embeddings.
+df = pd.read_csv("data_final.csv")
+def load_embeddings_from_file(embedding_file):
+    embeddings = []
+    with open(embedding_file, "r") as f:
+        for line in f:
+            embedding = [float(x) for x in line.strip().split()]
+            embeddings.append(embedding)
+    embeddings = np.array(embeddings)
+    embeddings = normalize(embeddings)  # Нормализация эмбеддингов
+    return embeddings
+def search_similar_books(query, index, model, df, k):
+    query_embedding = model.encode([query])[0]
+    query_embedding = np.array(query_embedding, dtype=np.float32).reshape(1, -1)
+    faiss.normalize_L2(query_embedding)
+    distances, indices = index.search(query_embedding, k)
+    sorted_indices = indices[0][::-1]
+    sorted_distances = distances[0][::-1]
+    st.write("Результаты поиска для запроса '{}':".format(query))
+    for i, (annotation_index, similarity_score) in enumerate(zip(sorted_indices, sorted_distances)):
+        annotation = df.iloc[annotation_index]['annotation']
+        book_title = df.iloc[annotation_index]['title']
+        image_url = df.iloc[annotation_index]['image_url']
+        page_url = df.iloc[annotation_index]['page_url']
+        st.write("{}. Название книги: [{}]({})".format(i + 1, book_title, page_url))
+        st.image(image_url, caption='Обложка книги {}'.format(book_title))
+        st.write("   Аннотация: {}".format(annotation))
+        st.write("   Косинусное сходство: {:.4f}".format(similarity_score))
+embedding_file = "annotation_embeddings.txt"
+model = SentenceTransformer('msmarco-distilbert-base-v4')
+embeddings = load_embeddings_from_file(embedding_file)
+index = faiss.IndexFlatL2(embeddings.shape[1])  # Используем индекс для косинусного сходства
+index.add(embeddings)
+user_query = st.text_input("Введите ваш запрос:")
+num_books = st.slider("Выберите количество книг для рекомендации", 1, 10, 5)
+if st.button('Подобрать'):
+    search_similar_books(user_query, index, model, df, k=num_books)

project.ipynb CHANGED Viewed

@@ -421,12 +421,41 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
     "import pandas as pd\n",
-    "df = pd.read_csv('data_final.csv')"
    ]
   },
   {

   },
   {
    "cell_type": "code",
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
     "import pandas as pd\n",
+    "df = pd.read_csv('data_sent.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 17774 entries, 0 to 17773\n",
+      "Data columns (total 6 columns):\n",
+      " #   Column      Non-Null Count  Dtype \n",
+      "---  ------      --------------  ----- \n",
+      " 0   page_url    17774 non-null  object\n",
+      " 1   image_url   17774 non-null  object\n",
+      " 2   author      17774 non-null  object\n",
+      " 3   title       17774 non-null  object\n",
+      " 4   annotation  17774 non-null  object\n",
+      " 5   category    17774 non-null  object\n",
+      "dtypes: object(6)\n",
+      "memory usage: 833.3+ KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "df.info()"
    ]
   },
   {