Spaces:

valeriedaash
/

find_my_book

Sleeping

App Files Files Community

valeriedaash committed on Mar 14, 2024

Commit

eb0c31c

1 Parent(s): f57ed01

app added

Browse files

Files changed (8) hide show

.gitattributes +3 -0
app.py +47 -13
data.csv +0 -0
data_final.csv +3 -0
data_prev.csv +3 -0
embeddings.txt +3 -0
pages/main.py +19 -0
project.ipynb +453 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+embeddings.txt filter=lfs diff=lfs merge=lfs -text
+data_final.csv filter=lfs diff=lfs merge=lfs -text
+data_prev.csv filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -1,19 +1,53 @@
 import streamlit as st
 import pandas as pd
-import random
-# Load your dataset
-# Replace 'your_dataset.csv' with the actual filename or path
-df = pd.read_csv('data.csv')
-# Function to display 10 random rows on button click
-def show_random_rows():
-    random_rows = df.sample(10)[['author', 'title']]
-    st.table(random_rows)
-# Streamlit app
-st.title('Book recommender app')
-# Button to trigger displaying random rows
-if st.button('Show some books'):
-    show_random_rows()

 import streamlit as st
 import pandas as pd
+import numpy as np
+import torch
+from transformers import AutoTokenizer, AutoModel
+from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity
+import faiss
+tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
+model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")
+df = pd.read_csv('data_final.csv')
+MAX_LEN = 300
+def embed_bert_cls(text, model, tokenizer):
+    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt', max_length=MAX_LEN)
+    with torch.no_grad():
+        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
+    embeddings = model_output.last_hidden_state[:, 0, :]
+    embeddings = torch.nn.functional.normalize(embeddings)
+    return embeddings[0].cpu().numpy()
+books_embs = np.loadtxt('embeddings.txt')
+index = faiss.IndexFlatIP(books_embs.shape[1])
+index.add(books_embs)
+st.title('Приложение для рекомендации книг')
+text = st.text_input('Введите запрос:')
+top_n = st.number_input('Введите количество рекомендаций:', min_value=1, max_value=50, value=1)
+recommend_button = st.button('Найти')
+if text and recommend_button:
+    query_emb = embed_bert_cls(text, model, tokenizer)
+    D, I = index.search(query_emb.reshape(1, -1), top_n)
+    st.subheader('Топ рекомендуемых книг:')
+    for i, j in zip(I[0], D[0]):
+        col_1, col_2 = st.columns([1, 3])
+        with col_1:
+            st.image(df['image_url'][i], use_column_width=True)
+            st.write(round(j* 100, 2))
+        with col_2:
+            st.write(f'Название книги: {df["title"][i]}')
+            st.write(f'Автор: {df["author"][i]}')
+            st.write(f'Ссылка: {df["page_url"][i]}')
+            st.write(f'Аннотация: {df["annotation"][i]}')

data.csv DELETED Viewed

The diff for this file is too large to render. See raw diff

data_final.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6656361b1e4b9e9a9200fe464ca227c0ce285af2150975072f48e548a0fbd1b7
+size 32055320

data_prev.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aeb6a01f238ec78db9bf76ec1a12356eaececd9162bd38d91d838934b9a908aa
+size 38974957

embeddings.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:631d7269f12874f12a6e7a732f431f75caeb00c16421234c5b07b7a814be113b
+size 110210970

pages/main.py ADDED Viewed

	@@ -0,0 +1,19 @@

+import streamlit as st
+import pandas as pd
+import random
+# Load your dataset
+# Replace 'your_dataset.csv' with the actual filename or path
+df = pd.read_csv('data.csv')
+# Function to display 10 random rows on button click
+def show_random_rows():
+    random_rows = df.sample(10)[['author', 'title']]
+    st.table(random_rows)
+# Streamlit app
+st.title('Book recommender app')
+# Button to trigger displaying random rows
+if st.button('Show some books'):
+    show_random_rows()

project.ipynb ADDED Viewed

	@@ -0,0 +1,453 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df0 = pd.read_csv('data_prev.csv')\n",
+    "df1 = pd.read_csv('data7.csv')\n",
+    "df2 = pd.read_csv('data6.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.concat([df0, df1, df2], ignore_index=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "page_url      0\n",
+       "image_url     0\n",
+       "author        2\n",
+       "title         0\n",
+       "annotation    0\n",
+       "category      0\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 65,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.isna().sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.dropna(subset=['annotation', 'author'], inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Index: 24385 entries, 0 to 24386\n",
+      "Data columns (total 6 columns):\n",
+      " #   Column      Non-Null Count  Dtype \n",
+      "---  ------      --------------  ----- \n",
+      " 0   page_url    24385 non-null  object\n",
+      " 1   image_url   24385 non-null  object\n",
+      " 2   author      24385 non-null  object\n",
+      " 3   title       24385 non-null  object\n",
+      " 4   annotation  24385 non-null  object\n",
+      " 5   category    24385 non-null  object\n",
+      "dtypes: object(6)\n",
+      "memory usage: 1.3+ MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.drop_duplicates(subset=['author', 'title'], inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Index: 24235 entries, 0 to 24385\n",
+      "Data columns (total 6 columns):\n",
+      " #   Column      Non-Null Count  Dtype \n",
+      "---  ------      --------------  ----- \n",
+      " 0   page_url    24235 non-null  object\n",
+      " 1   image_url   24235 non-null  object\n",
+      " 2   author      24235 non-null  object\n",
+      " 3   title       24235 non-null  object\n",
+      " 4   annotation  24235 non-null  object\n",
+      " 5   category    24235 non-null  object\n",
+      "dtypes: object(6)\n",
+      "memory usage: 1.3+ MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 71,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0                                           Евгений Онегин\n",
+       "1        Мастер и Маргарита (с иллюстрациями балета Сер...\n",
+       "2                            Сто лет одиночества : [роман]\n",
+       "3                     О дивный новый мир (замена картинки)\n",
+       "4                            Дюна: [фантастический роман].\n",
+       "                               ...                        \n",
+       "24381                                         Игра в бисер\n",
+       "24382                                    Ромео и Джульетта\n",
+       "24383                                Опасные связи : роман\n",
+       "24384                                Тартарен из Тараскона\n",
+       "24385                                 Станция на горизонте\n",
+       "Name: title, Length: 24235, dtype: object"
+      ]
+     },
+     "execution_count": 71,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['title']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def has_partial_start(row, col_name):\n",
+    "    name = row[col_name]\n",
+    "    for other_name in df[col_name]:\n",
+    "        if other_name != name and other_name.startswith(name):\n",
+    "            return True\n",
+    "    return False\n",
+    "\n",
+    "# def filter_partial_start(row, col_name):\n",
+    "#     name = row[col_name]\n",
+    "#     for other_name in df[col_name]:\n",
+    "#         if other_name != name and other_name.startswith(name):\n",
+    "#             return False\n",
+    "#     return True\n",
+    "\n",
+    "# Применение функции для удаления строк с частичным совпадением в начале\n",
+    "filt_df = df[~df.apply(lambda x: has_partial_start(x, 'title'), axis=1)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 80,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.set_option('display.max_rows', None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "category\n",
+       "Социология                                                                                                                                            3802\n",
+       "Фантастика. Фэнтези                                                                                                                                   2731\n",
+       "Классическая и современная проза                                                                                                                      2265\n",
+       "Детективы                                                                                                                                             1834\n",
+       "Эзотерическая литература                                                                                                                              1133\n",
+       "Клиническая медицина. Внутренние болезни                                                                                                               640\n",
+       "Популярная психология                                                                                                                                  603\n",
+       "Бизнес. Торговля                                                                                                                                       574\n",
+       "Фольклор                                                                                                                                               542\n",
+       "Философия                                                                                                                                              520\n",
+       "Филология                                                                                                                                              485\n",
+       "Испанский, португальский, итальянский языки                                                                                                            465\n",
+       "Религия                                                                                                                                                461\n",
+       "Маркетинг                                                                                                                                              441\n",
+       "Публицистика. Биографии. Мемуары                                                                                                                       440\n",
+       "Науки о Земле                                                                                                                                          330\n",
+       "Биологические науки                                                                                                                                    322\n",
+       "прочие языки                                                                                                                                           322\n",
+       "Книги по программированию                                                                                                                              319\n",
+       "Политика. Партии и движения                                                                                                                            277\n",
+       "Любовные романы. Книги о любви                                                                                                                         275\n",
+       "Исторические романы                                                                                                                                    267\n",
+       "Поэзия. Драматургия                                                                                                                                    245\n",
+       "Естественные науки в целом.  Науковедение                                                                                                              219\n",
+       "Народная медицина. Нетрадиционные методы лечения                                                                                                       200\n",
+       "Математика                                                                                                                                             196\n",
+       "Автоматика. Радиоэлектроника. Связь                                                                                                                    188\n",
+       "Энергетика. Машиностроение. Приборостроение                                                                                                            179\n",
+       "Физика                                                                                                                                                 147\n",
+       "Транспорт                                                                                                                                              138\n",
+       "Военная наука.  Военное дело                                                                                                                           133\n",
+       "Детская психология                                                                                                                                     116\n",
+       "Приключения                                                                                                                                            110\n",
+       "Защита информации.  Компьютерная безопасность                                                                                                          104\n",
+       "Общая патология. Общая терапия                                                                                                                         103\n",
+       "Домашние животные                                                                                                                                      101\n",
+       "Хирургия. Онкология. Прикладные отрасли медицины                                                                                                       101\n",
+       "Здравоохранение. Гигиена. Эпидемиология                                                                                                                 98\n",
+       "Химические науки                                                                                                                                        97\n",
+       "Прикладное программное обеспечение                                                                                                                      96\n",
+       "Педиатрия                                                                                                                                               90\n",
+       "Астрономия                                                                                                                                              87\n",
+       "Строительство                                                                                                                                           76\n",
+       "Горная промышленность. Металлургия                                                                                                                      73\n",
+       "Психологические школы и направления. Психоанализ                                                                                                        72\n",
+       "Ветеринария                                                                                                                                             71\n",
+       "Курортология. Физиотерапия. Лечебная физкультура                                                                                                        66\n",
+       "Химическая промышленность. Пищевая промышленность.  Легкая промышленность. Деревообрабатывающая и   целлюлозно-бумажная промышленность. Полиграфия      59\n",
+       "Экология                                                                                                                                                58\n",
+       "Психология общения                                                                                                                                      55\n",
+       "Базы и банки данных. СУБД                                                                                                                               39\n",
+       "Общая психология. История психологии                                                                                                                    37\n",
+       "Технические науки и промышленность в целом                                                                                                              37\n",
+       "Операционные системы и программы-оболочки                                                                                                               33\n",
+       "Отдельные философские учения                                                                                                                            30\n",
+       "Компьютерные сети. Интернет                                                                                                                             29\n",
+       "Механика                                                                                                                                                27\n",
+       "Фармакология. Лекарствоведение. Токсикология                                                                                                            24\n",
+       "Прикладная психология. Соционика. Педагогическая психология                                                                                             11\n",
+       "Статистика. Демография                                                                                                                                   9\n",
+       "Мобильные устройства                                                                                                                                     6\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 81,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "filt_df['category'].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "Index: 1627 entries, 0 to 24382\n",
+      "Data columns (total 6 columns):\n",
+      " #   Column      Non-Null Count  Dtype \n",
+      "---  ------      --------------  ----- \n",
+      " 0   page_url    1627 non-null   object\n",
+      " 1   image_url   1627 non-null   object\n",
+      " 2   author      1627 non-null   object\n",
+      " 3   title       1627 non-null   object\n",
+      " 4   annotation  1627 non-null   object\n",
+      " 5   category    1627 non-null   object\n",
+      "dtypes: object(6)\n",
+      "memory usage: 89.0+ KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0                  Евгений Онегин\n",
+       "8                            Дюна\n",
+       "9        Малое собрание сочинений\n",
+       "13             Мастер и Маргарита\n",
+       "22                    Бедная Лиза\n",
+       "                   ...           \n",
+       "24339             Остров сокровищ\n",
+       "24348        Дети капитана Гранта\n",
+       "24379                  Милый друг\n",
+       "24380                     Саломея\n",
+       "24382           Ромео и Джульетта\n",
+       "Name: title, Length: 1627, dtype: object"
+      ]
+     },
+     "execution_count": 62,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['title']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Фильтрация категорий\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "category_counts = df['category'].value_counts()\n",
+    "\n",
+    "# Фильтрация категорий с количеством значений больше 500\n",
+    "filtered_categories = category_counts[category_counts > 500].index\n",
+    "\n",
+    "# Фильтрация DataFrame по отфильтрованным категориям\n",
+    "filtered_df = df[df['category'].isin(filtered_categories)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "category\n",
+       "Социология                                  3954\n",
+       "Фантастика. Фэнтези                         2916\n",
+       "Классическая и современная проза            2716\n",
+       "Детективы                                   1979\n",
+       "Эзотерическая литература                    1175\n",
+       "Клиническая медицина. Внутренние болезни     647\n",
+       "Популярная психология                        629\n",
+       "Фольклор                                     613\n",
+       "Философия                                    598\n",
+       "Бизнес. Торговля                             588\n",
+       "Филология                                    503\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 36,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "filtered_df['category'].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "categories_list = ['Классическая и современная проза', 'Детективы', 'Фантастика. Фэнтези', 'Любовные романы. Книги о любви', 'Поэзия. Драматургия', 'Общая патология. Общая терапия', 'Народная медицина. Нетрадиционные методы лечения', 'Клиническая медицина. Внутренние болезни', 'Ветеринария', 'Домашние животные', 'Публицистика. Биографии. Мемуары', 'Эзотерическая литература', 'Философия', 'Религия', 'Социология', 'Фольклор', 'Бизнес. Торговля', 'Маркетинг', 'Книги по программированию', 'Политика. Партии и движения', 'Филология', 'Математика', 'Физика', 'Астрономия', 'Науки о Земле', 'Экология', 'Биологические науки', 'Популярная психология', 'Психологические школы и направления. Психоанализ', 'Детская психология', 'Приключения', 'Исторические романы']\n",
+    "filtered_df = filt_df[filt_df['category'].isin(categories_list)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "filtered_df.to_csv('data_final.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# df.to_csv('data_final.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# filtered_df.to_csv('filtered_data_without_dubs.csv', index=False) "
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "faiss_env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}