"""Streamlit app: semantic book search over precomputed annotation embeddings.

The query is encoded with a SentenceTransformer model and matched against
annotation embeddings stored in a FAISS inner-product index. All vectors are
L2-normalized, so the inner product returned by FAISS is the cosine
similarity.
"""

import streamlit as st
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from sklearn.preprocessing import normalize

st.markdown(
    """
Умный поиск книг с использованием SentenceTransformer (msmarco-distilbert-base-v4)
""",
    unsafe_allow_html=True,
)

df = pd.read_csv("data_final.csv")


def load_embeddings_from_file(embedding_file):
    """Load whitespace-separated embedding vectors, one vector per line.

    Blank lines are skipped so they cannot produce a ragged array.

    Args:
        embedding_file: Path to a text file with one embedding per line,
            values separated by whitespace.

    Returns:
        A C-contiguous ``float32`` array of shape ``(n_vectors, dim)`` whose
        rows are L2-normalized (``float32`` is the dtype FAISS works with).

    Raises:
        ValueError: If the file contains no vectors, or a token is not a
            valid float.
    """
    embeddings = []
    with open(embedding_file, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                continue
            embeddings.append([float(x) for x in values])
    if not embeddings:
        raise ValueError(
            "No embeddings found in {!r}".format(embedding_file)
        )
    embeddings = np.array(embeddings, dtype=np.float32)
    # Unit-length rows make inner product equal to cosine similarity.
    embeddings = normalize(embeddings)
    return np.ascontiguousarray(embeddings, dtype=np.float32)


def search_similar_books(query, index, model, df, k):
    """Render the ``k`` books whose annotations best match ``query``.

    Args:
        query: Free-text search query.
        index: FAISS index over the L2-normalized annotation embeddings;
            expected to be an inner-product index so that the returned
            scores are cosine similarities, best match first.
        model: Object with an ``encode(list_of_texts)`` method returning one
            embedding per text (a SentenceTransformer in this script).
        df: DataFrame whose row positions match the index positions and that
            has ``annotation``, ``title``, ``image_url`` and ``page_url``
            columns.
        k: Number of results requested.

    Returns:
        None. Results are written to the Streamlit page.
    """
    query_embedding = model.encode([query])[0]
    query_embedding = np.array(query_embedding, dtype=np.float32).reshape(1, -1)
    faiss.normalize_L2(query_embedding)

    scores, indices = index.search(query_embedding, k)

    st.write("Результаты поиска для запроса '{}':".format(query))
    rank = 0
    for annotation_index, similarity_score in zip(indices[0], scores[0]):
        # FAISS pads with -1 when the index holds fewer than k vectors;
        # df.iloc[-1] would silently show the last book instead.
        if annotation_index < 0:
            continue
        rank += 1
        row = df.iloc[annotation_index]
        annotation = row['annotation']
        book_title = row['title']
        image_url = row['image_url']
        page_url = row['page_url']
        st.write("{}. Название книги: [{}]({})".format(rank, book_title, page_url))
        st.image(image_url, caption='Обложка книги {}'.format(book_title))
        st.write(" Аннотация: {}".format(annotation))
        st.write(" Косинусное сходство: {:.4f}".format(similarity_score))


# NOTE(review): Streamlit re-runs this script on every widget interaction, so
# the model, embeddings and index below are rebuilt each time. Consider
# Streamlit's resource caching if the deployed version supports it.
embedding_file = "annotation_embeddings.txt"
model = SentenceTransformer('msmarco-distilbert-base-v4')
embeddings = load_embeddings_from_file(embedding_file)

# Inner-product index: on unit vectors this ranks by cosine similarity.
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

user_query = st.text_input("Введите ваш запрос:")
num_books = st.slider("Выберите количество книг для рекомендации", 1, 10, 5)

if st.button('Подобрать'):
    if user_query.strip():
        search_similar_books(user_query, index, model, df, k=num_books)
    else:
        # An empty query has no meaningful embedding to search with.
        st.warning("Введите запрос для поиска.")