Spaces:

MARI-posa
/

FindMyBook

Runtime error

App Files Community

MARI-posa committed on Jun 15, 2023

Commit

e79c988

•

1 Parent(s): 8e3efa7

Update stri.py

Browse files

Files changed (1) hide show

stri.py +22 -25

stri.py CHANGED Viewed

@@ -22,55 +22,52 @@ books.drop_duplicates(subset='title', keep='first', inplace=True)
 books.reset_index(drop=True)
 def data_preprocessing(text: str) -> str:
-    text = re.sub(r'http\S+', " ", text) # удаляем ссылки
-    text = re.sub(r'@\w+',' ',text) # удаляем упоминания пользователей
-    text = re.sub(r'#\w+', ' ', text) # удаляем хэштеги
-#     text = re.sub(r'\d+', ' ', text) # удаляем числа
-#     text = text.translate(str.maketrans('', '', string.punctuation))
-    text = re.sub(r'<.*?>',' ', text) # html tags
-    return
-for i in ['author',	'title',	'annotation']:
     books[i] = books[i].apply(data_preprocessing)
 annot = books['annotation']
 # Получение эмбеддингов аннотаций каждой книги в датасете
 max_len = 128
-token_annot = annot.apply(lambda x: tokenizer.encode(x, add_special_tokens=True, \
-                                                   truncation=True, max_length=max_len))
-padded = np.array([i + [0]*(max_len-len(i)) for i in token_annot.values])  # заполним недостающую длину нулями
-attention_mask = np.where(padded != 0, 1, 0)   # создадим маску, отметим где есть значения а где пустота
 # Переведем numpy массивы в тензоры PyTorch
 input_ids = torch.tensor(padded, dtype=torch.long)
 attention_mask = torch.tensor(attention_mask, dtype=torch.long)
 book_embeddings = []
 for inputs, attention_masks in zip(input_ids, attention_mask):
-    with torch.inference_mode():
         book_embedding = model(inputs.unsqueeze(0), attention_mask=attention_masks.unsqueeze(0))
-        book_embedding = book_embedding[0][:,0,:] #.detach().cpu().numpy()
         book_embeddings.append(np.squeeze(book_embedding))
 # Определение запроса пользователя
 query = st.text_input("Введите запрос")
-query_tokens = tokenizer.encode(query, add_special_tokens=True, \
-                                                   truncation=True, max_length=max_len)
-query_padded = np.array(query_tokens + [0]*(max_len-len(query_tokens)))
 query_mask = np.where(query_padded != 0, 1, 0)
 # Переведем numpy массивы в тензоры PyTorch
 query_padded = torch.tensor(query_padded, dtype=torch.long)
 query_mask = torch.tensor(query_mask, dtype=torch.long)
-with torch.inference_mode():
-    query_embedding = model(query_padded.unsqueeze(0), query_mask.unsqueeze(0))      #[0].squeeze()
-    query_embedding = query_embedding[0][:,0,:] #.detach().cpu().numpy()
 # Вычисление косинусного расстояния между эмбеддингом запроса и каждой аннотацией
 cosine_similarities = torch.nn.functional.cosine_similarity(
@@ -80,7 +77,7 @@ cosine_similarities = torch.nn.functional.cosine_similarity(
 cosine_similarities = cosine_similarities.numpy()
-indices = np.argsort(cosine_similarities)[::-1] # Сортировка по убыванию
 for i in indices[:10]:
-    st.write(books['title'][i])

 books.reset_index(drop=True)
 def data_preprocessing(text: str) -> str:
+    text = re.sub(r'http\S+', " ", text)  # удаляем ссылки
+    text = re.sub(r'@\w+', ' ', text)  # удаляем упоминания пользователей
+    text = re.sub(r'#\w+', ' ', text)  # удаляем хэштеги
+    text = re.sub(r'<.*?>', ' ', text)  # html tags
+    return text
+for i in ['author', 'title', 'annotation']:
     books[i] = books[i].apply(data_preprocessing)
 annot = books['annotation']
 # Получение эмбеддингов аннотаций каждой книги в датасете
 max_len = 128
+token_annot = annot.apply(lambda x: tokenizer.encode(x, add_special_tokens=True,
+                                                     truncation=True, max_length=max_len))
+padded = np.array([i + [0] * (max_len - len(i)) for i in token_annot.values])  # заполним недостающую длину нулями
+attention_mask = np.where(padded != 0, 1, 0)  # создадим маску, отметим где есть значения а где пустота
 # Переведем numpy массивы в тензоры PyTorch
 input_ids = torch.tensor(padded, dtype=torch.long)
 attention_mask = torch.tensor(attention_mask, dtype=torch.long)
 book_embeddings = []
 for inputs, attention_masks in zip(input_ids, attention_mask):
+    with torch.no_grad():
         book_embedding = model(inputs.unsqueeze(0), attention_mask=attention_masks.unsqueeze(0))
+        book_embedding = book_embedding[0][:, 0, :].detach().cpu().numpy()
         book_embeddings.append(np.squeeze(book_embedding))
 # Определение запроса пользователя
 query = st.text_input("Введите запрос")
+query_tokens = tokenizer.encode(query, add_special_tokens=True,
+                                truncation=True, max_length=max_len)
+query_padded = np.array(query_tokens + [0] * (max_len - len(query_tokens)))
 query_mask = np.where(query_padded != 0, 1, 0)
 # Переведем numpy массивы в тензоры PyTorch
 query_padded = torch.tensor(query_padded, dtype=torch.long)
 query_mask = torch.tensor(query_mask, dtype=torch.long)
+with torch.no_grad():
+    query_embedding = model(query_padded.unsqueeze(0), query_mask.unsqueeze(0))
+    query_embedding = query_embedding[0][:, 0, :].detach().cpu().numpy()
 # Вычисление косинусного расстояния между эмбеддингом запроса и каждой аннотацией
 cosine_similarities = torch.nn.functional.cosine_similarity(
 cosine_similarities = cosine_similarities.numpy()
+indices = np.argsort(cosine_similarities)[::-1]  # Сортировка по убыванию
 for i in indices[:10]:
+    st.write(books['title'][i])