kaisugi committed
Commit b6363d9 · 1 Parent(s): 05f1914
Files changed (1):
  1. app.py +30 -36
app.py CHANGED
@@ -1,9 +1,9 @@
+from transformers import AutoModel, AutoTokenizer
 import faiss
 import numpy as np
 import pandas as pd
 import streamlit as st
 import torch
-from transformers import AutoModel, AutoTokenizer
 
 import os
 
@@ -34,38 +34,7 @@ def load_sentence_embeddings():
     return sentence_embeddings
 
 
-@st.cache(allow_output_mutation=True)
-def build_faiss_index(sentence_emeddings):
-    D = 768
-    N = 789188
-    Xt = sentence_emeddings[:39000]
-    X = sentence_emeddings
-
-    # Param of PQ
-    M = 16  # The number of sub-vector. Typically this is 8, 16, 32, etc.
-    nbits = 8  # bits per sub-vector. This is typically 8, so that each sub-vec is encoded by 1 byte
-    # Param of IVF
-    nlist = 1000  # The number of cells (space partition). Typical value is sqrt(N)
-    # Param of HNSW
-    hnsw_m = 32  # The number of neighbors for HNSW. This is typically 32
-
-    # Setup
-    quantizer = faiss.IndexHNSWFlat(D, hnsw_m)
-    index = faiss.IndexIVFPQ(quantizer, D, nlist, M, nbits)
-
-    # Train
-    index.train(Xt)
-
-    # Add
-    index.add(X)
-
-    # Search
-    index.nprobe = 8  # Runtime param. The number of cells that are visited for search.
-
-    return index
-
-
-@st.cache(allow_output_mutation=True)
+@st.cache
 def get_retrieval_results(index, input_text, top_k, model, tokenizer, sentence_df):
     with torch.no_grad():
         inputs = tokenizer.encode_plus(
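Editor's note on the hunk above: the commit deletes the cached build_faiss_index helper (its body moves into __main__, see the last hunk) and drops allow_output_mutation=True from the decorator on get_retrieval_results. Under legacy st.cache, Streamlit content-hashes a cached function's arguments on every rerun, which is costly for a roughly 789,188 x 768 float32 embeddings matrix, and the SWIG-wrapped FAISS index is not reliably hashable at all. A minimal sketch of one alternative that keeps the build cached, assuming legacy Streamlit's hash_funcs parameter; this is an illustration, not what the commit does:

import faiss
import numpy as np
import streamlit as st

# Hypothetical alternative to inlining the build in __main__: cache it, but
# key the large embeddings array by object identity (hash_funcs={np.ndarray: id})
# so Streamlit skips content-hashing it, and set allow_output_mutation=True so
# the returned SWIG-wrapped index is not hashed on every rerun.
@st.cache(allow_output_mutation=True, hash_funcs={np.ndarray: id})
def build_faiss_index(sentence_embeddings):
    d = sentence_embeddings.shape[1]          # 768 for this model
    quantizer = faiss.IndexHNSWFlat(d, 32)    # HNSW graph as the coarse quantizer
    index = faiss.IndexIVFPQ(quantizer, d, 1000, 16, 8)
    index.train(sentence_embeddings[:39000])  # train IVF/PQ on a subsample
    index.add(sentence_embeddings)
    index.nprobe = 8                          # cells visited per query
    return index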
@@ -102,9 +71,34 @@ def main(model, tokenizer, sentence_df, index):
 if __name__ == "__main__":
     model, tokenizer = load_model_and_tokenizer()
     sentence_df = load_sentence_data()
-    sentence_emeddings = load_sentence_embeddings()
+    sentence_embeddings = load_sentence_embeddings()
+
+    faiss.normalize_L2(sentence_embeddings)
+
+    D = 768
+    N = 789188
+    Xt = sentence_embeddings[:39000]
+    X = sentence_embeddings
 
-    faiss.normalize_L2(sentence_emeddings)
-    index = build_faiss_index(sentence_emeddings)
+    # Param of PQ
+    M = 16  # The number of sub-vector. Typically this is 8, 16, 32, etc.
+    nbits = 8  # bits per sub-vector. This is typically 8, so that each sub-vec is encoded by 1 byte
+    # Param of IVF
+    nlist = 1000  # The number of cells (space partition). Typical value is sqrt(N)
+    # Param of HNSW
+    hnsw_m = 32  # The number of neighbors for HNSW. This is typically 32
+
+    # Setup
+    quantizer = faiss.IndexHNSWFlat(D, hnsw_m)
+    index = faiss.IndexIVFPQ(quantizer, D, nlist, M, nbits)
+
+    # Train
+    index.train(Xt)
+
+    # Add
+    index.add(X)
+
+    # Search
+    index.nprobe = 8  # Runtime param. The number of cells that are visited for search.
 
     main(model, tokenizer, sentence_df, index)
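For reference, the IVF-PQ-over-HNSW structure assembled in the last hunk can also be expressed with FAISS's index_factory string (not used by the commit). The preceding faiss.normalize_L2 matters because IndexIVFPQ here ranks by squared L2 distance, and on unit-norm vectors ||a - b||^2 = 2 - 2 cos(a, b), so L2 ranking coincides with cosine-similarity ranking. A sketch with random stand-in data:

import faiss
import numpy as np

d = 768
xb = np.random.rand(10000, d).astype("float32")  # stand-in for the real embeddings
faiss.normalize_L2(xb)  # in-place; unit-norm rows make L2 ranking match cosine ranking

# "IVF1000_HNSW32,PQ16" is the factory spelling of
# IndexIVFPQ(IndexHNSWFlat(d, 32), d, nlist=1000, M=16, nbits=8).
index = faiss.index_factory(d, "IVF1000_HNSW32,PQ16")
index.train(xb)  # the app trains on a 39,000-vector subsample instead
index.add(xb)
index.nprobe = 8

With PQ at M=16 and nbits=8, each vector is stored as 16 one-byte codes, so the 789,188 corpus vectors take roughly 12 MB of codes instead of about 2.3 GB of raw float32, at the cost of approximate distances.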
 
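The diff truncates get_retrieval_results at the tokenizer call. A hedged sketch of the query path such a function typically implements, assuming mean pooling over the last hidden state (app.py's actual pooling is not shown in this diff) and that sentence_df rows align with the order vectors were added to the index:

import faiss
import torch

def embed_query(input_text, model, tokenizer):
    # Assumed pooling; the real app.py may pool differently (e.g. the CLS token).
    with torch.no_grad():
        inputs = tokenizer.encode_plus(
            input_text, truncation=True, max_length=512, return_tensors="pt"
        )
        outputs = model(**inputs)
        emb = outputs.last_hidden_state.mean(dim=1).cpu().numpy().astype("float32")
    faiss.normalize_L2(emb)  # queries must be normalized the same way as the corpus
    return emb

query_vec = embed_query("some input text", model, tokenizer)
distances, ids = index.search(query_vec, top_k)  # both arrays have shape (1, top_k)
results = sentence_df.iloc[ids[0]]               # assumes row order matches add() order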