bhlewis committed on
Commit
c884348
·
verified ·
1 Parent(s): 1f521fb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -26
app.py CHANGED
@@ -6,7 +6,9 @@ import json
6
  from transformers import AutoTokenizer, AutoModel
7
  from sentence_transformers import SentenceTransformer, models
8
  from sklearn.feature_extraction.text import TfidfVectorizer
 
9
  import re
 
10
  import spacy
11
  import joblib
12
 
@@ -50,7 +52,9 @@ embeddings, patent_numbers, metadata, texts = load_data()
50
  try:
51
  tokenizer = AutoTokenizer.from_pretrained('anferico/bert-for-patents')
52
  bert_model = AutoModel.from_pretrained('anferico/bert-for-patents')
53
- model = SentenceTransformer(modules=[models.Transformer(model_name='anferico/bert-for-patents'), models.Pooling(bert_model.config.hidden_size)])
 
 
54
  except Exception as e:
55
  print(f"Error loading anferico/bert-for-patents: {e}")
56
  print("Falling back to a general-purpose model.")
@@ -99,28 +103,4 @@ def hybrid_search(query, top_k=5):
99
  query_embedding = query_embedding / np.linalg.norm(query_embedding)
100
 
101
  # Perform semantic similarity search
102
- semantic_distances, semantic_indices = index.search(np.array([query_embedding]), top_k)
103
-
104
- results = []
105
- for i in range(top_k):
106
- patent_number = patent_numbers[semantic_indices[0][i]]
107
- patent_data = metadata[patent_number]
108
- patent_features = extract_key_features(patent_data['text'])
109
- common_features, similarity_score = compare_features(query_features, patent_features)
110
-
111
- results.append({
112
- 'patent_number': patent_number,
113
- 'common_features': common_features,
114
- 'similarity_score': similarity_score,
115
- 'semantic_score': semantic_distances[0][i]
116
- })
117
-
118
- return results
119
-
120
- iface = gr.Interface(
121
- fn=hybrid_search,
122
- inputs=gr.inputs.Textbox(label="Enter your search query"),
123
- outputs=gr.outputs.JSON(label="Search Results")
124
- )
125
-
126
- iface.launch()
 
6
  from transformers import AutoTokenizer, AutoModel
7
  from sentence_transformers import SentenceTransformer, models
8
  from sklearn.feature_extraction.text import TfidfVectorizer
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
  import re
11
+ from collections import Counter
12
  import spacy
13
  import joblib
14
 
 
52
  try:
53
  tokenizer = AutoTokenizer.from_pretrained('anferico/bert-for-patents')
54
  bert_model = AutoModel.from_pretrained('anferico/bert-for-patents')
55
+ word_embedding_model = models.Transformer(model_name='anferico/bert-for-patents', tokenizer=tokenizer, model=bert_model)
56
+ pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
57
+ model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
58
  except Exception as e:
59
  print(f"Error loading anferico/bert-for-patents: {e}")
60
  print("Falling back to a general-purpose model.")
 
103
  query_embedding = query_embedding / np.linalg.norm(query_embedding)
104
 
105
  # Perform semantic similarity search
106
+ semantic_distances, semantic