bhlewis committed on
Commit 1f521fb · verified · 1 Parent(s): 97ee057

Update app.py

Files changed (1)
  1. app.py +22 -55
app.py CHANGED
@@ -6,10 +6,9 @@ import json
 from transformers import AutoTokenizer, AutoModel
 from sentence_transformers import SentenceTransformer, models
 from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
 import re
-from collections import Counter
 import spacy
+import joblib
 
 # Load Spacy model for advanced NLP
 try:
@@ -51,9 +50,7 @@ embeddings, patent_numbers, metadata, texts = load_data()
 try:
     tokenizer = AutoTokenizer.from_pretrained('anferico/bert-for-patents')
     bert_model = AutoModel.from_pretrained('anferico/bert-for-patents')
-    word_embedding_model = models.Transformer(model_name='anferico/bert-for-patents', tokenizer=tokenizer, model=bert_model)
-    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
-    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
+    model = SentenceTransformer(modules=[models.Transformer(model_name='anferico/bert-for-patents'), models.Pooling(bert_model.config.hidden_size)])
 except Exception as e:
     print(f"Error loading anferico/bert-for-patents: {e}")
     print("Falling back to a general-purpose model.")
@@ -73,9 +70,10 @@ embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
 index = faiss.IndexFlatIP(embeddings.shape[1])
 index.add(embeddings)
 
-# Create TF-IDF vectorizer
+# Create and save TF-IDF vectorizer
 tfidf_vectorizer = TfidfVectorizer(stop_words='english')
 tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
+joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')
 
 def extract_key_features(text):
     # Use Spacy to extract noun phrases and key phrases
@@ -101,59 +99,28 @@ def hybrid_search(query, top_k=5):
     query_embedding = query_embedding / np.linalg.norm(query_embedding)
 
     # Perform semantic similarity search
-    semantic_distances, semantic_indices = index.search(np.array([query_embedding]).astype('float32'), top_k * 2)
-
-    # Perform TF-IDF based search
-    query_tfidf = tfidf_vectorizer.transform([query])
-    tfidf_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
-    tfidf_indices = tfidf_similarities.argsort()[-top_k * 2:][::-1]
-
-    # Combine and rank results
-    combined_results = {}
-    for i, idx in enumerate(semantic_indices[0]):
-        patent_number = patent_numbers[idx].decode('utf-8')
-        text = metadata[patent_number]['text']
-        patent_features = extract_key_features(text)
-        common_features, feature_similarity = compare_features(query_features, patent_features)
-        combined_results[patent_number] = {
-            'score': semantic_distances[0][i] * 1.5 + feature_similarity,
-            'common_features': common_features,
-            'text': text
-        }
-
-    for idx in tfidf_indices:
-        patent_number = patent_numbers[idx].decode('utf-8')
-        if patent_number not in combined_results:
-            text = metadata[patent_number]['text']
-            patent_features = extract_key_features(text)
-            common_features, feature_similarity = compare_features(query_features, patent_features)
-            combined_results[patent_number] = {
-                'score': tfidf_similarities[idx] + feature_similarity,
-                'common_features': common_features,
-                'text': text
-            }
-
-    # Sort and get top results
-    top_results = sorted(combined_results.items(), key=lambda x: x[1]['score'], reverse=True)[:top_k]
+    semantic_distances, semantic_indices = index.search(np.array([query_embedding]), top_k)
 
     results = []
-    for patent_number, data in top_results:
-        result = f"Patent Number: {patent_number}\n"
-        result += f"Text: {data['text'][:200]}...\n"
-        result += f"Combined Score: {data['score']:.4f}\n"
-        result += f"Common Key Features: {', '.join(data['common_features'])}\n\n"
-        results.append(result)
-
-    return "\n".join(results)
+    for i in range(top_k):
+        patent_number = patent_numbers[semantic_indices[0][i]]
+        patent_data = metadata[patent_number]
+        patent_features = extract_key_features(patent_data['text'])
+        common_features, similarity_score = compare_features(query_features, patent_features)
+
+        results.append({
+            'patent_number': patent_number,
+            'common_features': common_features,
+            'similarity_score': similarity_score,
+            'semantic_score': semantic_distances[0][i]
+        })
+
+    return results
 
-# Create Gradio interface
 iface = gr.Interface(
     fn=hybrid_search,
-    inputs=gr.Textbox(lines=2, placeholder="Enter your patent query here..."),
-    outputs=gr.Textbox(lines=10, label="Search Results"),
-    title="Patent Similarity Search",
-    description="Enter a patent description to find similar patents based on key features."
+    inputs=gr.inputs.Textbox(label="Enter your search query"),
+    outputs=gr.outputs.JSON(label="Search Results")
 )
 
-if __name__ == "__main__":
-    iface.launch()
+iface.launch()
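
The commit persists the fitted TF-IDF vectorizer with `joblib.dump` but no longer applies it inside `hybrid_search`. A minimal sketch of how the saved artifact could be reloaded and reused in a later session, assuming the file name written above; the query string and the reuse itself are illustrative, not part of this commit:

```python
import joblib

# Reload the vectorizer that app.py writes out (file name taken from the diff above).
tfidf_vectorizer = joblib.load('tfidf_vectorizer.joblib')

# Hypothetical reuse: project a query into the same TF-IDF space as the corpus.
query = "wireless power transmission using resonant coils"  # illustrative only
query_tfidf = tfidf_vectorizer.transform([query])
print(query_tfidf.shape)  # (1, vocabulary_size)
```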
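After this change `hybrid_search` returns a list of dictionaries instead of a formatted string, which the JSON output component renders directly. A sketch of how a caller could consume that structure, assuming an example query; the keys match the dictionary built in `hybrid_search` above:

```python
# Illustrative call; the query text is an assumption, not from the commit.
results = hybrid_search("wireless power transmission coil", top_k=5)

for r in results:
    # Keys come from the result dictionaries assembled in hybrid_search.
    print(r['patent_number'], round(float(r['semantic_score']), 4))
    print("  common features:", ", ".join(r['common_features']))
```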