bhlewis committed on
Commit
e503f85
·
verified ·
1 Parent(s): 60465e3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -7
app.py CHANGED
@@ -10,6 +10,11 @@ import re
10
  from collections import Counter
11
  import spacy
12
  import torch
 
 
 
 
 
13
 
14
  # Load Spacy model for advanced NLP
15
  try:
@@ -96,20 +101,33 @@ def compare_features(query_features, patent_features):
96
  similarity_score = len(common_features) / max(len(query_features), len(patent_features))
97
  return common_features, similarity_score
98
 
 
 
 
 
 
 
 
 
 
99
  def hybrid_search(query, top_k=5):
100
  print(f"Original query: {query}")
101
 
102
- query_features = extract_key_features(query)
 
 
 
 
103
 
104
  # Encode the query using the transformer model
105
- query_embedding = encode_texts([query])[0]
106
  query_embedding = query_embedding / np.linalg.norm(query_embedding)
107
 
108
  # Perform semantic similarity search
109
  semantic_distances, semantic_indices = index.search(np.array([query_embedding]).astype('float32'), top_k * 2)
110
 
111
  # Perform TF-IDF based search
112
- query_tfidf = tfidf_vectorizer.transform([query])
113
  tfidf_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
114
  tfidf_indices = tfidf_similarities.argsort()[-top_k * 2:][::-1]
115
 
@@ -121,7 +139,7 @@ def hybrid_search(query, top_k=5):
121
  patent_features = extract_key_features(text)
122
  common_features, feature_similarity = compare_features(query_features, patent_features)
123
  combined_results[patent_number] = {
124
- 'score': semantic_distances[0][i] * 1.5 + feature_similarity,
125
  'common_features': common_features,
126
  'text': text
127
  }
@@ -133,7 +151,7 @@ def hybrid_search(query, top_k=5):
133
  patent_features = extract_key_features(text)
134
  common_features, feature_similarity = compare_features(query_features, patent_features)
135
  combined_results[patent_number] = {
136
- 'score': tfidf_similarities[idx] + feature_similarity,
137
  'common_features': common_features,
138
  'text': text
139
  }
@@ -151,10 +169,13 @@ def hybrid_search(query, top_k=5):
151
 
152
  return "\n".join(results)
153
 
154
- # Create Gradio interface
155
  iface = gr.Interface(
156
  fn=hybrid_search,
157
- inputs=gr.Textbox(lines=2, placeholder="Enter your patent query here..."),
 
 
 
158
  outputs=gr.Textbox(lines=10, label="Search Results"),
159
  title="Patent Similarity Search",
160
  description="Enter a patent description to find similar patents based on key features."
 
10
  from collections import Counter
11
  import spacy
12
  import torch
13
+ from nltk.corpus import wordnet
14
+ import nltk
15
+
16
+ # Download WordNet data
17
+ nltk.download('wordnet')
18
 
19
  # Load Spacy model for advanced NLP
20
  try:
 
101
  similarity_score = len(common_features) / max(len(query_features), len(patent_features))
102
  return common_features, similarity_score
103
 
104
def expand_query(query):
    """Return *query* with WordNet synonyms of its words appended.

    The query is split on whitespace and every word is looked up with
    ``wordnet.synsets``; the lemma names of each synset are appended to the
    original text, separated by single spaces.

    Each synonym is added at most once (case-insensitively), and words that
    are already in the query are not repeated, so a word with many senses
    does not dominate the downstream term counts. Underscores in lemma names
    are replaced with spaces so multi-word lemmas read as ordinary words.

    Args:
        query: Free-text search query.

    Returns:
        The original query followed by the new synonym terms, or the query
        unchanged when no new terms are found (including empty or
        whitespace-only input).
    """
    words = query.split()
    # Seed with the query's own words so they are not appended a second time.
    seen = {word.lower() for word in words}
    extra_terms = []

    for word in words:
        for synset in wordnet.synsets(word):
            for lemma in synset.lemmas():
                # Multi-word lemma names use underscores between the words.
                term = lemma.name().replace("_", " ")
                key = term.lower()
                if key not in seen:
                    seen.add(key)
                    extra_terms.append(term)

    if not extra_terms:
        return query
    # Single join instead of repeated string concatenation inside the loops.
    return " ".join([query, *extra_terms])
112
+
113
  def hybrid_search(query, top_k=5):
114
  print(f"Original query: {query}")
115
 
116
+ # Expand the query
117
+ expanded_query = expand_query(query)
118
+ print(f"Expanded query: {expanded_query}")
119
+
120
+ query_features = extract_key_features(expanded_query)
121
 
122
  # Encode the query using the transformer model
123
+ query_embedding = encode_texts([expanded_query])[0]
124
  query_embedding = query_embedding / np.linalg.norm(query_embedding)
125
 
126
  # Perform semantic similarity search
127
  semantic_distances, semantic_indices = index.search(np.array([query_embedding]).astype('float32'), top_k * 2)
128
 
129
  # Perform TF-IDF based search
130
+ query_tfidf = tfidf_vectorizer.transform([expanded_query])
131
  tfidf_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
132
  tfidf_indices = tfidf_similarities.argsort()[-top_k * 2:][::-1]
133
 
 
139
  patent_features = extract_key_features(text)
140
  common_features, feature_similarity = compare_features(query_features, patent_features)
141
  combined_results[patent_number] = {
142
+ 'score': semantic_distances[0][i] * 1.0 + tfidf_similarities[idx] * 0.5 + feature_similarity,
143
  'common_features': common_features,
144
  'text': text
145
  }
 
151
  patent_features = extract_key_features(text)
152
  common_features, feature_similarity = compare_features(query_features, patent_features)
153
  combined_results[patent_number] = {
154
+ 'score': tfidf_similarities[idx] * 1.0 + feature_similarity,
155
  'common_features': common_features,
156
  'text': text
157
  }
 
169
 
170
  return "\n".join(results)
171
 
172
+ # Create Gradio interface with additional input fields
173
  iface = gr.Interface(
174
  fn=hybrid_search,
175
+ inputs=[
176
+ gr.Textbox(lines=2, placeholder="Enter your patent query here..."),
177
+ gr.Slider(minimum=1, maximum=20, step=1, default=5, label="Top K Results"),
178
+ ],
179
  outputs=gr.Textbox(lines=10, label="Search Results"),
180
  title="Patent Similarity Search",
181
  description="Enter a patent description to find similar patents based on key features."