Update app.py
Browse files
app.py
CHANGED
@@ -10,6 +10,11 @@ import re
|
|
10 |
from collections import Counter
|
11 |
import spacy
|
12 |
import torch
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
# Load Spacy model for advanced NLP
|
15 |
try:
|
@@ -96,20 +101,33 @@ def compare_features(query_features, patent_features):
|
|
96 |
similarity_score = len(common_features) / max(len(query_features), len(patent_features))
|
97 |
return common_features, similarity_score
|
98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
99 |
def hybrid_search(query, top_k=5):
|
100 |
print(f"Original query: {query}")
|
101 |
|
102 |
-
|
|
|
|
|
|
|
|
|
103 |
|
104 |
# Encode the query using the transformer model
|
105 |
-
query_embedding = encode_texts([
|
106 |
query_embedding = query_embedding / np.linalg.norm(query_embedding)
|
107 |
|
108 |
# Perform semantic similarity search
|
109 |
semantic_distances, semantic_indices = index.search(np.array([query_embedding]).astype('float32'), top_k * 2)
|
110 |
|
111 |
# Perform TF-IDF based search
|
112 |
-
query_tfidf = tfidf_vectorizer.transform([
|
113 |
tfidf_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
|
114 |
tfidf_indices = tfidf_similarities.argsort()[-top_k * 2:][::-1]
|
115 |
|
@@ -121,7 +139,7 @@ def hybrid_search(query, top_k=5):
|
|
121 |
patent_features = extract_key_features(text)
|
122 |
common_features, feature_similarity = compare_features(query_features, patent_features)
|
123 |
combined_results[patent_number] = {
|
124 |
-
'score': semantic_distances[0][i] * 1.5 + feature_similarity,
|
125 |
'common_features': common_features,
|
126 |
'text': text
|
127 |
}
|
@@ -133,7 +151,7 @@ def hybrid_search(query, top_k=5):
|
|
133 |
patent_features = extract_key_features(text)
|
134 |
common_features, feature_similarity = compare_features(query_features, patent_features)
|
135 |
combined_results[patent_number] = {
|
136 |
-
'score': tfidf_similarities[idx] + feature_similarity,
|
137 |
'common_features': common_features,
|
138 |
'text': text
|
139 |
}
|
@@ -151,10 +169,13 @@ def hybrid_search(query, top_k=5):
|
|
151 |
|
152 |
return "\n".join(results)
|
153 |
|
154 |
-
# Create Gradio interface
|
155 |
iface = gr.Interface(
|
156 |
fn=hybrid_search,
|
157 |
-
inputs=
|
|
|
|
|
|
|
158 |
outputs=gr.Textbox(lines=10, label="Search Results"),
|
159 |
title="Patent Similarity Search",
|
160 |
description="Enter a patent description to find similar patents based on key features."
|
|
|
10 |
from collections import Counter
|
11 |
import spacy
|
12 |
import torch
|
13 |
+
from nltk.corpus import wordnet
|
14 |
+
import nltk
|
15 |
+
|
16 |
+
# Download WordNet data
|
17 |
+
nltk.download('wordnet')
|
18 |
|
19 |
# Load Spacy model for advanced NLP
|
20 |
try:
|
|
|
101 |
similarity_score = len(common_features) / max(len(query_features), len(patent_features))
|
102 |
return common_features, similarity_score
|
103 |
|
104 |
+
def expand_query(query):
    """Return *query* followed by WordNet synonyms of each of its words.

    The original query text is kept verbatim at the front of the result.
    Each whitespace-separated word is looked up in WordNet and the lemma
    names of all its synsets are appended, separated by single spaces.

    Args:
        query: Free-text search query.

    Returns:
        The expanded query string. If no word has a WordNet entry (or the
        query is empty), the query is returned unchanged.
    """
    query_words = query.split()

    # Seed with the query's own words so a lemma that merely repeats one of
    # them is not appended a second time.
    seen = {word.casefold() for word in query_words}
    extra_terms = []

    for word in query_words:
        for synset in wordnet.synsets(word):
            for lemma in synset.lemmas():
                # WordNet joins multi-word lemmas with underscores
                # (e.g. "patent_application"); emit them as plain words so
                # downstream tokenizers see the individual terms.
                term = lemma.name().replace("_", " ")
                key = term.casefold()
                # The same lemma usually appears in several synsets; adding
                # it repeatedly would inflate its term frequency and bloat
                # the text passed to the encoder.
                if key not in seen:
                    seen.add(key)
                    extra_terms.append(term)

    # Single join instead of repeated string concatenation in the loop.
    return " ".join([query, *extra_terms])
|
112 |
+
|
113 |
def hybrid_search(query, top_k=5):
|
114 |
print(f"Original query: {query}")
|
115 |
|
116 |
+
# Expand the query
|
117 |
+
expanded_query = expand_query(query)
|
118 |
+
print(f"Expanded query: {expanded_query}")
|
119 |
+
|
120 |
+
query_features = extract_key_features(expanded_query)
|
121 |
|
122 |
# Encode the query using the transformer model
|
123 |
+
query_embedding = encode_texts([expanded_query])[0]
|
124 |
query_embedding = query_embedding / np.linalg.norm(query_embedding)
|
125 |
|
126 |
# Perform semantic similarity search
|
127 |
semantic_distances, semantic_indices = index.search(np.array([query_embedding]).astype('float32'), top_k * 2)
|
128 |
|
129 |
# Perform TF-IDF based search
|
130 |
+
query_tfidf = tfidf_vectorizer.transform([expanded_query])
|
131 |
tfidf_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
|
132 |
tfidf_indices = tfidf_similarities.argsort()[-top_k * 2:][::-1]
|
133 |
|
|
|
139 |
patent_features = extract_key_features(text)
|
140 |
common_features, feature_similarity = compare_features(query_features, patent_features)
|
141 |
combined_results[patent_number] = {
|
142 |
+
'score': semantic_distances[0][i] * 1.0 + tfidf_similarities[idx] * 0.5 + feature_similarity,
|
143 |
'common_features': common_features,
|
144 |
'text': text
|
145 |
}
|
|
|
151 |
patent_features = extract_key_features(text)
|
152 |
common_features, feature_similarity = compare_features(query_features, patent_features)
|
153 |
combined_results[patent_number] = {
|
154 |
+
'score': tfidf_similarities[idx] * 1.0 + feature_similarity,
|
155 |
'common_features': common_features,
|
156 |
'text': text
|
157 |
}
|
|
|
169 |
|
170 |
return "\n".join(results)
|
171 |
|
172 |
+
# Create Gradio interface with additional input fields
|
173 |
iface = gr.Interface(
|
174 |
fn=hybrid_search,
|
175 |
+
inputs=[
|
176 |
+
gr.Textbox(lines=2, placeholder="Enter your patent query here..."),
|
177 |
+
gr.Slider(minimum=1, maximum=20, step=1, default=5, label="Top K Results"),
|
178 |
+
],
|
179 |
outputs=gr.Textbox(lines=10, label="Search Results"),
|
180 |
title="Patent Similarity Search",
|
181 |
description="Enter a patent description to find similar patents based on key features."
|