bhlewis committed on
Commit
5adb259
·
verified ·
1 Parent(s): 071fbb4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -25
app.py CHANGED
@@ -10,11 +10,6 @@ import re
10
  from collections import Counter
11
  import spacy
12
  import torch
13
- from nltk.corpus import wordnet
14
- import nltk
15
-
16
- # Download WordNet data
17
- nltk.download('wordnet')
18
 
19
  # Load Spacy model for advanced NLP
20
  try:
@@ -82,12 +77,16 @@ tfidf_vectorizer = TfidfVectorizer(stop_words='english')
82
  tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
83
 
84
  def extract_key_features(text):
85
- # Use Spacy to extract noun phrases and key phrases
86
  doc = nlp(text)
 
 
 
 
87
  noun_phrases = [chunk.text.lower() for chunk in doc.noun_chunks]
88
- feature_phrases = [sent.text.lower() for sent in doc.sents if re.search(r'(comprising|including|consisting of)', sent.text, re.IGNORECASE)]
89
 
90
- all_features = noun_phrases + feature_phrases
91
  return list(set(all_features))
92
 
93
  def compare_features(query_features, patent_features):
@@ -95,33 +94,20 @@ def compare_features(query_features, patent_features):
95
  similarity_score = len(common_features) / max(len(query_features), len(patent_features))
96
  return common_features, similarity_score
97
 
98
- def expand_query(query):
99
- expanded_query = query
100
- for word in query.split():
101
- synonyms = wordnet.synsets(word)
102
- for syn in synonyms:
103
- for lemma in syn.lemmas():
104
- expanded_query += " " + lemma.name()
105
- return expanded_query
106
-
107
  def hybrid_search(query, top_k=5):
108
  print(f"Original query: {query}")
109
 
110
- # Expand the query
111
- expanded_query = expand_query(query)
112
- print(f"Expanded query: {expanded_query}")
113
-
114
- query_features = extract_key_features(expanded_query)
115
 
116
  # Encode the query using the transformer model
117
- query_embedding = encode_texts([expanded_query])[0]
118
  query_embedding = query_embedding / np.linalg.norm(query_embedding)
119
 
120
  # Perform semantic similarity search
121
  semantic_distances, semantic_indices = index.search(np.array([query_embedding]).astype('float32'), top_k * 2)
122
 
123
  # Perform TF-IDF based search
124
- query_tfidf = tfidf_vectorizer.transform([expanded_query])
125
  tfidf_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
126
  tfidf_indices = tfidf_similarities.argsort()[-top_k * 2:][::-1]
127
 
@@ -161,7 +147,7 @@ def hybrid_search(query, top_k=5):
161
  result += f"Common Key Features: {', '.join(data['common_features'])}\n\n"
162
  results.append(result)
163
 
164
- return "\n".join(results)
165
 
166
  # Create Gradio interface with additional input fields
167
  iface = gr.Interface(
 
10
  from collections import Counter
11
  import spacy
12
  import torch
 
 
 
 
 
13
 
14
  # Load Spacy model for advanced NLP
15
  try:
 
77
  tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
78
 
79
  def extract_key_features(text):
80
+ # Use Spacy to extract technical terms and phrases
81
  doc = nlp(text)
82
+ technical_terms = []
83
+ for token in doc:
84
+ if token.dep_ in ('amod', 'compound') or token.ent_type_ in ('PRODUCT', 'ORG', 'GPE', 'NORP'):
85
+ technical_terms.append(token.text.lower())
86
  noun_phrases = [chunk.text.lower() for chunk in doc.noun_chunks]
87
+ feature_phrases = [sent.text.lower() for sent in doc.sents if re.search(r'(comprising|including|consisting of|deformable|insulation|heat-resistant|memory foam|high-temperature)', sent.text, re.IGNORECASE)]
88
 
89
+ all_features = technical_terms + noun_phrases + feature_phrases
90
  return list(set(all_features))
91
 
92
  def compare_features(query_features, patent_features):
 
94
  similarity_score = len(common_features) / max(len(query_features), len(patent_features))
95
  return common_features, similarity_score
96
 
 
 
 
 
 
 
 
 
 
97
  def hybrid_search(query, top_k=5):
98
  print(f"Original query: {query}")
99
 
100
+ query_features = extract_key_features(query)
 
 
 
 
101
 
102
  # Encode the query using the transformer model
103
+ query_embedding = encode_texts([query])[0]
104
  query_embedding = query_embedding / np.linalg.norm(query_embedding)
105
 
106
  # Perform semantic similarity search
107
  semantic_distances, semantic_indices = index.search(np.array([query_embedding]).astype('float32'), top_k * 2)
108
 
109
  # Perform TF-IDF based search
110
+ query_tfidf = tfidf_vectorizer.transform([query])
111
  tfidf_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
112
  tfidf_indices = tfidf_similarities.argsort()[-top_k * 2:][::-1]
113
 
 
147
  result += f"Common Key Features: {', '.join(data['common_features'])}\n\n"
148
  results.append(result)
149
 
150
+ return "\n".join(results)
151
 
152
  # Create Gradio interface with additional input fields
153
  iface = gr.Interface(