bhlewis committed on
Commit
13d437f
·
verified ·
1 Parent(s): dc870ec

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -16
app.py CHANGED
@@ -6,12 +6,12 @@ import json
6
  from sentence_transformers import SentenceTransformer
7
 
8
  def load_data():
9
- with h5py.File('patent_embeddings.h5', 'r') as f:
10
  embeddings = f['embeddings'][:]
11
- patent_numbers = [pn.decode('utf-8') for pn in f['patent_numbers'][:]]
12
 
13
  metadata = {}
14
- with open('patent_metadata.jsonl', 'r') as f:
15
  for line in f:
16
  data = json.loads(line)
17
  metadata[data['patent_number']] = data
@@ -36,14 +36,12 @@ model = SentenceTransformer('all-mpnet-base-v2')
36
 
37
  def exact_text_match(query, metadata):
38
  matches = []
 
39
  for patent_number, data in metadata.items():
40
- abstract = data.get('abstract', '')
41
- claims = data.get('claims', '')
42
- if isinstance(abstract, float):
43
- abstract = ''
44
- if isinstance(claims, float):
45
- claims = ''
46
- if query.lower() in abstract.lower() or query.lower() in claims.lower():
47
  matches.append((patent_number, 1.0)) # Score of 1.0 for exact match
48
  return matches
49
 
@@ -63,14 +61,14 @@ def search(query, top_k=5):
63
 
64
  results = []
65
  for i, idx in enumerate(indices[0]):
66
- patent_number = patent_numbers[idx]
67
  if patent_number not in metadata:
68
  print(f"Warning: Patent number {patent_number} not found in metadata")
69
  continue
70
  patent_data = metadata[patent_number]
71
  result = f"Patent Number: {patent_number}\n"
72
- abstract = patent_data.get('abstract', 'No abstract available')
73
- result += f"Abstract: {abstract[:200]}...\n"
74
  result += f"Similarity Score: {distances[0][i]:.4f}\n\n"
75
  results.append(result)
76
 
@@ -78,11 +76,15 @@ def search(query, top_k=5):
78
  if not results or distances[0][0] < 0.5:
79
  print("Falling back to exact text match")
80
  exact_matches = exact_text_match(query, metadata)
 
 
 
 
81
  for patent_number, score in exact_matches[:top_k]:
82
  patent_data = metadata[patent_number]
83
  result = f"Patent Number: {patent_number}\n"
84
- abstract = patent_data.get('abstract', 'No abstract available')
85
- result += f"Abstract: {abstract[:200]}...\n"
86
  result += f"Exact Match Score: {score:.4f}\n\n"
87
  results.append(result)
88
 
@@ -98,4 +100,4 @@ iface = gr.Interface(
98
  )
99
 
100
  if __name__ == "__main__":
101
- iface.launch(share=True)
 
6
  from sentence_transformers import SentenceTransformer
7
 
8
  def load_data():
9
+ with h5py.File('path_to_your_embeddings_file.h5', 'r') as f:
10
  embeddings = f['embeddings'][:]
11
+ patent_numbers = f['patent_numbers'][:]
12
 
13
  metadata = {}
14
+ with open('path_to_your_metadata_file.jsonl', 'r') as f:
15
  for line in f:
16
  data = json.loads(line)
17
  metadata[data['patent_number']] = data
 
36
 
37
  def exact_text_match(query, metadata):
38
  matches = []
39
+ query_lower = query.lower()
40
  for patent_number, data in metadata.items():
41
+ text = data.get('text', '')
42
+ if isinstance(text, float):
43
+ text = ''
44
+ if query_lower in text.lower():
 
 
 
45
  matches.append((patent_number, 1.0)) # Score of 1.0 for exact match
46
  return matches
47
 
 
61
 
62
  results = []
63
  for i, idx in enumerate(indices[0]):
64
+ patent_number = patent_numbers[idx].decode('utf-8')
65
  if patent_number not in metadata:
66
  print(f"Warning: Patent number {patent_number} not found in metadata")
67
  continue
68
  patent_data = metadata[patent_number]
69
  result = f"Patent Number: {patent_number}\n"
70
+ text = patent_data.get('text', 'No text available')
71
+ result += f"Text: {text[:200]}...\n"
72
  result += f"Similarity Score: {distances[0][i]:.4f}\n\n"
73
  results.append(result)
74
 
 
76
  if not results or distances[0][0] < 0.5:
77
  print("Falling back to exact text match")
78
  exact_matches = exact_text_match(query, metadata)
79
+ if exact_matches:
80
+ print(f"Exact matches found: {exact_matches}")
81
+ else:
82
+ print("No exact matches found")
83
  for patent_number, score in exact_matches[:top_k]:
84
  patent_data = metadata[patent_number]
85
  result = f"Patent Number: {patent_number}\n"
86
+ text = patent_data.get('text', 'No text available')
87
+ result += f"Text: {text[:200]}...\n"
88
  result += f"Exact Match Score: {score:.4f}\n\n"
89
  results.append(result)
90
 
 
100
  )
101
 
102
  if __name__ == "__main__":
103
+ iface.launch()