import gradio as gr
import numpy as np
import h5py
import faiss
import json
from sentence_transformers import SentenceTransformer
def load_data():
    try:
        with h5py.File('patent_embeddings.h5', 'r') as f:
            embeddings = f['embeddings'][:]
            patent_numbers = f['patent_numbers'][:]
        metadata = {}
        with open('patent_metadata.jsonl', 'r') as f:
            for line in f:
                data = json.loads(line)
                metadata[data['patent_number']] = data
        print(f"Embedding shape: {embeddings.shape}")
        print(f"Number of patent numbers: {len(patent_numbers)}")
        print(f"Number of metadata entries: {len(metadata)}")
        return embeddings, patent_numbers, metadata
    except FileNotFoundError as e:
        print(f"Error: Could not find file. {e}")
        raise
    except Exception as e:
        print(f"An unexpected error occurred while loading data: {e}")
        raise
embeddings, patent_numbers, metadata = load_data()

# Normalize embeddings so that inner-product search is equivalent to cosine similarity
embeddings = embeddings.astype('float32')  # FAISS expects float32 vectors
embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

# Create FAISS inner-product index over the normalized embeddings
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)
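# Design note: IndexFlatIP does exact, brute-force search over all stored vectors,
# which is fine for modest collections. For much larger corpora, FAISS also offers
# approximate indexes (e.g. IndexIVFFlat) that trade a little recall for speed.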
# Load the sentence-transformer model used to encode search queries
model = SentenceTransformer('all-mpnet-base-v2')
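# Note: queries must be encoded with the same model that produced the stored patent
# embeddings; it is assumed here that patent_embeddings.h5 was generated with
# 'all-mpnet-base-v2'.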
def search(query, top_k=5):
    print(f"Searching for: {query}")
    # Encode and normalize the query so the inner-product score is a cosine similarity
    query_embedding = model.encode([query])[0]
    query_embedding = query_embedding / np.linalg.norm(query_embedding)
    print(f"Query embedding shape: {query_embedding.shape}")
    # Perform similarity search
    distances, indices = index.search(np.array([query_embedding], dtype='float32'), top_k)
    print(f"FAISS search results - Distances: {distances}, Indices: {indices}")
    results = []
    for i, idx in enumerate(indices[0]):
        patent_number = patent_numbers[idx]
        if isinstance(patent_number, bytes):  # h5py may return fixed-length strings as bytes
            patent_number = patent_number.decode('utf-8')
        if patent_number not in metadata:
            print(f"Warning: Patent number {patent_number} not found in metadata")
            continue
        patent_data = metadata[patent_number]
        result = f"Patent Number: {patent_number}\n"
        text = patent_data.get('text', 'No text available')
        result += f"Text: {text[:200]}...\n"
        result += f"Similarity Score: {distances[0][i]:.4f}\n\n"
        results.append(result)
    return "\n".join(results)
# Create Gradio interface
iface = gr.Interface(
    fn=search,
    inputs=gr.Textbox(lines=2, placeholder="Enter your search query here..."),
    outputs=gr.Textbox(lines=10, label="Search Results"),
    title="Patent Similarity Search",
    description="Enter a query to find similar patents based on their embeddings.",
)
if __name__ == "__main__":
    iface.launch()
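    # When running outside a managed host such as Hugging Face Spaces, launch() accepts
    # standard Gradio options like server_name="0.0.0.0" or share=True to expose the app;
    # adjust to your deployment.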