"""Gradio app for patent similarity search over precomputed embeddings.

At import time this module loads the embeddings and metadata from disk,
builds a FAISS ``IndexFlatL2`` over the embeddings and loads a
SentenceTransformer model used to encode search queries.
"""

import json

import gradio as gr
import numpy as np
import h5py
import faiss
from sentence_transformers import SentenceTransformer

# Maximum number of abstract characters shown per search result.
_ABSTRACT_PREVIEW_CHARS = 200


def load_data():
    """Load embeddings, patent numbers and metadata from disk.

    Reads the ``embeddings`` and ``patent_numbers`` datasets from
    ``patent_embeddings.h5`` and one JSON object per line from
    ``patent_metadata.jsonl``. A short summary of what was loaded is
    printed to stdout.

    Returns:
        A tuple ``(embeddings, patent_numbers, metadata)``: the embeddings
        array as stored in the HDF5 file, the patent numbers decoded to
        ``str``, and a dict mapping each record's ``patent_number`` field
        to the full JSON record.
    """
    with h5py.File('patent_embeddings.h5', 'r') as f:
        embeddings = f['embeddings'][:]
        patent_numbers = [pn.decode('utf-8') for pn in f['patent_numbers'][:]]

    metadata = {}
    with open('patent_metadata.jsonl', 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data = json.loads(line)
            metadata[data['patent_number']] = data

    print(f"Embedding shape: {embeddings.shape}")
    print(f"Number of patent numbers: {len(patent_numbers)}")
    print(f"Number of metadata entries: {len(metadata)}")

    # Print sample metadata (skipped when the metadata file had no records,
    # which would otherwise raise StopIteration here).
    sample_patent = next(iter(metadata), None)
    if sample_patent is not None:
        print(f"Sample metadata for patent {sample_patent}:")
        print(json.dumps(metadata[sample_patent], indent=2))

    return embeddings, patent_numbers, metadata


embeddings, patent_numbers, metadata = load_data()

# Create FAISS index. FAISS only accepts C-contiguous float32 matrices, so
# convert defensively in case the HDF5 dataset was stored with another dtype.
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.ascontiguousarray(embeddings, dtype=np.float32))

# Load BERT model for encoding search queries, chosen by embedding width.
embedding_dim = embeddings.shape[1]
print(f"Embedding dimension: {embedding_dim}")
if embedding_dim == 384:
    model = SentenceTransformer('all-MiniLM-L6-v2')
elif embedding_dim == 768:
    model = SentenceTransformer('all-mpnet-base-v2')
else:
    print(f"Unexpected embedding dimension: {embedding_dim}")
    model = SentenceTransformer('all-MiniLM-L6-v2')  # Default to this model


def search(query, top_k=5):
    """Return a formatted text listing of the patents closest to ``query``.

    Args:
        query: Free-text search query.
        top_k: Maximum number of nearest neighbours to request from the
            index.

    Returns:
        A string with one entry per hit (patent number, abstract preview
        and score), or a short prompt when ``query`` is empty. Hits whose
        patent number has no metadata entry are skipped with a warning
        printed to stdout.
    """
    if not query or not query.strip():
        return "Please enter a search query."

    # Encode the query
    query_embedding = model.encode([query])[0]

    # Ensure the query embedding has the same dimension as the index
    if query_embedding.shape[0] != index.d:
        print(f"Query embedding dimension ({query_embedding.shape[0]}) does not match index dimension ({index.d})")
        # NOTE(review): padding/truncating only keeps the search from
        # failing; vectors from a mismatched model are unlikely to give
        # meaningful neighbours.
        if query_embedding.shape[0] < index.d:
            query_embedding = np.pad(query_embedding, (0, index.d - query_embedding.shape[0]))
        else:
            query_embedding = query_embedding[:index.d]

    # FAISS expects a (n_queries, d) C-contiguous float32 matrix.
    query_matrix = np.ascontiguousarray(query_embedding, dtype=np.float32).reshape(1, -1)

    # Perform similarity search
    distances, indices = index.search(query_matrix, top_k)

    results = []
    for distance, idx in zip(distances[0], indices[0]):
        # FAISS pads the result with -1 when fewer than top_k vectors are
        # available; indexing a list with -1 would return the last patent.
        if idx < 0:
            continue

        patent_number = patent_numbers[idx]
        if patent_number not in metadata:
            print(f"Warning: Patent number {patent_number} not found in metadata")
            continue

        patent_data = metadata[patent_number]
        result = f"Patent Number: {patent_number}\n"

        # Safely extract abstract
        abstract = patent_data.get('abstract', 'No abstract available')
        if isinstance(abstract, str):
            preview = abstract[:_ABSTRACT_PREVIEW_CHARS]
            # Only mark the preview as truncated when text was cut off.
            if len(abstract) > _ABSTRACT_PREVIEW_CHARS:
                preview += "..."
            result += f"Abstract: {preview}\n"
        else:
            result += f"Abstract: Unable to display (type: {type(abstract)})\n"

        # NOTE(review): IndexFlatL2 reports squared L2 distances, so this
        # value is not bounded to [0, 1] and can be negative.
        result += f"Similarity Score: {1 - distance:.4f}\n\n"
        results.append(result)

    return "\n".join(results)


# Create Gradio interface
iface = gr.Interface(
    fn=search,
    inputs=gr.Textbox(lines=2, placeholder="Enter your search query here..."),
    outputs=gr.Textbox(lines=10, label="Search Results"),
    title="Patent Similarity Search",
    description="Enter a query to find similar patents based on their embeddings."
)

if __name__ == "__main__":
    iface.launch(share=True)