import gradio as gr
import numpy as np
import h5py
import faiss
import json
from sentence_transformers import SentenceTransformer
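
# Expected input files (inferred from the loading code below; the example values are illustrative):
#   patent_embeddings.h5  - HDF5 datasets 'embeddings' (N x D float array) and
#                           'patent_numbers' (N patent identifiers, stored as byte strings)
#   patent_metadata.jsonl - one JSON object per line, e.g.
#                           {"patent_number": "US1234567", "text": "A method for ..."}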

def load_data():
    """Load precomputed patent embeddings, patent numbers, and per-patent metadata."""
    try:
        with h5py.File('patent_embeddings.h5', 'r') as f:
            embeddings = f['embeddings'][:]
            patent_numbers = f['patent_numbers'][:]
        
        metadata = {}
        with open('patent_metadata.jsonl', 'r') as f:
            for line in f:
                data = json.loads(line)
                metadata[data['patent_number']] = data
        
        print(f"Embedding shape: {embeddings.shape}")
        print(f"Number of patent numbers: {len(patent_numbers)}")
        print(f"Number of metadata entries: {len(metadata)}")
        
        return embeddings, patent_numbers, metadata
    except FileNotFoundError as e:
        print(f"Error: Could not find file. {e}")
        raise
    except Exception as e:
        print(f"An unexpected error occurred while loading data: {e}")
        raise

embeddings, patent_numbers, metadata = load_data()

# FAISS expects float32, C-contiguous vectors; cast defensively before indexing
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

# Normalize embeddings so that inner product equals cosine similarity
embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

# Create a flat inner-product FAISS index (cosine similarity on the normalized vectors)
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

# Load the sentence-transformer model (all-mpnet-base-v2) used to encode search queries
model = SentenceTransformer('all-mpnet-base-v2')
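
# Sanity check (assumption: the stored embeddings were produced by this same model,
# which outputs 768-dimensional vectors); fail fast if the dimensions disagree
assert embeddings.shape[1] == model.get_sentence_embedding_dimension(), \
    "Stored embedding dimension does not match the query encoder's output dimension"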

def search(query, top_k=5):
    """Return the top_k patents most similar to a free-text query, formatted for display."""
    print(f"Searching for: {query}")
    
    # Encode the query
    query_embedding = model.encode([query])[0]
    query_embedding = query_embedding / np.linalg.norm(query_embedding)
    
    print(f"Query embedding shape: {query_embedding.shape}")
    
    # Perform similarity search
    distances, indices = index.search(np.array([query_embedding]), top_k)
    
    print(f"FAISS search results - Distances: {distances}, Indices: {indices}")
    
    results = []
    for i, idx in enumerate(indices[0]):
        patent_number = patent_numbers[idx]
        if isinstance(patent_number, bytes):  # h5py string datasets are typically returned as bytes
            patent_number = patent_number.decode('utf-8')
        if patent_number not in metadata:
            print(f"Warning: Patent number {patent_number} not found in metadata")
            continue
        patent_data = metadata[patent_number]
        result = f"Patent Number: {patent_number}\n"
        text = patent_data.get('text', 'No text available')
        result += f"Text: {text[:200]}...\n"
        result += f"Similarity Score: {distances[0][i]:.4f}\n\n"
        results.append(result)
    
    return "\n".join(results[:top_k])

# Create Gradio interface
iface = gr.Interface(
    fn=search,
    inputs=gr.Textbox(lines=2, placeholder="Enter your search query here..."),
    outputs=gr.Textbox(lines=10, label="Search Results"),
    title="Patent Similarity Search",
    description="Enter a query to find similar patents based on their embeddings."
)

if __name__ == "__main__":
    iface.launch()