bhlewis committed
Commit 1af801b
1 Parent(s): cec7cfe

Update app.py

Files changed (1):
  app.py +95 -61
app.py CHANGED
@@ -3,26 +3,82 @@ import numpy as np
  import h5py
  import faiss
  import json
- from transformers import AutoTokenizer, AutoModel
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.metrics.pairwise import cosine_similarity
  import re
  from collections import Counter
- import spacy
  import torch
- from nltk.corpus import wordnet
  import nltk

- # Download WordNet data
- nltk.download('wordnet')

- # Load Spacy model for advanced NLP
- try:
-     nlp = spacy.load("en_core_web_sm")
- except IOError:
-     print("Downloading spacy model...")
-     spacy.cli.download("en_core_web_sm")
-     nlp = spacy.load("en_core_web_sm")

  def load_data():
      try:
@@ -50,50 +106,6 @@ def load_data():
          print(f"An unexpected error occurred while loading data: {e}")
          raise

- embeddings, patent_numbers, metadata, texts = load_data()
-
- # Load BERT model for encoding search queries
- tokenizer = AutoTokenizer.from_pretrained('anferico/bert-for-patents')
- bert_model = AutoModel.from_pretrained('anferico/bert-for-patents')
-
- def encode_texts(texts, max_length=512):
-     inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
-     with torch.no_grad():
-         outputs = bert_model(**inputs)
-     embeddings = outputs.last_hidden_state.mean(dim=1)
-     return embeddings.numpy()
-
- # Check if the embedding dimensions match
- if embeddings.shape[1] != encode_texts(["test"]).shape[1]:
-     print("Embedding dimensions do not match. Rebuilding FAISS index.")
-     # Rebuild embeddings using the new model
-     embeddings = encode_texts(texts)
-     embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
-
- # Normalize embeddings for cosine similarity
- embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
-
- # Create FAISS index for cosine similarity
- index = faiss.IndexFlatIP(embeddings.shape[1])
- index.add(embeddings)
-
- # Create TF-IDF vectorizer
- tfidf_vectorizer = TfidfVectorizer(stop_words='english')
- tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
-
- def extract_key_features(text):
-     # Use Spacy to extract technical terms and phrases
-     doc = nlp(text)
-     technical_terms = []
-     for token in doc:
-         if token.dep_ in ('amod', 'compound') or token.ent_type_ in ('PRODUCT', 'ORG', 'GPE', 'NORP'):
-             technical_terms.append(token.text.lower())
-     noun_phrases = [chunk.text.lower() for chunk in doc.noun_chunks]
-     feature_phrases = [sent.text.lower() for sent in doc.sents if re.search(r'(comprising|including|consisting of|deformable|insulation|heat-resistant|memory foam|high-temperature)', sent.text, re.IGNORECASE)]
-
-     all_features = technical_terms + noun_phrases + feature_phrases
-     return list(set(all_features))
-
  def compare_features(query_features, patent_features):
      common_features = set(query_features) & set(patent_features)
      similarity_score = len(common_features) / max(len(query_features), len(patent_features))
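
For comparison, the extract_key_features being removed above leans on spaCy's dependency labels, entity types, and noun chunks. A condensed, self-contained sketch of that approach (the feature_phrases regex pass is omitted, and the sample sentence is invented):

import spacy

# Requires the small English model: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

def spacy_key_terms(text):
    # Collect adjectival/compound modifiers, selected entity types, and noun phrases
    doc = nlp(text)
    terms = [tok.text.lower() for tok in doc
             if tok.dep_ in ('amod', 'compound') or tok.ent_type_ in ('PRODUCT', 'ORG', 'GPE', 'NORP')]
    noun_phrases = [chunk.text.lower() for chunk in doc.noun_chunks]
    return sorted(set(terms + noun_phrases))

print(spacy_key_terms("A deformable memory foam insert comprising a heat-resistant coating"))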
@@ -102,17 +114,18 @@ def compare_features(query_features, patent_features):
  def hybrid_search(query, top_k=5):
      print(f"Original query: {query}")

-     query_features = extract_key_features(query)

-     # Encode the query using the transformer model
-     query_embedding = encode_texts([query])[0]
      query_embedding = query_embedding / np.linalg.norm(query_embedding)

      # Perform semantic similarity search
      semantic_distances, semantic_indices = index.search(np.array([query_embedding]).astype('float32'), top_k * 2)

      # Perform TF-IDF based search
-     query_tfidf = tfidf_vectorizer.transform([query])
      tfidf_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
      tfidf_indices = tfidf_similarities.argsort()[-top_k * 2:][::-1]

@@ -154,6 +167,27 @@ def hybrid_search(query, top_k=5):

      return "\n".join(results)
  # Create Gradio interface with additional input fields
  iface = gr.Interface(
      fn=hybrid_search,
@@ -167,4 +201,4 @@ iface = gr.Interface(
  )

  if __name__ == "__main__":
-     iface.launch()

app.py (after changes)
  import h5py
  import faiss
  import json
+ from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.metrics.pairwise import cosine_similarity
  import re
  from collections import Counter
  import torch
+ from nltk.corpus import stopwords
+ from nltk.tokenize import word_tokenize
  import nltk

+ # Download necessary NLTK data
+ nltk.download('stopwords', quiet=True)
+ nltk.download('punkt', quiet=True)

+ # Load BERT model for lemmatization
+ bert_model_name = "bert-base-uncased"
+ bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
+ bert_model = AutoModelForMaskedLM.from_pretrained(bert_model_name).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+
+ # Load BERT model for encoding search queries
+ tokenizer = AutoTokenizer.from_pretrained('anferico/bert-for-patents')
+ bert_model = AutoModel.from_pretrained('anferico/bert-for-patents')
+
+ def bert_lemmatize(text):
+     tokens = bert_tokenizer.tokenize(text)
+     input_ids = bert_tokenizer.convert_tokens_to_ids(tokens)
+     input_tensor = torch.tensor([input_ids]).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+     with torch.no_grad():
+         outputs = bert_model(input_tensor)
+     predictions = outputs.logits.argmax(dim=-1)
+     lemmatized_tokens = bert_tokenizer.convert_ids_to_tokens(predictions[0])
+     return ' '.join([token for token in lemmatized_tokens if token not in ['[CLS]', '[SEP]', '[PAD]']])
+
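
For reference, a minimal, self-contained sketch of the same masked-LM round-trip that bert_lemmatize performs (tokenize, run the model, argmax the logits, decode). It keeps the masked LM under its own name (mlm_model) instead of the shared bert_model name, which the patent encoder above also binds; the function name and the sample sentence are illustrative only.

import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

name = "bert-base-uncased"
mlm_tokenizer = AutoTokenizer.from_pretrained(name)
mlm_model = AutoModelForMaskedLM.from_pretrained(name)

def mlm_roundtrip(text):
    # Tokenize, run the masked LM, and decode the argmax prediction per position
    tokens = mlm_tokenizer.tokenize(text)
    input_ids = mlm_tokenizer.convert_tokens_to_ids(tokens)
    with torch.no_grad():
        outputs = mlm_model(torch.tensor([input_ids]))
    predicted_ids = outputs.logits.argmax(dim=-1)
    predicted_tokens = mlm_tokenizer.convert_ids_to_tokens(predicted_ids[0].tolist())
    return ' '.join(t for t in predicted_tokens if t not in ('[CLS]', '[SEP]', '[PAD]'))

print(mlm_roundtrip("heat resistant memory foam inserts"))

With no positions actually masked, the argmax tends to largely reproduce the input tokens, so this pass behaves more like a vocabulary normalization than a classical lemmatizer.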
+ def preprocess_query(text):
+     # Convert to lowercase
+     text = text.lower()
+
+     # Remove any HTML tags (if present)
+     text = re.sub('<.*?>', '', text)
+
+     # Remove special characters, but keep hyphens, periods, and commas
+     text = re.sub(r'[^a-zA-Z0-9\s\-\.\,]', '', text)
+
+     # Tokenize
+     tokens = word_tokenize(text)
+
+     # Remove stopwords, but keep all other words
+     stop_words = set(stopwords.words('english'))
+     tokens = [word for word in tokens if word not in stop_words]
+
+     # Join tokens back into a string
+     processed_text = ' '.join(tokens)
+
+     # Apply BERT lemmatization
+     processed_text = bert_lemmatize(processed_text)
+
+     return processed_text
+
+ def extract_key_features(text):
+     # For queries, we'll just preprocess and return all non-stopword terms
+     processed_text = preprocess_query(text)
+
+     # Split the processed text into individual terms
+     features = processed_text.split()
+
+     # Remove duplicates while preserving order
+     features = list(dict.fromkeys(features))
+
+     return features
+
+ def encode_texts(texts, max_length=512):
77
+ inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
78
+ with torch.no_grad():
79
+ outputs = bert_model(**inputs)
80
+ embeddings = outputs.last_hidden_state.mean(dim=1)
81
+ return embeddings.numpy()
82
 
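For reference, a self-contained sketch of the same mean-pooling encoding, using a small public BERT checkpoint instead of anferico/bert-for-patents so it downloads quickly; the checkpoint choice and sample texts are illustrative only.

import torch
from transformers import AutoTokenizer, AutoModel

# Any BERT-style checkpoint works here; 'prajjwal1/bert-tiny' is just small and fast.
name = "prajjwal1/bert-tiny"
tok = AutoTokenizer.from_pretrained(name)
model = AutoModel.from_pretrained(name)

def encode(texts, max_length=512):
    # Tokenize a batch, run the encoder, and mean-pool token embeddings per text
    inputs = tok(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()

vecs = encode(["deformable heat-resistant insert", "rotary engine cooling"])
print(vecs.shape)  # (2, hidden_size) -- 128 for bert-tiny
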
  def load_data():
      try:
@@ -50,50 +106,6 @@ def load_data():
          print(f"An unexpected error occurred while loading data: {e}")
          raise

  def compare_features(query_features, patent_features):
      common_features = set(query_features) & set(patent_features)
      similarity_score = len(common_features) / max(len(query_features), len(patent_features))
@@ -102,17 +114,18 @@ def compare_features(query_features, patent_features):
  def hybrid_search(query, top_k=5):
      print(f"Original query: {query}")

+     processed_query = preprocess_query(query)
+     query_features = extract_key_features(processed_query)

+     # Encode the processed query using the transformer model
+     query_embedding = encode_texts([processed_query])[0]
      query_embedding = query_embedding / np.linalg.norm(query_embedding)

      # Perform semantic similarity search
      semantic_distances, semantic_indices = index.search(np.array([query_embedding]).astype('float32'), top_k * 2)

      # Perform TF-IDF based search
+     query_tfidf = tfidf_vectorizer.transform([processed_query])
      tfidf_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
      tfidf_indices = tfidf_similarities.argsort()[-top_k * 2:][::-1]
@@ -154,6 +167,27 @@ def hybrid_search(query, top_k=5):

      return "\n".join(results)

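The TF-IDF leg of hybrid_search works the same way in isolation. A small sketch with an invented three-document corpus standing in for the patent texts:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Invented toy corpus standing in for the patent texts
docs = ["heat resistant memory foam insert",
        "deformable insulation panel for ovens",
        "rotary engine cooling system"]
vectorizer = TfidfVectorizer(stop_words='english')
matrix = vectorizer.fit_transform(docs)

query_tfidf = vectorizer.transform(["memory foam oven insulation"])
sims = cosine_similarity(query_tfidf, matrix).flatten()
top_k = 2
# argsort is ascending, so take the last top_k and reverse for best-first order
top_indices = sims.argsort()[-top_k:][::-1]
print(top_indices, sims[top_indices])
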
+ # Load data and prepare the FAISS index
+ embeddings, patent_numbers, metadata, texts = load_data()
+
+ # Check if the embedding dimensions match
+ if embeddings.shape[1] != encode_texts(["test"]).shape[1]:
+     print("Embedding dimensions do not match. Rebuilding FAISS index.")
+     # Rebuild embeddings using the new model
+     embeddings = encode_texts(texts)
+     embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
+
+ # Normalize embeddings for cosine similarity
+ embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
+
+ # Create FAISS index for cosine similarity
+ index = faiss.IndexFlatIP(embeddings.shape[1])
+ index.add(embeddings)
+
+ # Create TF-IDF vectorizer
+ tfidf_vectorizer = TfidfVectorizer(stop_words='english')
+ tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
+
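
IndexFlatIP gives cosine similarity here because the rows are L2-normalized first, so the inner product of unit vectors equals their cosine. A toy sketch with random vectors and invented sizes:

import numpy as np
import faiss

# Toy embeddings standing in for the patent vectors (invented numbers)
emb = np.random.rand(5, 8).astype('float32')
emb /= np.linalg.norm(emb, axis=1, keepdims=True)  # L2-normalize rows

index = faiss.IndexFlatIP(emb.shape[1])  # inner-product index
index.add(emb)

q = emb[0:1]  # a normalized query vector
scores, ids = index.search(q, 3)
# With unit-length vectors, inner product == cosine similarity,
# so the top hit is the query itself with score ~1.0
print(ids[0], scores[0])
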
  # Create Gradio interface with additional input fields
  iface = gr.Interface(
      fn=hybrid_search,
@@ -167,4 +201,4 @@ iface = gr.Interface(
  )

  if __name__ == "__main__":
+     iface.launch()
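
The Gradio wiring that the diff elides (the inputs/outputs arguments of gr.Interface) follows the usual pattern. A minimal stand-alone sketch; echo_search and the widget choices are illustrative, not the actual fields in app.py:

import gradio as gr

def echo_search(query, top_k):
    # Stand-in for hybrid_search; just echoes its inputs
    return f"Searched for {query!r} with top_k={top_k}"

demo = gr.Interface(
    fn=echo_search,
    inputs=[gr.Textbox(label="Query"), gr.Slider(1, 10, value=5, step=1, label="Top K")],
    outputs=gr.Textbox(label="Results"),
)

if __name__ == "__main__":
    demo.launch()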