AdGPT / retrieval_augmented_generation /build_embeddings.py
goodmodeler's picture
UPDATE: rag
a7af970
#!/usr/bin/env python3
"""
简洁版BERT+FAISS标语数据库
输入:产品/业务描述
输出:匹配的广告标语
"""
import numpy as np
import faiss
import json
from sentence_transformers import SentenceTransformer
from datasets import Dataset
import pandas as pd
class SloganDatabase:
    """Small slogan store searchable by sentence-embedding similarity.

    Attributes:
        encoder: ``SentenceTransformer`` used to embed both the indexed
            texts and the search queries.
        index: FAISS inner-product index, or ``None`` until
            :meth:`build_index` or :meth:`load` has succeeded.
        slogans: list of record dicts; position ``i`` corresponds to
            vector ``i`` in ``index``.
    """

    def __init__(self):
        self.encoder = SentenceTransformer('all-MiniLM-L6-v2')
        self.index = None
        self.slogans = []

    def create_dataset(self):
        """Return the built-in jewelry / luxury-goods slogan records.

        Returns:
            A list of dicts with the keys ``brand``, ``category``,
            ``description``, ``slogan`` and ``search_text`` (brand, category
            and description joined by single spaces).
        """
        # Sample rows: [brand, category, description, slogan]
        data = [
            # Top-tier jewelry houses
            ["Tiffany & Co.", "jewelry", "luxury diamond jewelry and engagement rings", "A Diamond is Forever"],
            ["Cartier", "luxury_jewelry", "high-end jewelry watches and accessories", "L'art de vivre"],
            ["Van Cleef & Arpels", "jewelry", "French luxury jewelry and watches", "Poetry of Time"],
            ["Harry Winston", "jewelry", "rare diamonds and luxury jewelry", "Rare Jewels of the World"],
            ["Bulgari", "jewelry", "Italian luxury jewelry and watches", "Italian Excellence"],
            ["Chopard", "jewelry", "Swiss luxury jewelry and watches", "Happy Diamonds"],
            ["Graff", "jewelry", "exceptional diamonds and jewelry", "The Most Fabulous Jewels in the World"],
            ["Piaget", "jewelry", "Swiss luxury watches and jewelry", "Possession"],
            ["Boucheron", "jewelry", "French high jewelry and luxury watches", "Le Joaillier Depuis 1858"],
            ["Mikimoto", "jewelry", "cultured pearl jewelry", "The Originator of Cultured Pearls"],
            # Luxury fashion houses
            ["Louis Vuitton", "luxury_fashion", "luxury leather goods and fashion", "The Art of Travel"],
            ["Hermès", "luxury_fashion", "French luxury goods and accessories", "Luxury in the making"],
            ["Chanel", "luxury_fashion", "haute couture and luxury fashion", "Inside every woman there is a flower and a cat"],
            ["Gucci", "luxury_fashion", "Italian luxury fashion and accessories", "Quality is remembered long after price is forgotten"],
            ["Prada", "luxury_fashion", "Italian luxury fashion house", "Prada"],
            ["Dior", "luxury_fashion", "French luxury fashion and beauty", "Miss Dior"],
            ["Versace", "luxury_fashion", "Italian luxury fashion design", "Virtus"],
            ["Saint Laurent", "luxury_fashion", "French luxury fashion house", "Saint Laurent Paris"],
            ["Balenciaga", "luxury_fashion", "Spanish luxury fashion house", "Balenciaga"],
            ["Bottega Veneta", "luxury_fashion", "Italian luxury leather goods", "When your own initials are enough"],
            # Watch brands
            ["Rolex", "luxury_watches", "Swiss luxury watches and timepieces", "Perpetual, Spirit of Excellence"],
            ["Patek Philippe", "luxury_watches", "Swiss luxury watch manufacturer", "You never actually own a Patek Philippe"],
            ["Audemars Piguet", "luxury_watches", "Swiss luxury watch brand", "To break the rules, you must first master them"],
            ["Omega", "luxury_watches", "Swiss luxury watch manufacturer", "Precision"],
            ["TAG Heuer", "luxury_watches", "Swiss luxury watches", "Don't crack under pressure"],
            ["Breitling", "luxury_watches", "Swiss luxury watchmaker", "Instruments for Professionals"],
            ["IWC", "luxury_watches", "Swiss luxury watch company", "Engineered for men"],
            ["Jaeger-LeCoultre", "luxury_watches", "Swiss luxury watch manufacturer", "The World's Most Complicated Watches"],
            ["Vacheron Constantin", "luxury_watches", "Swiss luxury watch manufacturer", "One of Not Many"],
            ["A. Lange & Söhne", "luxury_watches", "German luxury watch manufacturer", "When nothing else will do"],
            # Fashion jewelry
            ["Pandora", "fashion_jewelry", "Danish jewelry brand charm bracelets", "Be Love"],
            ["Swarovski", "fashion_jewelry", "Austrian crystal jewelry and accessories", "Unleash Your Light"],
            ["Daniel Wellington", "fashion_watches", "Swedish watch brand minimalist design", "Live the moment"],
            ["Alex and Ani", "fashion_jewelry", "American jewelry brand spiritual bracelets", "Positive Energy"],
            ["Kendra Scott", "fashion_jewelry", "American jewelry designer colorful stones", "Live colorfully"],
            ["Monica Vinader", "fashion_jewelry", "British jewelry brand contemporary design", "Everyday luxury"],
            ["Mejuri", "fashion_jewelry", "Canadian jewelry brand everyday luxury", "Everyday fine"],
            ["Gorjana", "fashion_jewelry", "California jewelry brand layered necklaces", "Live your layer"],
            ["Kate Spade", "fashion_jewelry", "American fashion accessories jewelry", "Live colorfully"],
            ["Marc Jacobs", "fashion_jewelry", "American fashion designer accessories", "Marc Jacobs"],
            # Diamond and made-to-order jewelry
            ["Blue Nile", "diamond_jewelry", "online diamond jewelry retailer", "Extraordinary diamonds for extraordinary moments"],
            ["James Allen", "diamond_jewelry", "online engagement ring retailer", "See it. Love it. Own it."],
            ["Brilliant Earth", "diamond_jewelry", "ethical diamond jewelry", "Brilliant Earth"],
            ["With Clarity", "diamond_jewelry", "lab-grown diamond jewelry", "Diamonds. Redefined."],
            ["Clean Origin", "diamond_jewelry", "lab-created diamond jewelry", "Grown with love"],
            ["Ritani", "diamond_jewelry", "engagement rings and wedding bands", "Love is in the details"],
            ["Vrai", "diamond_jewelry", "lab-grown diamond jewelry", "Created, not mined"],
            ["Catbird", "jewelry", "Brooklyn-based jewelry designer", "Made in Brooklyn"],
            ["Wwake", "jewelry", "contemporary fine jewelry designer", "Wwake"],
            ["Jacquie Aiche", "jewelry", "California jewelry designer bohemian luxury", "Jacquie Aiche"],
            # Chinese jewelry brands
            ["周大福", "jewelry", "香港珠宝品牌黄金钻石", "心意足金"],
            ["周生生", "jewelry", "香港珠宝品牌传统工艺", "传承经典"],
            ["老凤祥", "jewelry", "中国传统珠宝品牌黄金首饰", "老凤祥,真金不怕火炼"],
            ["六福珠宝", "jewelry", "香港珠宝品牌时尚设计", "六福临门"],
            ["潘多拉", "jewelry", "丹麦珠宝品牌串珠手链", "表达你的故事"],
            ["周大生", "jewelry", "中国珠宝品牌钻石首饰", "爱就在一起"],
            ["金伯利", "jewelry", "中国钻石珠宝品牌", "只为更好的你"],
            ["戴比尔斯", "diamond_jewelry", "钻石开采珠宝品牌", "钻石恒久远,一颗永流传"],
            ["施华洛世奇", "crystal_jewelry", "奥地利水晶珠宝品牌", "释放你的光芒"],
            ["谢瑞麟", "jewelry", "香港珠宝设计师品牌", "艺术珠宝"],
            # Luxury accessories
            ["Goyard", "luxury_accessories", "French luxury leather goods", "Goyard"],
            ["Moynat", "luxury_accessories", "French luxury leather goods", "Moynat"],
            ["Berluti", "luxury_accessories", "French luxury leather goods", "Berluti"],
            ["Valextra", "luxury_accessories", "Italian luxury leather goods", "Milanese excellence since 1937"],
            ["Loewe", "luxury_accessories", "Spanish luxury leather goods", "Craft"],
            ["Brunello Cucinelli", "luxury_fashion", "Italian luxury fashion cashmere", "Humanistic Enterprise"],
            ["Loro Piana", "luxury_fashion", "Italian luxury textile and clothing", "Excellence in natural fibers"],
            ["Kiton", "luxury_fashion", "Italian luxury menswear", "The most beautiful thing made by man"],
            ["Zegna", "luxury_fashion", "Italian luxury menswear", "What makes a man"],
            ["Brioni", "luxury_fashion", "Italian luxury menswear", "Roman style"],
            # Emerging luxury brands
            ["Jacquemus", "luxury_fashion", "French luxury fashion house", "La Montagne"],
            ["Ganni", "luxury_fashion", "Danish fashion brand", "Ganni"],
            ["Staud", "luxury_fashion", "American fashion brand", "Staud"],
            ["Cult Gaia", "luxury_accessories", "American accessories brand", "Cult Gaia"],
            ["Rosantica", "jewelry", "Italian jewelry brand", "Rosantica"],
            ["Alighieri", "jewelry", "British jewelry brand", "The Inferno"],
            ["Lizzie Fortunato", "jewelry", "American jewelry brand", "Lizzie Fortunato"],
            ["Aurate", "jewelry", "American jewelry brand", "Accessible luxury"],
            ["AUrate New York", "jewelry", "New York jewelry brand", "Radically responsible luxury"],
            ["Missoma", "jewelry", "British jewelry brand", "Missoma"]
        ]
        # Convert to a DataFrame so the search text can be built column-wise.
        df = pd.DataFrame(data, columns=['brand', 'category', 'description', 'slogan'])
        # The text that gets embedded combines all descriptive fields.
        df['search_text'] = df['brand'] + ' ' + df['category'] + ' ' + df['description']
        return df.to_dict('records')

    def build_index(self, data):
        """Embed every record's ``search_text`` and build the FAISS index.

        Args:
            data: list of record dicts, each containing a ``search_text`` key
                (the shape returned by :meth:`create_dataset`).

        Raises:
            ValueError: if ``data`` is empty.
        """
        print("🔨 Building FAISS index...")
        texts = [item['search_text'] for item in data]
        if not texts:
            raise ValueError("Cannot build an index from an empty dataset")
        # Unit-length vectors make the inner product equal to cosine
        # similarity, which is what the reported score is meant to be.
        embeddings = self.encoder.encode(
            texts, show_progress_bar=True, normalize_embeddings=True
        ).astype('float32')
        # Take the dimension from the embeddings instead of hard-coding it,
        # so that changing the encoder model cannot desynchronise the index.
        index = faiss.IndexFlatIP(embeddings.shape[1])
        index.add(embeddings)
        # Publish index and records together only after both are ready.
        self.index = index
        self.slogans = data
        print(f"✅ Index built with {len(data)} slogans")

    def search(self, query, k=5):
        """Return up to ``k`` records most similar to ``query``.

        Args:
            query: free-text product / business description.
            k: maximum number of results requested from the index.

        Returns:
            A list of copies of the matching records, best match first as
            ordered by the index, each with an added ``similarity_score``
            float.

        Raises:
            ValueError: if no index has been built or loaded yet.
        """
        if self.index is None:
            raise ValueError("Index not built yet!")
        query_embedding = self.encoder.encode([query], normalize_embeddings=True)
        scores, indices = self.index.search(query_embedding.astype('float32'), k)
        total = len(self.slogans)
        results = []
        for score, idx in zip(scores[0], indices[0]):
            idx = int(idx)
            # FAISS pads missing neighbours with -1 when k exceeds the number
            # of stored vectors; a negative index would otherwise silently
            # wrap around to the last record.
            if not 0 <= idx < total:
                continue
            result = dict(self.slogans[idx])
            result['similarity_score'] = float(score)
            results.append(result)
        return results

    def save(self, path="slogan_db"):
        """Write the index to ``{path}.faiss`` and the records to ``{path}.json``.

        Raises:
            ValueError: if no index has been built or loaded yet.
        """
        if self.index is None:
            raise ValueError("Index not built yet!")
        faiss.write_index(self.index, f"{path}.faiss")
        with open(f"{path}.json", 'w', encoding='utf-8') as f:
            json.dump(self.slogans, f, ensure_ascii=False, indent=2)
        print(f"💾 Database saved to {path}")

    def load(self, path="slogan_db"):
        """Load the index and records previously written by :meth:`save`.

        Returns:
            ``True`` on success. ``False`` if either file could not be read,
            in which case the current ``index`` and ``slogans`` are left
            untouched.
        """
        try:
            index = faiss.read_index(f"{path}.faiss")
            with open(f"{path}.json", 'r', encoding='utf-8') as f:
                slogans = json.load(f)
        except Exception:
            # Loading is best-effort (callers fall back to rebuilding), but
            # KeyboardInterrupt / SystemExit must still propagate.
            print(f"❌ Failed to load database from {path}")
            return False
        # Assign only after both parts loaded, so a half-failed load cannot
        # leave a new index paired with stale records.
        self.index = index
        self.slogans = slogans
        print(f"📂 Database loaded from {path}")
        return True
def main():
    """Load or build the slogan database, then print a few demo searches."""
    print("🚀 Creating Slogan Database...")
    database = SloganDatabase()

    # Reuse a previously saved database when available; otherwise create
    # the dataset, index it and persist it.
    loaded = database.load()
    if not loaded:
        print("📊 Creating new database...")
        records = database.create_dataset()
        database.build_index(records)
        database.save()

    # Sample queries used to exercise the search.
    demo_queries = (
        "钻石订婚戒指",
        "奢侈品手袋",
        "瑞士手表品牌",
        "珍珠首饰",
        "黄金项链",
        "时尚耳环",
        "luxury jewelry brand",
        "designer handbag",
        "crystal accessories",
        "wedding rings",
    )
    divider = "-" * 40

    print("\n🔍 Testing searches...")
    for text in demo_queries:
        print(f"\n查询: {text}")
        print(divider)
        for rank, hit in enumerate(database.search(text, k=3), start=1):
            print(f"{rank}. {hit['brand']} ({hit['category']})")
            print(f" 描述: {hit['description']}")
            print(f" 标语: {hit['slogan']}")
            print(f" 相似度: {hit['similarity_score']:.3f}")
            # NOTE(review): the source's indentation was lost; the blank
            # line is assumed to follow each individual result - confirm.
            print()
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()