import asyncio
import os

import jsonlines
import numpy as np

from factool.knowledge_qa.google_serper import GoogleSerperAPIWrapper
from factool.utils.openai_wrapper import OpenAIEmbed

class google_search():
    """Async wrapper around the Serper Google Search API used for web evidence retrieval."""

    def __init__(self, snippet_cnt):
        self.serper = GoogleSerperAPIWrapper(snippet_cnt=snippet_cnt)

    async def run(self, queries):
        return await self.serper.run(queries)

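# Usage sketch (illustrative, not part of the library): GoogleSerperAPIWrapper
# is assumed to pick up its Serper API key from the environment, as configured
# elsewhere in factool.
#
#   searcher = google_search(snippet_cnt=10)
#   evidences = asyncio.run(searcher.run([["query one", "query two"]]))
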
class local_search():
    """Embedding-based retrieval over a local JSONL corpus."""

    def __init__(self, snippet_cnt, data_link, embedding_link=None):
        self.snippet_cnt = snippet_cnt
        self.data_link = data_link
        self.embedding_link = embedding_link
        self.openai_embed = OpenAIEmbed()
        self.data = None
        self.embedding = None
        # asyncio.run() raises RuntimeError inside an already-running event
        # loop, so this class must be constructed from synchronous code.
        asyncio.run(self.init_async())

    async def init_async(self):
        print("init local search")
        self.load_data_by_link()
        if self.embedding_link is None:
            # No precomputed embeddings supplied: compute them now and cache
            # them to disk via save_embeddings().
            await self.calculate_embedding()
        else:
            self.load_embedding_by_link()
        print("loaded data and embedding")

    def add_suffix_to_json_filename(self, filename):
        """Derive the embedding cache path, e.g. 'corpus.jsonl' -> 'corpus_embed.jsonl'."""
        base_name, extension = os.path.splitext(filename)
        return base_name + '_embed' + extension

    def load_data_by_link(self):
        """Load the corpus: one JSON object per line with the snippet under the 'text' key."""
        self.data = []
        with jsonlines.open(self.data_link) as reader:
            for obj in reader:
                self.data.append(obj['text'])

    def load_embedding_by_link(self):
        """Load precomputed embeddings: one vector per line, aligned with self.data."""
        self.embedding = []
        with jsonlines.open(self.embedding_link) as reader:
            for obj in reader:
                self.embedding.append(obj)

    def save_embeddings(self):
        """Persist embeddings next to the corpus; pass the resulting file as
        embedding_link on later runs to skip recomputation."""
        with jsonlines.open(self.add_suffix_to_json_filename(self.data_link), mode='w') as writer:
            writer.write_all(self.embedding)

    async def calculate_embedding(self):
        result = await self.openai_embed.process_batch(self.data, retry=3)
        self.embedding = [emb["data"][0]["embedding"] for emb in result]
        self.save_embeddings()

    async def search(self, query):
        result = await self.openai_embed.create_embedding(query)
        query_embed = result["data"][0]["embedding"]
        # OpenAI embeddings are unit-length, so the dot product ranks the
        # corpus by cosine similarity to the query.
        dot_product = np.dot(self.embedding, query_embed)
        sorted_indices = np.argsort(dot_product)[::-1]
        top_k_indices = sorted_indices[:self.snippet_cnt]
        return [{"content": self.data[i], "source": "local"} for i in top_k_indices]

    async def run(self, queries):
        # `queries` is a list of query pairs (two queries per claim). Flatten
        # the pairs, search them all concurrently, then merge each adjacent
        # pair of result lists back into one evidence list per claim.
        flattened_queries = []
        for sublist in queries:
            if sublist is None:
                sublist = ['None', 'None']
            for item in sublist:
                flattened_queries.append(item)

        snippets = await asyncio.gather(*[self.search(query) for query in flattened_queries])
        snippets_merged = [snippets[i] + snippets[i + 1] for i in range(0, len(snippets), 2)]
        return snippets_merged
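
# Usage sketch (illustrative): 'corpus.jsonl' is a hypothetical JSONL file with
# a 'text' field per line, and the OpenAI API key is assumed to be configured
# the way OpenAIEmbed expects.
if __name__ == "__main__":
    searcher = local_search(snippet_cnt=5, data_link="corpus.jsonl")
    # One pair of queries for a single claim; `run` returns one merged
    # evidence list per claim.
    evidences = asyncio.run(searcher.run([["Where is the Eiffel Tower?",
                                           "Eiffel Tower location"]]))
    print(evidences)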