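"""Web Search API: a FastAPI app that searches Google for a list of queries
and optionally extracts the main text content from each result page."""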
import argparse
import os
import sys
import uvicorn

from fastapi import FastAPI, Depends
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from pydantic import BaseModel, Field
from typing import Union
from sse_starlette.sse import EventSourceResponse, ServerSentEvent

from utils.logger import logger
from networks.google_searcher import GoogleSearcher
from networks.html_fetcher import HTMLFetcher
from documents.query_results_extractor import QueryResultsExtractor
from documents.webpage_content_extractor import WebpageContentExtractor


class SearchAPIApp:
    def __init__(self):
        self.app = FastAPI(
            docs_url="/",
            title="Web Search API",
            swagger_ui_parameters={"defaultModelsExpandDepth": -1},
            version="1.0",
        )
        self.setup_routes()

    class QueriesToSearchResultsPostItem(BaseModel):
        queries: list = Field(
            default=[""],
            description="(list[str]) Queries to search",
        )
        result_num: int = Field(
            default=10,
            description="(int) Number of search results",
        )
        safe: bool = Field(
            default=False,
            description="(bool) Enable SafeSearch",
        )
        types: list = Field(
            default=["web"],
            description="(list[str]) Types of search results: `web`, `image`, `videos`, `news`",
        )
        extract_content: bool = Field(
            default=False,
            description="(bool) Enable extracting main text content from webpages; adds a `text` field to each `query_result` dict",
        )
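
    # A minimal example request body for this model (illustrative values,
    # not taken from the original source):
    #   {
    #       "queries": ["fastapi tutorial"],
    #       "result_num": 10,
    #       "safe": false,
    #       "types": ["web"],
    #       "extract_content": true
    #   }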

    def queries_to_search_results(self, item: QueriesToSearchResultsPostItem):
        google_searcher = GoogleSearcher()
        query_results_extractor = QueryResultsExtractor()
        queries_search_results = []
        for query in item.queries:
            # Skip empty or whitespace-only queries
            if not query.strip():
                continue
            # Fetch the Google results page for this query, then parse it
            query_html_path = google_searcher.search(
                query=query, result_num=item.result_num, safe=item.safe
            )
            query_search_results = query_results_extractor.extract(query_html_path)
            queries_search_results.append(query_search_results)
        logger.note(queries_search_results)

        if item.extract_content:
            # Optionally fetch each result URL and attach its main text content
            html_fetcher = HTMLFetcher()
            webpage_content_extractor = WebpageContentExtractor()
            for query_idx, query_search_result in enumerate(queries_search_results):
                for query_result_idx, query_result in enumerate(
                    query_search_result["query_results"]
                ):
                    html_path = html_fetcher.fetch(query_result["url"])
                    extracted_content = webpage_content_extractor.extract(html_path)
                    queries_search_results[query_idx]["query_results"][
                        query_result_idx
                    ]["text"] = extracted_content
        return queries_search_results
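
    # The response shape is inferred from the handler above (not documented
    # upstream): a list with one dict per query, each holding a
    # "query_results" list of result dicts, where each result has a "url"
    # key plus a "text" key when `extract_content` is enabled.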

    def setup_routes(self):
        # Calling the decorator returned by `app.post(...)` directly
        # registers the bound method as the route handler.
        self.app.post(
            "/queries_to_search_results",
            summary="Search queries and extract contents from results",
        )(self.queries_to_search_results)


class ArgParser(argparse.ArgumentParser):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.add_argument(
            "-s",
            "--server",
            type=str,
            default="0.0.0.0",
            help="Server IP for Web Search API",
        )
        self.add_argument(
            "-p",
            "--port",
            type=int,
            default=21111,
            help="Server Port for Web Search API",
        )
        self.add_argument(
            "-d",
            "--dev",
            default=False,
            action="store_true",
            help="Run in dev mode",
        )
        self.args = self.parse_args(sys.argv[1:])
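

# `app` must be a module-level attribute so that uvicorn can resolve the
# "__main__:app" import string passed below (an import string, rather than
# the app object itself, is required for reload mode to work).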
app = SearchAPIApp().app

if __name__ == "__main__":
    args = ArgParser().args
    if args.dev:
        uvicorn.run("__main__:app", host=args.server, port=args.port, reload=True)
    else:
        uvicorn.run("__main__:app", host=args.server, port=args.port, reload=False)

    # python -m apis.search_api      # [Docker] in production mode
    # python -m apis.search_api -d   # [Dev] in development mode
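
# Example client call (a hypothetical usage sketch; assumes the server is
# running locally on the default port configured above):
#
#   import requests
#   response = requests.post(
#       "http://127.0.0.1:21111/queries_to_search_results",
#       json={"queries": ["fastapi tutorial"], "extract_content": True},
#   )
#   print(response.json())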