nsarrazin HF staff committed on
Commit
e3af794
·
unverified ·
1 Parent(s): 4eb33c8

Option for running websearch locally (#563)

Browse files
.env CHANGED
@@ -14,6 +14,7 @@ OPENAI_API_KEY=#your openai api key here
14
  YDC_API_KEY=#your docs.you.com api key here
15
  SERPER_API_KEY=#your serper.dev api key here
16
  SERPAPI_KEY=#your serpapi key here
 
17
 
18
  # Parameters to enable open id login
19
  OPENID_CONFIG=`{
 
14
  YDC_API_KEY=#your docs.you.com api key here
15
  SERPER_API_KEY=#your serper.dev api key here
16
  SERPAPI_KEY=#your serpapi key here
17
+ USE_LOCAL_WEBSEARCH=#set to true to parse google results yourself, overrides other API keys
18
 
19
  # Parameters to enable open id login
20
  OPENID_CONFIG=`{
src/lib/server/websearch/runWebSearch.ts CHANGED
@@ -15,6 +15,8 @@ import { getWebSearchProvider } from "./searchWeb";
15
  const MAX_N_PAGES_SCRAPE = 10 as const;
16
  const MAX_N_PAGES_EMBED = 5 as const;
17
 
 
 
18
  export async function runWebSearch(
19
  conv: Conversation,
20
  prompt: string,
@@ -45,14 +47,14 @@ export async function runWebSearch(
45
  const results = await searchWeb(webSearch.searchQuery);
46
  webSearch.results =
47
  (results.organic_results &&
48
- results.organic_results.map((el: { title: string; link: string; text?: string }) => {
49
  const { title, link, text } = el;
50
  const { hostname } = new URL(link);
51
  return { title, link, hostname, text };
52
  })) ??
53
  [];
54
  webSearch.results = webSearch.results
55
- .filter(({ link }) => !link.includes("youtube.com")) // filter out youtube links
56
  .slice(0, MAX_N_PAGES_SCRAPE); // limit to first 10 links only
57
 
58
  let paragraphChunks: { source: WebSearchSource; text: string }[] = [];
 
15
  const MAX_N_PAGES_SCRAPE = 10 as const;
16
  const MAX_N_PAGES_EMBED = 5 as const;
17
 
18
+ const DOMAIN_BLOCKLIST = ["youtube.com", "twitter.com"];
19
+
20
  export async function runWebSearch(
21
  conv: Conversation,
22
  prompt: string,
 
47
  const results = await searchWeb(webSearch.searchQuery);
48
  webSearch.results =
49
  (results.organic_results &&
50
+ results.organic_results.map((el: { title?: string; link: string; text?: string }) => {
51
  const { title, link, text } = el;
52
  const { hostname } = new URL(link);
53
  return { title, link, hostname, text };
54
  })) ??
55
  [];
56
  webSearch.results = webSearch.results
57
+ .filter(({ link }) => !DOMAIN_BLOCKLIST.some((el) => link.includes(el))) // filter out blocklist links
58
  .slice(0, MAX_N_PAGES_SCRAPE); // limit to first 10 links only
59
 
60
  let paragraphChunks: { source: WebSearchSource; text: string }[] = [];
src/lib/server/websearch/searchWeb.ts CHANGED
@@ -1,8 +1,9 @@
1
  import type { YouWebSearch } from "../../types/WebSearch";
2
  import { WebSearchProvider } from "../../types/WebSearch";
3
- import { SERPAPI_KEY, SERPER_API_KEY, YDC_API_KEY } from "$env/static/private";
4
  import { getJson } from "serpapi";
5
  import type { GoogleParameters } from "serpapi";
 
6
 
7
  // get which SERP api is providing web results
8
  export function getWebSearchProvider() {
@@ -11,6 +12,9 @@ export function getWebSearchProvider() {
11
 
12
  // Show result as JSON
13
  export async function searchWeb(query: string) {
 
 
 
14
  if (SERPER_API_KEY) {
15
  return await searchWebSerper(query);
16
  }
 
1
  import type { YouWebSearch } from "../../types/WebSearch";
2
  import { WebSearchProvider } from "../../types/WebSearch";
3
+ import { SERPAPI_KEY, SERPER_API_KEY, USE_LOCAL_WEBSEARCH, YDC_API_KEY } from "$env/static/private";
4
  import { getJson } from "serpapi";
5
  import type { GoogleParameters } from "serpapi";
6
+ import { searchWebLocal } from "./searchWebLocal";
7
 
8
  // get which SERP api is providing web results
9
  export function getWebSearchProvider() {
 
12
 
13
  // Show result as JSON
14
  export async function searchWeb(query: string) {
15
+ if (USE_LOCAL_WEBSEARCH) {
16
+ return await searchWebLocal(query);
17
+ }
18
  if (SERPER_API_KEY) {
19
  return await searchWebSerper(query);
20
  }
src/lib/server/websearch/searchWebLocal.ts ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { JSDOM, VirtualConsole } from "jsdom";
2
+
3
/**
 * Runs a web search by fetching a Google results page directly and scraping
 * the result links out of the returned HTML, without any third-party SERP API.
 *
 * @param query - Free-text search query; it is URL-encoded before being sent.
 * @returns An object whose `organic_results` array holds one `{ link }` entry
 *   per distinct result URL found on the page, in page order.
 * @throws Error if the request fails or is aborted after 10 seconds, or if the
 *   returned page contains no `<a>` elements at all.
 */
export async function searchWebLocal(
	query: string
): Promise<{ organic_results: { link: string }[] }> {
	const REDIRECT_PREFIX = "/url?q=";
	const TRACKING_MARKER = "&sa=";

	const abortController = new AbortController();
	const timeoutId = setTimeout(() => abortController.abort(), 10000);

	let htmlString: string;
	try {
		// Encode the query so characters such as "&", "#", "+" or "%" cannot
		// terminate or alter the `q` parameter.
		const response = await fetch(
			"https://www.google.com/search?hl=en&q=" + encodeURIComponent(query),
			{ signal: abortController.signal }
		);
		// Reading the body is still covered by the abort signal.
		htmlString = await response.text();
	} finally {
		// Do not leave the abort timer pending once the request has settled.
		clearTimeout(timeoutId);
	}

	const virtualConsole = new VirtualConsole();

	virtualConsole.on("error", () => {
		// No-op to skip console errors.
	});

	// put the html string into a DOM
	const dom = new JSDOM(htmlString, {
		virtualConsole,
	});

	const { document } = dom.window;

	// get all anchor elements of the page
	const links = document.querySelectorAll("a");

	if (!links.length) {
		throw new Error(`webpage doesn't have any "a" element`);
	}

	// take urls that start with /url?q=
	// and do not contain google.com links
	// and strip them up to '&sa='
	const linksHref = Array.from(links)
		.filter((el) => el.href?.startsWith(REDIRECT_PREFIX) && !el.href.includes("google.com/"))
		.map((el) => {
			const href = el.href;
			const markerIndex = href.indexOf(TRACKING_MARKER);
			// When the marker is missing, indexOf returns -1; slicing to -1 would
			// drop the last character of the URL, so keep the remainder instead.
			return href.slice(REDIRECT_PREFIX.length, markerIndex === -1 ? undefined : markerIndex);
		});

	// remove duplicate links and map links to the correct object shape
	return { organic_results: [...new Set(linksHref)].map((link) => ({ link })) };
}