Nexus / app /api /langchain-tools /duckduckgo_search.ts
Severian's picture
Upload 269 files
f0499d2
raw
history blame
14.3 kB
import { decode } from "html-entities";
import { convert as htmlToText } from "html-to-text";
import { Tool } from "langchain/tools";
/**
 * Captures the JSON array passed to `DDG.pageLayout.load('d', …)` (the web
 * results). The match must be immediately followed by the start of the
 * `DDG.duckbar.load('images'` call.
 */
const SEARCH_REGEX =
/DDG\.pageLayout\.load\('d',(\[.+\])\);DDG\.duckbar\.load\('images'/;
/**
 * Captures the JSON payload passed to `DDG.duckbar.load('images', …)`.
 * The payload must start with an `"ads"` key, end with a `"vqd"` object, and be
 * followed by the start of the `'news` load call.
 */
const IMAGES_REGEX =
/;DDG\.duckbar\.load\('images', ({"ads":.+"vqd":{".+":"\d-\d+-\d+"}})\);DDG\.duckbar\.load\('news/;
/**
 * Captures the JSON payload passed to `DDG.duckbar.load('news', …)`; the match
 * must be followed by the start of the `'videos` load call.
 */
const NEWS_REGEX =
/;DDG\.duckbar\.load\('news', ({"ads":.+"vqd":{".+":"\d-\d+-\d+"}})\);DDG\.duckbar\.load\('videos/;
/**
 * Captures the JSON payload passed to `DDG.duckbar.load('videos', …)`; the
 * match must be followed by the start of the `'related_searches` loadModule call.
 */
const VIDEOS_REGEX =
/;DDG\.duckbar\.load\('videos', ({"ads":.+"vqd":{".+":"\d-\d+-\d+"}})\);DDG\.duckbar\.loadModule\('related_searches/;
/**
 * Captures the JSON payload passed to
 * `DDG.duckbar.loadModule('related_searches', …)`; the match must be followed
 * by the start of the `'products` load call.
 */
const RELATED_SEARCHES_REGEX =
/DDG\.duckbar\.loadModule\('related_searches', ({"ads":.+"vqd":{".+":"\d-\d+-\d+"}})\);DDG\.duckbar\.load\('products/;
/**
 * Captures the token from a quoted `vqd='…'` / `vqd="…"` assignment.
 * The token is two or three dash-separated groups of digits.
 */
const VQD_REGEX = /vqd=['"](\d+-\d+(?:-\d+)?)['"]/;
/**
 * One raw web-result entry of the array captured by `SEARCH_REGEX`.
 * Keys are the short names used by the payload itself; `search` maps the
 * relevant ones onto `SearchResult`.
 */
interface CallbackSearchResult {
/** Website description (`search` passes it through `decode` for `SearchResult.description`). */
a: string;
/** Unknown */
ae: null;
/** ddg!bang information, split by `search` on tab characters into prefix, title and domain (ex. w Wikipedia en.wikipedia.org) */
b?: string;
/** URL */
c: string;
/** URL of some sort. `search` treats the value "google.com search" on a lone entry as "no results". */
d: string;
/** Class name associations. */
da?: string;
/** Unknown */
h: number;
/** Website hostname */
i: string;
/** Unknown */
k: null;
/** Unknown */
m: number;
/** Unknown */
o: number;
/** Unknown */
p: number;
/** Unknown */
s: string;
/** Website Title. A lone entry titled "EOF" without `da` is treated by `search` as "no results". */
t: string;
/** Website URL */
u: string;
}
/**
 * Pagination entry that can appear in the web-results array alongside
 * `CallbackSearchResult` items. `search` recognises it by the presence of the
 * `n` key and skips it.
 */
interface CallbackNextSearch {
/** URL to the next page of results */
n: string;
}
/**
 * Shape of the JSON payload captured for a duckbar module (images, news,
 * videos, related searches). `T` is the module-specific result type.
 * Only `results` is read by `search`; the other fields are typed from the
 * payload but unused in this file.
 */
interface CallbackDuckbarPayload<T> {
/** Ads attached to the module, or `null` (not read in this file). */
ads: null | any[];
/** The query string (presumably as typed — not read in this file). */
query: string;
/** The query string in encoded form (presumably URL-encoded — not read in this file). */
queryEncoded: string;
/** Response type reported by the payload (not read in this file). */
response_type: string;
/** The module's result entries. */
results: T[];
/** VQD token(s) keyed by query string. */
vqd: {
[query: string]: string;
};
}
/**
 * One image entry of the duckbar "images" payload. `search` returns these
 * entries in `SearchResults.images` with `title` run through `decode`.
 */
interface DuckbarImageResult {
/** The height of the image in pixels. */
height: number;
/** The image URL. */
image: string;
/** The source of the image. */
source: string;
/** The thumbnail URL. */
thumbnail: string;
/** The title (or caption) of the image. */
title: string;
/** The website URL of where the image came from. */
url: string;
/** The width of the image in pixels. */
width: number;
}
/**
 * One raw video entry of the duckbar "videos" payload. `search` converts each
 * entry into a `VideoResult`.
 */
interface DuckbarVideoResult {
/** URL of the video (becomes `VideoResult.url`) */
content: string;
/** Description of the video */
description: string;
/** Duration of the video */
duration: string;
/** Embed HTML for the video */
embed_html: string;
/** Embed URL for the video */
embed_url: string;
/** Thumbnail images of the video; `search` picks the first non-empty of large, medium, small, motion */
images: {
large: string;
medium: string;
motion: string;
small: string;
};
/** Where this search result came from */
provider: string;
/** ISO timestamp of the upload */
published: string;
/** What site the video was on (becomes `VideoResult.publishedOn`) */
publisher: string;
/** Various statistics */
statistics: {
/** View count of the video */
viewCount: number | null;
};
/** Title of the video */
title: string;
/** Name of the video uploader(?) (becomes `VideoResult.publisher`) */
uploader: string;
}
/**
 * One raw entry of the duckbar "related_searches" payload. `search` converts
 * each entry into a `RelatedResult`.
 */
interface DuckbarRelatedSearch {
/** Display form of the suggestion (becomes `RelatedResult.raw`). */
display_text: string;
/** Text of the suggestion (becomes `RelatedResult.text`). */
text: string;
/** Search URL for the suggestion (not read in this file). */
web_search_url: string;
}
/**
 * One raw article entry of the duckbar "news" payload. `search` converts each
 * entry into a `NewsResult`.
 */
interface DuckbarNewsResult {
/** Timestamp of the article (copied to `NewsResult.date`). */
date: number;
/** Excerpt of the article (`search` runs it through `decode`). */
excerpt: string;
/** Image URL of the article, if any. */
image?: string;
/** Relative time of the article (becomes `NewsResult.relativeTime`). */
relative_time: string;
/** Where the article was syndicated from. */
syndicate: string;
/** Title of the article (`search` runs it through `decode`). */
title: string;
/** URL of the article. */
url: string;
/** Not read in this file; meaning unknown. */
use_relevancy: number;
/** Flag for old articles; any truthy value becomes `NewsResult.isOld === true`. */
is_old?: number;
/** Not read in this file; meaning unknown. */
fetch_image?: number;
}
/**
 * Value returned by `search`. The optional sections are only set when the
 * corresponding duckbar module is found in the response.
 */
interface SearchResults {
/** Whether there were no results found. */
noResults: boolean;
/** The VQD of the search query. */
vqd: string;
/** The web results of the search. */
results: SearchResult[];
/** The image results of the search. */
images?: DuckbarImageResult[];
/** The news article results of the search. */
news?: NewsResult[];
/** The video results of the search. */
videos?: VideoResult[];
/** The related searches of the query. */
related?: RelatedResult[];
}
/** A video result as exposed in `SearchResults.videos`. */
interface VideoResult {
/** The URL of the video. */
url: string;
/** The title of the video. */
title: string;
/** The description of the video. */
description: string;
/** The image URL of the video. */
image: string;
/** The duration of the video. (i.e. "9:20") */
duration: string;
/** The ISO timestamp of when the video was published. */
published: string;
/** Where the video was published on. (i.e. "YouTube") */
publishedOn: string;
/** The name of who uploaded the video. */
publisher: string;
/** The view count of the video. */
viewCount?: number;
}
/** A news article result as exposed in `SearchResults.news`. */
interface NewsResult {
/** The timestamp of when the article was created. */
date: number;
/** An excerpt of the article. */
excerpt: string;
/** The image URL used in the article. */
image?: string;
/** The relative time of when the article was posted, in human readable format. */
relativeTime: string;
/** Where this article was indexed from. */
syndicate: string;
/** The title of the article. */
title: string;
/** The URL of the article. */
url: string;
/** Whether this article is classified as old. */
isOld: boolean;
}
/** A web result as exposed in `SearchResults.results`. */
interface SearchResult {
/** The hostname of the website. (i.e. "google.com") */
hostname: string;
/** The URL of the result. */
url: string;
/** The title of the result. */
title: string;
/**
* The sanitized description of the result (HTML entities decoded).
* Bold tags will still be present in this string.
*/
description: string;
/** The description of the result, exactly as received. */
rawDescription: string;
/** The icon of the website (an `external-content.duckduckgo.com/ip3/<hostname>.ico` URL). */
icon: string;
/** The ddg!bang information of the website, if any. */
bang?: SearchResultBang;
}
/**
 * ddg!bang information attached to a web result, built by `search` from the
 * tab-separated `b` field of the raw entry.
 */
interface SearchResultBang {
/** The prefix of the bang. (i.e. "w" for !w) */
prefix: string;
/** The title of the bang. */
title: string;
/** The domain of the bang. */
domain: string;
}
/** A related-search suggestion as exposed in `SearchResults.related`. */
interface RelatedResult {
/** The suggestion text (the payload's `text` field). */
text: string;
/** The suggestion's display form (the payload's `display_text` field). */
raw: string;
}
/**
 * Preset time ranges for a search. The member's string value is what `search`
 * sends as the `df` query parameter.
 */
enum SearchTimeType {
/** From any time. */
ALL = "a",
/** From the past day. */
DAY = "d",
/** From the past week. */
WEEK = "w",
/** From the past month. */
MONTH = "m",
/** From the past year. */
YEAR = "y",
}
/**
 * Options accepted by `search`. Missing fields are filled from
 * `defaultOptions` by `sanityCheck`.
 */
interface SearchOptions {
/** The safe search type of the search. Defaults to `SafeSearchType.OFF`. */
safeSearch?: SafeSearchType;
/** The time range of the searches, can be a SearchTimeType or a date range ("2021-03-16..2021-03-30"). Defaults to `SearchTimeType.ALL`. */
time?: SearchTimeType | string;
/** The locale(?) of the search. Defaults to "en-us". */
locale?: string;
/** The region of the search. Defaults to "wt-wt" or all regions. */
region?: string;
/** The market region(?) of the search. Defaults to "us". */
marketRegion?: string;
/** The number to offset the results to. Defaults to 0; must not be negative. */
offset?: number;
/**
* The string that acts like a key to a search.
* Set this if you made a search with the same query.
* When omitted, `search` fetches one via `getVQD`.
*/
vqd?: string;
}
/**
 * Safe-search levels. For every level except `STRICT`, `search` sends the
 * numeric value as the `ex` query parameter; for `STRICT` it sends `p=1`
 * instead.
 */
enum SafeSearchType {
/** Strict filtering, no NSFW content. */
STRICT = 0,
/** Moderate filtering. */
MODERATE = -1,
/** No filtering. */
OFF = -2,
}
/**
 * Options used by `search` when the caller passes none, and the base that
 * `sanityCheck` merges caller-supplied options onto.
 */
const defaultOptions: SearchOptions = {
// No safe-search filtering by default.
safeSearch: SafeSearchType.OFF,
time: SearchTimeType.ALL,
locale: "en-us",
region: "wt-wt",
offset: 0,
marketRegion: "us",
};
/**
 * Extracts one duckbar module payload from the raw `d.js` response text.
 *
 * @param pattern Regex whose first capture group is the module's JSON payload.
 * @param data Raw response body.
 * @returns The parsed payload, or `undefined` when the module is not present.
 */
function parseDuckbarPayload<T>(
  pattern: RegExp,
  data: string,
): CallbackDuckbarPayload<T> | undefined {
  const match = pattern.exec(data);
  if (!match) return undefined;
  // Literal tab characters are not valid inside JSON strings; swap them for spaces.
  return JSON.parse(match[1].replace(/\t/g, " ")) as CallbackDuckbarPayload<T>;
}

/**
 * Runs a DuckDuckGo search by requesting the `d.js` endpoint and scraping the
 * JavaScript callbacks contained in the response.
 *
 * @param query The search query; must be non-empty.
 * @param options Search options. When omitted, `defaultOptions` is used as-is;
 * otherwise the options are validated and completed by `sanityCheck`.
 * @returns The web results plus whichever of images / news / videos / related
 * searches were present in the response.
 * @throws Error if the query is empty, if the response carries the
 * `DDG.deep.is506` server-error marker, or if the web results cannot be
 * located in the response. Errors from `sanityCheck` and `getVQD` propagate.
 */
async function search(
  query: string,
  options?: SearchOptions,
): Promise<SearchResults> {
  if (!query) throw new Error("Query cannot be empty!");
  if (!options) options = defaultOptions;
  else options = sanityCheck(options);

  // A VQD token is required by the endpoint; fetch one unless the caller supplied it.
  let vqd = options.vqd!;
  if (!vqd) vqd = await getVQD(query, "web");

  const queryObject: Record<string, string> = {
    q: query,
    ...(options.safeSearch !== SafeSearchType.STRICT ? { t: "D" } : {}),
    l: options.locale!,
    ...(options.safeSearch === SafeSearchType.STRICT ? { p: "1" } : {}),
    kl: options.region || "wt-wt",
    s: String(options.offset),
    dl: "en",
    ct: "US",
    ss_mkt: options.marketRegion!,
    df: options.time! as string,
    vqd,
    ...(options.safeSearch !== SafeSearchType.STRICT
      ? { ex: String(options.safeSearch) }
      : {}),
    sp: "1",
    bpa: "1",
    biaexp: "b",
    msvrtexp: "b",
    ...(options.safeSearch === SafeSearchType.STRICT
      ? {
          videxp: "a",
          nadse: "b",
          eclsexp: "a",
          stiaexp: "a",
          tjsexp: "b",
          related: "b",
          msnexp: "a",
        }
      : {
          nadse: "b",
          eclsexp: "b",
          tjsexp: "b",
          // cdrexp: 'b'
        }),
  };

  const response = await fetch(
    `https://links.duckduckgo.com/d.js?${queryString(queryObject)}`,
  );
  const data = await response.text();
  if (data.includes("DDG.deep.is506"))
    throw new Error("A server error occurred!");

  // Fail with a clear message instead of dereferencing a null match.
  const searchMatch = SEARCH_REGEX.exec(data);
  if (!searchMatch)
    throw new Error("Failed to find the search results in the response!");
  const searchResults = JSON.parse(
    searchMatch[1].replace(/\t/g, " "),
  ) as (CallbackSearchResult | CallbackNextSearch)[];

  // A lone placeholder entry means the query produced no results.
  if (searchResults.length === 1 && !("n" in searchResults[0])) {
    const onlyResult = searchResults[0] as CallbackSearchResult;
    /* istanbul ignore next */
    if (
      (!onlyResult.da && onlyResult.t === "EOF") ||
      !onlyResult.a ||
      onlyResult.d === "google.com search"
    )
      return {
        noResults: true,
        vqd,
        results: [],
      };
  }

  const results: SearchResults = {
    noResults: false,
    vqd,
    results: [],
  };

  for (const entry of searchResults) {
    // Pagination entries only carry the next-page URL.
    if ("n" in entry) continue;
    let bang: SearchResultBang | undefined;
    if (entry.b) {
      const [prefix, title, domain] = entry.b.split("\t");
      bang = { prefix, title, domain };
    }
    results.results.push({
      title: entry.t,
      description: decode(entry.a),
      rawDescription: entry.a,
      hostname: entry.i,
      icon: `https://external-content.duckduckgo.com/ip3/${entry.i}.ico`,
      url: entry.u,
      bang,
    });
  }

  // Images
  const images = parseDuckbarPayload<DuckbarImageResult>(IMAGES_REGEX, data);
  if (images) {
    // Copy each entry rather than mutating the parsed payload in place.
    results.images = images.results.map((image) => ({
      ...image,
      title: decode(image.title),
    }));
  }

  // News
  const news = parseDuckbarPayload<DuckbarNewsResult>(NEWS_REGEX, data);
  if (news) {
    results.news = news.results.map(
      (article): NewsResult => ({
        date: article.date,
        excerpt: decode(article.excerpt),
        image: article.image,
        relativeTime: article.relative_time,
        syndicate: article.syndicate,
        title: decode(article.title),
        url: article.url,
        isOld: !!article.is_old,
      }),
    );
  }

  // Videos
  const videos = parseDuckbarPayload<DuckbarVideoResult>(VIDEOS_REGEX, data);
  if (videos) {
    /* istanbul ignore next */
    results.videos = videos.results.map(
      (video): VideoResult => ({
        url: video.content,
        title: decode(video.title),
        description: decode(video.description),
        image:
          video.images.large ||
          video.images.medium ||
          video.images.small ||
          video.images.motion,
        duration: video.duration,
        publishedOn: video.publisher,
        published: video.published,
        publisher: video.uploader,
        // `??` keeps a real count of 0; only a missing count becomes undefined.
        viewCount: video.statistics.viewCount ?? undefined,
      }),
    );
  }

  // Related Searches
  const related = parseDuckbarPayload<DuckbarRelatedSearch>(
    RELATED_SEARCHES_REGEX,
    data,
  );
  if (related) {
    results.related = related.results.map(
      (suggestion): RelatedResult => ({
        text: suggestion.text,
        raw: suggestion.display_text,
      }),
    );
  }

  return results;
}
/**
 * Serialises a flat key/value record into a URL query string
 * (`application/x-www-form-urlencoded`, without a leading `?`).
 *
 * @param query The parameters to encode, emitted in insertion order.
 * @returns The encoded query string.
 */
function queryString(query: Record<string, string>) {
  const params = new URLSearchParams();
  for (const [key, value] of Object.entries(query)) {
    params.append(key, value);
  }
  return params.toString();
}
/**
 * Fetches the DuckDuckGo landing page for a query and extracts the VQD token
 * from it using `VQD_REGEX`.
 *
 * @param query The search query the token is requested for.
 * @param ia Value of the `ia` query parameter; defaults to "web".
 * @returns The VQD token.
 * @throws Error when the request fails or no token is found in the page.
 */
async function getVQD(query: string, ia = "web") {
  const failure = () =>
    new Error(`Failed to get the VQD for query "${query}".`);

  let page: string;
  try {
    const url = `https://duckduckgo.com/?${queryString({ q: query, ia })}`;
    const reply = await fetch(url);
    page = await reply.text();
  } catch {
    throw failure();
  }

  const found = VQD_REGEX.exec(page);
  if (found === null) throw failure();
  return found[1];
}
/**
 * Validates caller-supplied search options and fills in missing ones from
 * `defaultOptions`. The input object is not modified.
 *
 * @param options The options supplied by the caller.
 * @returns A new options object with defaults applied.
 * @throws TypeError for an invalid safe search type, a non-numeric (or NaN)
 * offset, an invalid time, or a missing/non-string locale, region or market
 * region.
 * @throws RangeError when the offset is negative.
 * @throws Error when the VQD is malformed.
 */
function sanityCheck(options: SearchOptions) {
  // Treat explicitly-undefined fields as "not provided" so they cannot
  // overwrite a default with `undefined`.
  const provided: Record<string, unknown> = {};
  for (const [key, value] of Object.entries(options)) {
    if (value !== undefined) provided[key] = value;
  }
  options = Object.assign({}, defaultOptions, provided);

  // A numeric enum object maps both names and values, so `in` accepts either.
  if (!(options.safeSearch! in SafeSearchType))
    throw new TypeError(
      `${options.safeSearch} is an invalid safe search type!`,
    );
  /* istanbul ignore next */
  if (typeof options.safeSearch! === "string")
    options.safeSearch = SafeSearchType[
      options.safeSearch!
    ] as any as SafeSearchType;

  // `typeof NaN` is "number", so NaN has to be rejected explicitly.
  if (typeof options.offset !== "number" || Number.isNaN(options.offset))
    throw new TypeError(`Search offset is not a number!`);
  if (options.offset! < 0)
    throw new RangeError("Search offset cannot be below zero!");

  // Either a SearchTimeType value or an exact "YYYY-MM-DD..YYYY-MM-DD" range
  // (literal dots, whole string).
  if (
    options.time &&
    !Object.values(SearchTimeType).includes(options.time as SearchTimeType) &&
    !/^\d{4}-\d{2}-\d{2}\.\.\d{4}-\d{2}-\d{2}$/.test(options.time as string)
  )
    throw new TypeError(`${options.time} is an invalid search time!`);

  if (!options.locale || typeof options.locale! !== "string")
    throw new TypeError("Search locale must be a string!");
  if (!options.region || typeof options.region! !== "string")
    throw new TypeError("Search region must be a string!");
  if (!options.marketRegion || typeof options.marketRegion! !== "string")
    throw new TypeError("Search market region must be a string!");

  // Same token shape that VQD_REGEX extracts: two or three digit groups.
  if (options.vqd && !/^\d+-\d+(?:-\d+)?$/.test(options.vqd))
    throw new Error(`${options.vqd} is an invalid VQD!`);
  return options;
}
/**
 * LangChain tool that answers a query with the text of the top DuckDuckGo
 * web-result descriptions.
 */
export class DuckDuckGo extends Tool {
  name = "duckduckgo_search";

  /** Maximum number of web results whose descriptions are returned. */
  maxResults = 4;

  /**
   * Searches DuckDuckGo (safe search off) and returns the plain-text
   * descriptions of the first `maxResults` results, separated by blank lines.
   *
   * @ignore
   */
  async _call(input: string) {
    const outcome = await search(input, {
      safeSearch: SafeSearchType.OFF,
    });
    if (outcome.noResults) {
      return "No good search result found";
    }

    const snippets: string[] = [];
    for (const hit of outcome.results.slice(0, this.maxResults)) {
      // Descriptions may still contain markup (e.g. bold tags); strip it.
      snippets.push(htmlToText(hit.description));
    }
    return snippets.join("\n\n");
  }

  description =
    "a search engine. useful for when you need to answer questions about current events. input should be a search query.";
}