|
|
|
import concurrent.futures
import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
from googlesearch import search
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def search_duckduckgo(query):
    """Return DuckDuckGo result URLs for PDF manuals matching *query*.

    The search text sent is ``"<query> manual filetype:pdf"`` with
    ``max_results=5``.  The lookup is best-effort: any failure while
    searching or reading the hits yields an empty list instead of raising.

    Args:
        query: Product or model name to look up.

    Returns:
        A list with the ``href`` value of each hit; empty when nothing was
        found or the search failed.
    """
    print("Fetching Duckduckgo Links -----")

    try:
        results = DDGS().text(f"{query} manual filetype:pdf", max_results=5)
        # Tolerate an empty/None result and skip hits that carry no ``href``
        # instead of throwing away every other hit because of one bad entry.
        return [res['href'] for res in (results or []) if res.get('href')]
    except Exception:
        # Deliberately broad: this source is optional for callers.  Unlike a
        # bare ``except:``, KeyboardInterrupt and SystemExit still propagate.
        return []
|
|
|
|
|
|
def search_google(query):
    """Collect PDF-manual links for *query* from two Google-backed sources.

    1. The Google Custom Search JSON API, queried with
       ``"<query> manual filetype:pdf"``; every item's ``link`` is kept.
    2. The ``googlesearch`` helper, queried with ``"<query> manual ext:pdf"``
       (``num_results=5``); only results ending in ``.pdf`` are kept.

    Both sources are best-effort and independent: a failure in one is
    reported on stdout and does not prevent the other from running.

    Credentials are read from the ``GOOGLE_API_KEY`` and ``GOOGLE_CSE_ID``
    environment variables when set.

    Args:
        query: Product or model name to look up.

    Returns:
        A list of result URLs (API results first); possibly empty.
    """
    print("Fetching Google Links -----")

    links = []

    try:
        # NOTE(security): the literal fallbacks below are the values that were
        # previously hard-coded here.  They are kept only so existing
        # deployments keep working; they live in source control and should be
        # rotated and removed once the environment variables are configured.
        api_key = os.environ.get(
            'GOOGLE_API_KEY', 'AIzaSyDV_uJwrgNtawqtl6GDfeUj6NqO-H1tA4c'
        )
        search_engine_id = os.environ.get('GOOGLE_CSE_ID', 'c4ca951b9fc6949cb')

        response = requests.get(
            "https://www.googleapis.com/customsearch/v1",
            params={
                "key": api_key,
                "cx": search_engine_id,
                "q": query + " manual filetype:pdf",
            },
            # Without a timeout a stalled connection would block forever.
            timeout=10,
        )
        results = response.json()

        # An error payload has no ``items``; an item without ``link`` is
        # skipped rather than aborting the whole loop.
        links.extend(
            item['link'] for item in results.get('items', []) if 'link' in item
        )
    except Exception as exc:
        # Only the exception type is printed: request errors embed the full
        # URL, which contains the API key as a query parameter.
        print(f"Google Custom Search lookup failed: {type(exc).__name__}")

    try:
        for result in search(query + " manual ext:pdf", num_results=5):
            if result.endswith('.pdf'):
                links.append(result)
    except Exception as exc:
        print(f"googlesearch lookup failed: {type(exc).__name__}")

    return links
|
|
|
|
|
|
def search_archive(query):
    """Return PDF links found in archive.org items matching ``"<query> manual"``.

    The advanced-search endpoint is asked for the first page (up to 50 rows)
    of matching items.  For every hit that has an ``identifier`` the listing
    at ``https://archive.org/download/<identifier>`` is fetched and scanned
    for anchors ending in ``.pdf``; anchors ending in ``.iso`` are followed
    one level (the same URL with a trailing slash) and scanned the same way.
    Listings are fetched concurrently in a thread pool.

    Args:
        query: Product or model name to look up.

    Returns:
        A list of absolute PDF URLs, grouped per item in search-result order.
        Empty when the search itself fails; a failure on a single item is
        printed and only that item is skipped.
    """
    print("Fetching Archieve Links -----")

    try:
        response = requests.get(
            "https://archive.org/advancedsearch.php",
            params={
                'q': f'{query} manual',
                'fl[]': ['identifier', 'title', 'format'],
                'rows': 50,
                'page': 1,
                'output': 'json',
            },
            timeout=30,
        )
        docs = response.json()['response']['docs']

        # Hits without an identifier are skipped; substituting a placeholder
        # would only produce a request for a non-existent item.
        download_urls = [
            f"https://archive.org/download/{doc['identifier']}"
            for doc in docs
            if doc.get('identifier')
        ]

        pdf_files = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(_archive_item_pdf_links, download_url)
                for download_url in download_urls
            ]

            # Collect in submission order so the output does not depend on
            # which worker thread happens to finish first.
            for future in futures:
                try:
                    pdf_files.extend(future.result())
                except Exception as exc:
                    print(f'Generated an exception: {exc}')

        return pdf_files

    except Exception:
        # Best-effort source: a failed search or malformed payload yields no
        # links.  KeyboardInterrupt/SystemExit are not swallowed.
        return []


def _archive_item_pdf_links(download_url):
    """Return the PDF URLs linked from one archive.org download listing."""
    response = requests.get(download_url, timeout=30)
    if response.status_code != 200:
        return []

    # Trailing slash so relative hrefs resolve *inside* the item; urljoin
    # also copes with absolute and root-relative hrefs, which plain string
    # concatenation would turn into broken URLs.
    base = download_url + '/'
    found = []
    soup = BeautifulSoup(response.text, 'html.parser')
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if href.endswith('.pdf'):
            found.append(urljoin(base, href))
        if href.endswith('.iso'):
            try:
                found.extend(_archive_iso_pdf_links(urljoin(base, href) + '/'))
            except Exception as exc:
                # One unreadable ISO must not discard the item's other PDFs.
                print(f'Generated an exception: {exc}')
    return found


def _archive_iso_pdf_links(iso_url):
    """Return the PDF URLs linked from the page served at *iso_url*."""
    response = requests.get(iso_url, timeout=30)
    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    # NOTE(review): the previous code prefixed 'https:' to each href, so these
    # are presumably protocol-relative ('//host/...'); urljoin gives the same
    # URL for that form and still works for any other form.
    return [
        urljoin(iso_url, anchor['href'])
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].endswith('.pdf')
    ]
|
|
|
|
def search_github(query):
    """Return GitHub code-search result URLs for Markdown files matching *query*.

    Calls ``https://api.github.com/search/code`` with the search expression
    ``"<query> extension:md"`` and returns each item's ``html_url``.  The
    lookup is best-effort: any failure yields an empty list.

    The access token is read from the ``GITHUB_TOKEN`` environment variable
    when set.

    Args:
        query: Free-text search terms.

    Returns:
        A list of ``html_url`` strings; empty when nothing matched or the
        request failed.
    """
    print("Fetching Github Links -----")

    try:
        # NOTE(security): the literal fallback is the token that was
        # previously hard-coded here.  It is kept only so existing deployments
        # keep working; it lives in source control and should be revoked and
        # removed once GITHUB_TOKEN is configured.
        token = os.environ.get(
            'GITHUB_TOKEN', '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'
        )

        response = requests.get(
            "https://api.github.com/search/code",
            # Passing the expression via ``params`` lets requests URL-encode
            # it; splicing the raw query into the URL broke on characters
            # such as '&' or '#'.
            params={'q': f'{query} extension:md'},
            headers={'Authorization': f'Token {token}'},
            timeout=10,
        )
        data = response.json()

        # Error payloads carry no ``items`` key.
        return [item['html_url'] for item in data.get('items', [])]

    except Exception:
        # Best-effort source; KeyboardInterrupt/SystemExit still propagate.
        return []
|
|
|
|
def search_wikipedia(product):
    """Return English-Wikipedia article URLs suggested for *product*.

    Uses the MediaWiki ``opensearch`` action (main namespace, ``limit=5``)
    and returns the element at index 3 of the JSON response.

    Args:
        product: Product name to search for.

    Returns:
        The list found at index 3 of the response; an empty list when that
        list is empty or missing, the response is not a JSON list, or the
        request fails.
    """
    # The previous message said "Duckduckgo" (copy-paste slip).
    print("Fetching Wikipedia Links -----")

    api_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "opensearch",
        "search": product,
        "limit": 5,
        "namespace": 0,
        "format": "json",
    }

    try:
        response = requests.get(api_url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a non-JSON body on requests versions whose JSON
        # decode error is not a RequestException subclass.
        print(f"An error occurred: {e}")
        return []

    # Per the MediaWiki API docs an opensearch reply is
    # [term, titles, descriptions, urls].  The type check keeps a dict-shaped
    # error payload from raising KeyError on ``data[3]``.
    if isinstance(data, list) and len(data) > 3 and data[3]:
        return data[3]
    return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_similar_products(query):
    """Ask the DuckDuckGo chat backend for products similar to *query*.

    The prompt sent is ``"<query> Similar Products"``.  The reply is scanned
    for numbered-list lines (``"1. Name: description"``); for each such line
    the text after the number and before the first ``": "`` is returned.

    Like the other lookup helpers in this module this is best-effort: a
    failed chat request is printed and reported as an empty list.

    Args:
        query: Product or model name.

    Returns:
        A list of names in the order they appear in the reply; empty when
        the reply has no numbered lines, is not text, or the request failed.
    """
    print(f"\nFetching similar items of -----> {query}")

    try:
        results = DDGS().chat(f'{query} Similar Products')
    except Exception as exc:
        print(f"An error occurred: {exc}")
        return []

    # re.findall needs text; anything else (e.g. None) has nothing to parse.
    if not isinstance(results, str):
        return []

    pattern = r'^\d+\.\s(.+)$'
    matches = re.findall(pattern, results, re.MULTILINE)

    # Keep only the part before the first ": "; strip() drops a trailing
    # '\r' or spaces that '.+' would otherwise capture.
    return [item.split(': ')[0].strip() for item in matches]
|
|
|
|
|