from concurrent.futures import ALL_COMPLETED import json import datetime import requests import math import pandas as pd import urllib.parse import concurrent.futures import logging HISTORICAL_DATA_URL = 'https://www.nseindia.com/api/historical/cm/equity?series=[%22EQ%22]&' BASE_URL = 'https://www.nseindia.com/' CORPORATE_EVENTS_URL = 'https://www.nseindia.com/api/corporate-announcements?' logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s:%(message)s') def get_headers(): return { "Host": "www1.nseindia.com", "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0", "Accept": "*/*", "Accept-Language": "en-US,en;q=0.5", "Accept-Encoding": "gzip, deflate, br", "X-Requested-With": "XMLHttpRequest", "Referer": "https://www1.nseindia.com/products/content/equities/equities/eq_security.htm", "Access-Control-Allow-Origin": "*", "Access-Control-Allow-Methods": "GET,POST,PUT,DELETE,OPTIONS", "Access-Control-Allow-Headers": "Content-Type, Access-Control-Allow-Headers, Authorization, X-Requested-With", 'Content-Type': 'application/start_date-www-form-urlencoded; charset=UTF-8' } def get_adjusted_headers(): return { 'Host': 'www.nseindia.com', 'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:85.0) Gecko/20100101 Firefox/85.0', 'Accept': '*/*', 'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate, br', 'X-Requested-With': 'XMLHttpRequest', 'DNT': '1', 'Connection': 'keep-alive', } def fetch_cookies(): response = requests.get(BASE_URL, timeout=30, headers=get_adjusted_headers()) if response.status_code != requests.codes.ok: logging.error("Fetched url: %s with status code: %s and response from server: %s" % ( BASE_URL, response.status_code, response.content)) raise ValueError("Please try again in a minute.") return response.cookies.get_dict() def fetch_url(url, cookies): """ This is the function call made by each thread. A get request is made for given start and end date, response is parsed and dataframe is returned """ response = requests.get(url, timeout=30, headers=get_adjusted_headers(), cookies=cookies) if response.status_code == requests.codes.ok: json_response = json.loads(response.content) return pd.DataFrame.from_dict(json_response['data']) else: logging.error("Fetched url: %s with status code: %s and response from server: %s" % ( BASE_URL, response.status_code, response.content)) raise ValueError("Please try again in a minute.") def scrape_data(start_date, end_date, input_type, name): """ Called by stocks and indices to scrape data. Create threads for different requests, parses data, combines them and returns dataframe Args: start_date (datetime.datetime): start date end_date (datetime.datetime): end date input_type (str): Either 'stock' or 'index' name (str, optional): stock symbol or index name. Defaults to None. Returns: Pandas DataFrame: df containing data for stocksymbol for provided date range """ stage, total_stages = 0, math.ceil((end_date - start_date).days / 50) threads, url_list = [], [] cookies = fetch_cookies() for stage in range(total_stages): fetch_end_date = end_date - stage * datetime.timedelta(days=50) fetch_start_date = fetch_end_date - datetime.timedelta(days=50) if input_type == 'stock': params = {'symbol': name, 'from': fetch_start_date.strftime("%d-%m-%Y"), 'to': fetch_end_date.strftime("%d-%m-%Y")} url = HISTORICAL_DATA_URL + urllib.parse.urlencode(params) url_list.append(url) result = pd.DataFrame() with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: future_to_url = {executor.submit(fetch_url, url, cookies): url for url in url_list} concurrent.futures.wait(future_to_url, return_when=ALL_COMPLETED) for future in concurrent.futures.as_completed(future_to_url): url = future_to_url[future] try: df = future.result() result = pd.concat([result, df]) except Exception as exc: logging.error('%r generated an exception: %s. Please try again later.' % (url, exc)) raise exc return format_dataframe_result(result) def format_dataframe_result(result): columns_required = ["TIMESTAMP", "CH_SYMBOL", "CH_SERIES", "CH_TRADE_HIGH_PRICE", "CH_TRADE_LOW_PRICE", "CH_OPENING_PRICE", "CH_CLOSING_PRICE", "CH_LAST_TRADED_PRICE", "CH_PREVIOUS_CLS_PRICE", "CH_TOT_TRADED_QTY", "CH_TOT_TRADED_VAL", "CH_52WEEK_HIGH_PRICE", "CH_52WEEK_LOW_PRICE"] result = result[columns_required] result = result.set_axis( ['Date', 'Symbol', 'Series', 'High Price', 'Low Price', 'Open Price', 'Close Price', 'Last Price', 'Prev Close Price', 'Total Traded Quantity', 'Total Traded Value', '52 Week High Price', '52 Week Low Price'], axis=1) result.set_index('Date', inplace=True) result.sort_index(inplace=True) return result