Hamza1702 committed on
Commit
de6262a
1 Parent(s): 8cb066e

Create utils.py

Files changed (1)
  1. utils.py +433 -0
utils.py ADDED
"""
General utility functions for loading, saving, and manipulating data.
"""

import logging
import os
import pprint as pp
import re
import shutil  # zipfile formats
import warnings
from datetime import datetime
from os.path import basename, getsize, join
from pathlib import Path

import pandas as pd
import requests
from natsort import natsorted
from symspellpy import SymSpell
from tqdm.auto import tqdm

warnings.filterwarnings(
    action="ignore", message=".*the GPL-licensed package `unidecode` is not installed*"
)  # the cleantext GPL-licensed package reminder is noisy


class DisableLogger:
    """Context manager that silences all logging while active."""

    def __enter__(self):
        logging.disable(logging.CRITICAL)

    def __exit__(self, exit_type, exit_value, exit_traceback):
        logging.disable(logging.NOTSET)


with DisableLogger():
    from cleantext import clean  # suppress cleantext's import-time log messages


def clear_loggers():
    """clear_loggers - remove all handlers attached to the root logger"""
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)


def get_timestamp():
    """get_timestamp - return the current date and hour as a filename-safe string"""
    return datetime.now().strftime("%b-%d-%Y_t-%H")


def print_spacer(n=1):
    """print_spacer - print a spacer line"""
    print("\n -------- " * n)


def remove_trailing_punctuation(text: str):
    """
    remove_trailing_punctuation - remove trailing punctuation from a string

    Args:
        text (str): string to be cleaned

    Returns:
        str: cleaned string
    """
    return text.strip("?!.,;:")


def correct_phrase_load(my_string: str):
    """
    correct_phrase_load - basic (unoptimized) SymSpell spelling correction for a string

    Args:
        my_string (str): text to be corrected

    Returns:
        str: the corrected string
    """
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

    dictionary_path = (
        r"symspell_rsc/frequency_dictionary_en_82_765.txt"  # from repo root
    )
    bigram_path = (
        r"symspell_rsc/frequency_bigramdictionary_en_243_342.txt"  # from repo root
    )
    # term_index is the column of the term and count_index is the
    # column of the term frequency
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
    sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

    # max edit distance per lookup (per single word, not per whole input string)
    suggestions = sym_spell.lookup_compound(
        clean(my_string), max_edit_distance=2, ignore_non_words=True
    )
    if len(suggestions) < 1:
        return my_string
    else:
        first_result = suggestions[0]
        return first_result.term  # .term is SuggestItem's public attribute for the text

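# Usage sketch (hypothetical input; assumes the symspell_rsc dictionary files
# above exist relative to the working directory):
#   fixed = correct_phrase_load("ths sentnce has typoos")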

def fast_scandir(dirname: str):
    """
    fast_scandir - recursively return all subfolders of a given filepath via os.scandir

    Returns:
        list: paths of all subfolders
    """
    subfolders = [f.path for f in os.scandir(dirname) if f.is_dir()]
    for subdir in list(subfolders):
        subfolders.extend(fast_scandir(subdir))
    return subfolders


def create_folder(directory: str):
    """create_folder - create a directory (and parents) if it does not already exist"""
    os.makedirs(directory, exist_ok=True)


def chunks(lst: list, n: int):
    """
    chunks - yield successive n-sized chunks from lst

    Args:
        lst (list): list to be chunked
        n (int): size of chunks
    """
    for i in range(0, len(lst), n):
        yield lst[i : i + n]

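# Usage sketch:
#   list(chunks(list(range(7)), 3))  # -> [[0, 1, 2], [3, 4, 5], [6]]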

def shorten_list(
    list_of_strings: list, max_chars: int = 512, no_blanks=True, verbose=False
):
    """a helper function that iterates through a list backwards, adding entries to a new list.

    When <max_chars> is reached, the entry that would exceed it is not added.
    Args:
        list_of_strings (list): list of strings to be shortened
        max_chars (int, optional): maximum number of characters in the list in total. Defaults to 512.
        no_blanks (bool, optional): if True, blank strings are not added to the new list. Defaults to True.
        verbose (bool, optional): if True, print the total length after shortening. Defaults to False.
    """
    list_of_strings = [
        str(x) for x in list_of_strings
    ]  # convert to strings if not already
    shortened_list = []
    total_len = 0
    for i, string in enumerate(list_of_strings[::-1], start=1):
        if len(string.strip()) == 0 and no_blanks:
            continue
        if len(string) + total_len >= max_chars:
            logging.info(f"string # {i} puts total over limit, breaking")
            break
        total_len += len(string)
        shortened_list.insert(0, string)
    if len(shortened_list) == 0:
        logging.info(f"shortened list with max_chars={max_chars} has no entries")
    if verbose:
        print(f"total length of list is {total_len} chars")
    return shortened_list

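# Usage sketch - keeps the most recent (last) entries that fit the budget:
#   shorten_list(["a long preamble", "recent reply", "newest reply"], max_chars=30)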

def chunky_pandas(my_df, num_chunks: int = 4):
    """
    chunky_pandas - split a dataframe into `num_chunks` roughly equal chunks, returned in a list

    Args:
        my_df (pd.DataFrame): dataframe to split
        num_chunks (int, optional): Defaults to 4.

    Returns:
        list: a list of dataframes
    """
    n = max(1, len(my_df) // num_chunks)  # guard against a zero-length slice step
    list_df = [my_df[i : i + n] for i in range(0, my_df.shape[0], n)]

    return list_df

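# Usage sketch:
#   parts = chunky_pandas(pd.DataFrame({"a": range(10)}), num_chunks=2)
#   [len(p) for p in parts]  # -> [5, 5]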

def load_dir_files(
    directory: str, req_extension=".txt", return_type="list", verbose=False
):
    """
    load_dir_files - an os.walk-based method of returning all files with extension `req_extension` in a given directory and its subdirectories

    Args:
        directory (str): root directory to search in
        req_extension (str, optional): file extension to match. Defaults to ".txt".
        return_type (str, optional): "list" for a list of paths, anything else for a dict. Defaults to "list".
        verbose (bool, optional): if True, print the files found. Defaults to False.

    Returns:
        list or dict: an iterable of filepaths, or a dict mapping filenames to filepaths
    """
    appr_files = []
    # r=root, d=directories, f=files
    for r, d, f in os.walk(directory):
        for prefile in f:
            if prefile.endswith(req_extension):
                fullpath = os.path.join(r, prefile)
                appr_files.append(fullpath)

    appr_files = natsorted(appr_files)

    if verbose:
        print("Files found in the {} directory:\n".format(directory))
        if len(appr_files) < 10:
            pp.pprint(appr_files)
        else:
            pp.pprint(appr_files[:10])
            print("\n and more. There are a total of {} files".format(len(appr_files)))

    if return_type.lower() == "list":
        return appr_files
    else:
        if verbose:
            print("returning dictionary")

        appr_file_dict = {}
        for this_file in appr_files:
            appr_file_dict[basename(this_file)] = this_file

        return appr_file_dict

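# Usage sketch (hypothetical directory):
#   txt_files = load_dir_files("data/", req_extension=".txt", verbose=True)
#   by_name = load_dir_files("data/", req_extension=".txt", return_type="dict")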

def URL_string_filter(text):
    """
    URL_string_filter - keep only alphanumerics, periods, and underscores
    """
    custom_printable = (
        "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ._"
    )

    filtered = "".join(filter(lambda i: i in custom_printable, text))

    return filtered


def getFilename_fromCd(cd):
    """getFilename_fromCd - extract a filename from a Content-Disposition header string"""
    if not cd:
        return None
    fname = re.findall("filename=(.+)", cd)
    if len(fname) > 0:
        output = fname[0]
    elif "/" in cd:  # cd.find() returns -1 (truthy) when absent, so test membership instead
        possible_fname = cd.rsplit("/", 1)[1]
        output = URL_string_filter(possible_fname)
    else:
        output = None
    return output

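# Usage sketch:
#   getFilename_fromCd("attachment; filename=report.zip")  # -> "report.zip"
#   getFilename_fromCd(None)  # -> None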

def get_zip_URL(
    URLtoget: str,
    extract_loc: str = None,
    file_header: str = "dropboxexport_",
    verbose: bool = False,
):
    """get_zip_URL - download a zip file from a given URL and extract it to a given location"""

    r = requests.get(URLtoget, allow_redirects=True)
    names = getFilename_fromCd(r.headers.get("content-disposition"))
    if names is None:
        names = basename(URLtoget)  # no Content-Disposition header; fall back to the URL
    fixed_fnames = names.split(";")  # split the multiple results
    this_filename = file_header + URL_string_filter(fixed_fnames[0])

    # define paths and save the zip file
    if extract_loc is None:
        extract_loc = "dropbox_dl"
    dl_place = join(os.getcwd(), extract_loc)
    create_folder(dl_place)
    save_loc = join(os.getcwd(), this_filename)
    with open(save_loc, "wb") as f:
        f.write(r.content)
    if verbose:
        print("downloaded file size was {} MB".format(getsize(save_loc) / 1000000))

    # unpack the archive
    shutil.unpack_archive(save_loc, extract_dir=dl_place)
    if verbose:
        print("extracted zip file - ", datetime.now())
    load_dir_files(dl_place, req_extension="", verbose=verbose)  # lists files if verbose

    # remove the original archive
    try:
        os.remove(save_loc)
        del save_loc
    except Exception:
        print("unable to delete original zipfile - check if exists", datetime.now())

    print("finished extracting zip - ", datetime.now())

    return dl_place

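# Usage sketch (hypothetical Dropbox export URL):
#   out_dir = get_zip_URL("https://www.dropbox.com/sh/abc123/files.zip?dl=1", verbose=True)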

def merge_dataframes(data_dir: str, ext=".xlsx", verbose=False):
    """
    merge_dataframes - given a filepath, loads and attempts to merge all files as dataframes

    Args:
        data_dir (str): root directory to search in
        ext (str, optional): anticipated file extension for the dataframes. Defaults to '.xlsx'.

    Returns:
        pd.DataFrame: merged dataframe of all files
    """

    src = Path(data_dir)
    src_str = str(src.resolve())
    mrg_df = pd.DataFrame()

    all_reports = load_dir_files(directory=src_str, req_extension=ext, verbose=verbose)

    failed = []

    for df_path in tqdm(all_reports, total=len(all_reports), desc="joining data..."):
        try:
            this_df = pd.read_excel(df_path).convert_dtypes()
            mrg_df = pd.concat([mrg_df, this_df], axis=0)
        except Exception:
            short_p = os.path.basename(df_path)
            print(
                f"WARNING - file with extension {ext} and name {short_p} could not be read."
            )
            failed.append(short_p)

    if len(failed) > 0:
        print(f"failed to merge {len(failed)} files, investigate as needed")

    if verbose:
        mrg_df.info(verbose=True)

    return mrg_df

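# Usage sketch (hypothetical folder of spreadsheets):
#   combined = merge_dataframes("reports/", ext=".xlsx", verbose=True)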

def download_URL(url: str, file=None, dlpath=None, verbose=False):
    """
    download_URL - download a file from a URL and show a progress bar

    Parameters
    ----------
    url : str, URL to download
    file : str, optional, default None, name of file to save to. If None, will use the filename from the URL
    dlpath : str, optional, default None, path to save the file to. If None, will save to the current working directory
    verbose : bool, optional, default False, print the save location when done

    Returns
    -------
    str - path to the downloaded file
    """

    if file is None:
        if "?dl=" in url:
            # is a dropbox link
            prefile = url.split("/")[-1]
            filename = str(prefile).split("?dl=")[0]
        else:
            filename = url.split("/")[-1]

        file = clean(filename)
    if dlpath is None:
        dlpath = Path.cwd()  # save to current working directory
    else:
        dlpath = Path(dlpath)  # make a path object

    r = requests.get(url, stream=True, allow_redirects=True)
    total_size = int(r.headers.get("content-length", 0))  # 0 if the header is absent
    initial_pos = 0
    dl_loc = dlpath / file
    with open(str(dl_loc.resolve()), "wb") as f:
        with tqdm(
            total=total_size,
            unit="B",
            unit_scale=True,
            desc=file,
            initial=initial_pos,
            ascii=True,
        ) as pbar:
            for ch in r.iter_content(chunk_size=1024):
                if ch:
                    f.write(ch)
                    pbar.update(len(ch))

    if verbose:
        print(f"\ndownloaded {file} to {dlpath}\n")

    return str(dl_loc.resolve())

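# Usage sketch (hypothetical URL):
#   local_path = download_URL("https://example.com/data/archive.zip", verbose=True)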

def dl_extract_zip(
    URLtoget: str,
    extract_loc: str = None,
    file_header: str = "TEMP_archive_dl_",
    verbose: bool = False,
):
    """
    dl_extract_zip - generic function to download a zip file and extract it

    Parameters
    ----------
    URLtoget : str, zip file URL to download
    extract_loc : str, optional, default None, path to extract the zip file to. If None, will extract to the current working directory
    file_header : str, optional, default 'TEMP_archive_dl_', prefix for the zip file name
    verbose : bool, optional, default False, print progress details

    Returns
    -------
    str - path to the downloaded and extracted folder
    """

    # fall back to the current working directory, as the docstring promises
    extract_loc = Path(extract_loc) if extract_loc is not None else Path.cwd()
    extract_loc.mkdir(parents=True, exist_ok=True)

    save_loc = download_URL(
        url=URLtoget, file=f"{file_header}.zip", dlpath=None, verbose=verbose
    )

    shutil.unpack_archive(save_loc, extract_dir=extract_loc)

    if verbose:
        print("extracted zip file - ", datetime.now())
        load_dir_files(extract_loc, req_extension="", verbose=verbose)

    # remove the original archive
    try:
        os.remove(save_loc)
        del save_loc
    except Exception as e:
        warnings.warn(message=f"unable to delete original zipfile due to {e}")
    if verbose:
        print("finished extracting zip - ", datetime.now())

    return str(extract_loc.resolve())
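

# Usage sketch (hypothetical URL and target folder):
#   out = dl_extract_zip(
#       "https://example.com/archive.zip", extract_loc="unzipped", verbose=True
#   )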