Spaces:

nickmuchi
/

Earnings-Call-Analysis-Whisperer

Running

App Files Files Community

Earnings-Call-Analysis-Whisperer / app.py

nickmuchi

Update app.py

a1dfd05 about 2 years ago

raw

history blame

4.48 kB

	import os
	import re
	from urllib.parse import urlparse

	import whisper
	from pytube import YouTube
	import pandas as pd
	import plotly_express as px
	import nltk
	import plotly.graph_objects as go
	from optimum.onnxruntime import ORTModelForSequenceClassification
	from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
	from sentence_transformers import SentenceTransformer, CrossEncoder, util
	import streamlit as st

	# Fetch NLTK's 'punkt' resource at start-up (sentence tokenizer data).
	nltk.download('punkt')

	from nltk import sent_tokenize


	# Page-level Streamlit configuration (browser tab title and icon).
	# Per the Streamlit docs, set_page_config must be the first Streamlit
	# command executed on a page, so keep it ahead of every other st.* call.
	st.set_page_config(
	page_title="Home",
	page_icon="📞",
	)

	st.sidebar.header("Home")

	# Read from the environment; None when the variable is unset.
	# NOTE(review): not referenced anywhere else in this file.
	auth_token = os.environ.get("auth_token")

	# Sidebar progress bar starting at 0; cleared by progress_bar.empty()
	# at the bottom of this script.
	progress_bar = st.sidebar.progress(0)

	@st.experimental_singleton()
	def load_models():
	    '''Instantiate every model the app relies on and return them together.

	    Decorated with ``st.experimental_singleton`` so that Streamlit can hand
	    back the same loaded objects instead of rebuilding them on each call.

	    Returns:
	        A 4-tuple ``(asr_model, q_model, q_tokenizer, cross_encoder)``:
	        the Whisper "small" model, the ONNX sequence-classification model
	        and its tokenizer (both loaded from the same repository id), and
	        the cross-encoder.
	    '''
	    # The classifier and its tokenizer come from one repository.
	    finbert_repo = "nickmuchi/quantized-optimum-finbert-tone"

	    speech_to_text = whisper.load_model("small")
	    tone_classifier = ORTModelForSequenceClassification.from_pretrained(finbert_repo)
	    tone_tokenizer = AutoTokenizer.from_pretrained(finbert_repo)
	    reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')

	    return speech_to_text, tone_classifier, tone_tokenizer, reranker

	# Bind the loaded models to module-level names; the functions below
	# (inference, sentiment_pipe) read asr_model, q_model and q_tokenizer
	# as globals.
	asr_model, q_model, q_tokenizer, cross_encoder = load_models()

	def _is_http_url(link):
	    '''Return True when ``link`` is a string that parses as an absolute http(s) URL.'''
	    if not isinstance(link, str):
	        return False
	    try:
	        parts = urlparse(link)
	    except ValueError:
	        # urlparse rejects some malformed inputs (e.g. a broken IPv6 host).
	        return False
	    return parts.scheme in ("http", "https") and bool(parts.netloc)


	@st.experimental_memo(suppress_st_warning=True)
	def inference(link, upload):
	    '''Convert Youtube video or Audio upload to text

	    Args:
	        link: Candidate YouTube URL. Used only when it parses as an
	            absolute http(s) URL; anything else is ignored.
	        upload: Audio input passed straight to ``asr_model.transcribe``
	            when ``link`` is not a usable URL.

	    Returns:
	        ``(results, title)`` where ``results`` is whatever
	        ``asr_model.transcribe`` returns and ``title`` is the video title
	        (for a link) or ``"Transcribed Earnings Audio"`` (for an upload).
	        ``None`` when neither a valid link nor an upload is supplied.
	    '''
	    # The URL check uses the standard library: the previous
	    # ``validators.url`` call referenced a module this file never imports.
	    if _is_http_url(link):
	        yt = YouTube(link)
	        title = yt.title
	        # Download the first audio-only stream to a fixed local file name.
	        path = yt.streams.filter(only_audio=True)[0].download(filename="audio.mp4")
	        results = asr_model.transcribe(path)
	        return results, title

	    if upload:
	        results = asr_model.transcribe(upload)
	        return results, "Transcribed Earnings Audio"

	    # Nothing to transcribe; made explicit rather than falling off the end.
	    return None

	@st.experimental_memo(suppress_st_warning=True)
	def sentiment_pipe(earnings_text):
	    '''Classify the sentiment of every sentence in ``earnings_text``.

	    Builds a "text-classification" pipeline from the module-level
	    ``q_model`` / ``q_tokenizer``, splits the text into sentences with
	    ``sent_tokenize`` and returns the pipeline's output for that list.
	    '''
	    classifier = pipeline("text-classification", model=q_model, tokenizer=q_tokenizer)
	    sentences = sent_tokenize(earnings_text)
	    return classifier(sentences)


	def preprocess_plain_text(text,window_size=3):
	    '''Preprocess text for semantic search

	    Cleans ``text`` (drops non-ASCII characters, URLs, @mentions and
	    #hashtags, collapses whitespace), splits it into sentences and groups
	    consecutive sentences into passages.

	    The whole input is treated as one paragraph: all line breaks are
	    flattened to single spaces before sentence splitting.

	    Args:
	        text: The raw text to clean and split.
	        window_size: Maximum number of sentences per passage. Smaller
	            values may lose context from neighbouring sentences; larger
	            values keep more context but produce longer passages.

	    Returns:
	        A list of passages, each made of up to ``window_size`` sentences
	        joined by a single space. Empty when no text is left after cleaning.

	    Raises:
	        ValueError: If ``window_size`` is smaller than 1.
	    '''
	    # A zero step fails inside range() and a negative one silently yields
	    # no passages, so reject both up front.
	    if window_size < 1:
	        raise ValueError("window_size must be a positive integer")

	    text = text.encode("ascii", "ignore").decode()  # drop non-ASCII characters
	    text = re.sub(r"https*\S+", " ", text)  # urls
	    text = re.sub(r"@\S+", " ", text)  # mentions
	    text = re.sub(r"#\S+", " ", text)  # hashtags
	    text = re.sub(r"\s{2,}", " ", text)  # runs of whitespace

	    # Break into lines, then into space-separated chunks, trimming each
	    # and dropping the blanks, and rejoin everything with single spaces.
	    words = [
	        phrase.strip()
	        for line in text.splitlines()
	        for phrase in line.strip().split(" ")
	    ]
	    flattened = " ".join(word for word in words if word)

	    sentences = sent_tokenize(flattened) if flattened else []

	    # Combine up to window_size consecutive sentences into one passage.
	    passages = [
	        " ".join(sentences[start:start + window_size])
	        for start in range(0, len(sentences), window_size)
	    ]

	    print(f"Sentences: {len(sentences)}")
	    print(f"Passages: {len(passages)}")

	    return passages

	def display_df_as_table(model,top_k,score='score',passages=None):
	    '''Display the df with text and scores as a table

	    Args:
	        model: Sequence of hit dicts; each must contain ``'corpus_id'``
	            and the key named by ``score``.
	        top_k: Number of leading hits to include.
	        score: Key of the hit dict holding the value for the Score column.
	        passages: Sequence indexed by each hit's ``'corpus_id'`` to obtain
	            the Text column. When omitted, a module-level ``passages``
	            is used, as the original implementation did.

	    Returns:
	        A DataFrame with columns ``Score`` (rounded to 2 decimals) and
	        ``Text``, one row per selected hit, in hit order.

	    Raises:
	        NameError: If ``passages`` is not given and no module-level
	            ``passages`` exists.
	    '''
	    if passages is None:
	        # Backward compatibility: earlier callers relied on a global
	        # ``passages``. NameError is kept as the failure type.
	        try:
	            passages = globals()['passages']
	        except KeyError:
	            raise NameError(
	                "display_df_as_table needs passages: pass `passages=` "
	                "or define a module-level `passages`"
	            ) from None

	    rows = [(hit[score], passages[hit['corpus_id']]) for hit in model[:top_k]]
	    df = pd.DataFrame(rows, columns=['Score','Text'])
	    df['Score'] = df['Score'].round(2)

	    return df

	def make_spans(text,results):
	    '''Pair each sentence of ``text`` with the label predicted for it.

	    Args:
	        text: The text whose sentences were classified.
	        results: Sequence of dicts, each with a ``'label'`` key, in the
	            same order as the sentences ``sent_tokenize(text)`` yields.

	    Returns:
	        A list of ``(sentence, label)`` tuples. ``zip`` stops at the
	        shorter of the two sequences.
	    '''
	    labels = [result['label'] for result in results]
	    # The original called ``sent_tokenizer``, which is not defined in this
	    # file; the imported NLTK function is ``sent_tokenize``.
	    return list(zip(sent_tokenize(text), labels))

	##Fiscal Sentiment by Sentence
	def fin_ext(text):
	    '''Return ``(sentence, label)`` pairs for every sentence in ``text``.

	    Sentence-level classification is delegated to ``sentiment_pipe``,
	    which tokenizes ``text`` with ``sent_tokenize`` and runs the
	    text-classification pipeline; ``make_spans`` then pairs each sentence
	    with its label.
	    '''
	    # The original referenced ``remote_clx`` (a local variable of
	    # sentiment_pipe) and the undefined ``sent_tokenizer``; calling
	    # sentiment_pipe performs the same computation.
	    results = sentiment_pipe(text)
	    return make_spans(text,results)

	# Clear the sidebar progress bar created near the top of the script.
	progress_bar.empty()