# Streamlit app: clean timestamped transcript text, build a Markdown outline,
# emit JSONL-style records, and graph the highest-information words.
import streamlit as st
import re
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
from graphviz import Digraph

# Fetch the tokenizer model and stopword list the NLP helpers below rely on.
# quiet=True keeps the download log out of the app output.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
|
def remove_timestamps(text):
    """Strip lines that consist solely of a m:ss / mm:ss timestamp."""
    # Anchoring to line starts avoids deleting times that occur mid-sentence.
    return re.sub(r'^\d{1,2}:\d{2}\s*\n', '', text, flags=re.MULTILINE)
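
# For example, remove_timestamps("1:42\nsome caption text\n") yields
# "some caption text\n", while a time inside prose such as "at 7:25 we saw"
# is left untouched because the pattern only matches at the start of a line.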
|
|
|
def process_text(text):
    """Turn cleaned transcript text into a simple Markdown outline,
    alternating bold header lines and bulleted lines."""
    processed_lines = [line for line in text.split("\n") if line]

    outline = ""
    for i, line in enumerate(processed_lines):
        if i % 2 == 0:
            outline += f"**{line}**\n"
        else:
            outline += f"- {line} π\n"  # bullet line with trailing marker

    return outline
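
# Example: process_text("intro\ndetail one\nnext topic\ndetail two\n") produces
#   **intro**
#   - detail one π
#   **next topic**
#   - detail two π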
|
|
|
def create_jsonl_list(text):
    """Wrap each non-empty line in a {"text": ...} record, one per input line."""
    return [{"text": line} for line in text.split("\n") if line]
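
# A minimal sketch of writing these records out as newline-delimited JSON;
# the "records.jsonl" filename and the some_text variable are only
# illustrative, standing in for any cleaned transcript:
#
#   import json
#   with open("records.jsonl", "w") as f:
#       for record in create_jsonl_list(some_text):
#           f.write(json.dumps(record) + "\n")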
|
|
|
def unit_test(input_text):
    """Exercise the cleaning and JSONL helpers on a sample transcript."""
    st.write("Test Text without Timestamps:")
    test_text_without_timestamps = remove_timestamps(input_text)
    st.write(test_text_without_timestamps)

    st.write("Test JSONL List:")
    test_jsonl_list = create_jsonl_list(test_text_without_timestamps)
    st.write(test_jsonl_list)
|
|
def extract_high_information_words(text, top_n=10):
    """Return the top_n most frequent alphabetic, non-stopword tokens."""
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]

    # Drop common English stopwords, then rank what remains by frequency.
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]

    freq_dist = FreqDist(filtered_words)
    return [word for word, _ in freq_dist.most_common(top_n)]
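
# For instance, extract_high_information_words("the agent maps observations
# to actions and the agent learns", top_n=2) returns something like
# ['agent', 'maps']: 'agent' leads because it appears twice, and ties among
# the remaining words fall back to first-seen order.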
|
|
|
|
|
def create_relationship_graph(words):
    """Chain the words into a linear Graphviz digraph, in rank order."""
    graph = Digraph()

    for index, word in enumerate(words):
        graph.node(str(index), word)
        # Link each word to its predecessor, labelling the edge with its rank.
        if index > 0:
            graph.edge(str(index - 1), str(index), label=str(index))

    return graph
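
# Outside Streamlit the same graph can be inspected or rendered directly;
# the "words" output name below is only an example, and render() requires
# the Graphviz binaries to be installed:
#
#   g = create_relationship_graph(["agent", "reward", "environment"])
#   print(g.source)  # the generated DOT text
#   g.render("words", format="png", cleanup=True)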
|
|
|
|
|
def display_relationship_graph(words):
    """Render the word chain as a Graphviz chart in the Streamlit app."""
    graph = create_relationship_graph(words)
    st.graphviz_chart(graph)
|
|
# --- Streamlit UI ---
text_input = st.text_area("Enter text:", value="", height=300)
text_without_timestamps = remove_timestamps(text_input)

st.markdown("**Text without Timestamps:**")
st.write(text_without_timestamps)

processed_text = process_text(text_without_timestamps)
st.markdown("**Markdown Outline with Emojis:**")
st.markdown(processed_text)
|
|
|
unit_test_text = '''
1:42
program the does very very well on your data then you will achieve the best
1:48
generalization possible with a little bit of modification you can turn it into a precise theorem
1:54
and on a very intuitive level it's easy to see what it should be the case if you
2:01
have some data and you're able to find a shorter program which generates this
2:06
data then you've essentially extracted all the all conceivable regularity from
2:11
this data into your program and then you can use these objects to make the best predictions possible like if if you have
2:19
data which is so complex but there is no way to express it as a shorter program
2:25
then it means that your data is totally random there is no way to extract any regularity from it whatsoever now there
2:32
is little known mathematical theory behind this and the proofs of these statements actually not even that hard
2:38
but the one minor slight disappointment is that it's actually not possible at
2:44
least given today's tools and understanding to find the best short program that explains or generates or
2:52
solves your problem given your data this problem is computationally intractable
'''

unit_test(unit_test_text)
|
|
|
unit_test_text_2 = '''
5
to talk a little bit about reinforcement learning so reinforcement learning is a framework it's a framework of evaluating
6:53
agents in their ability to achieve goals and complicated stochastic environments
6:58
you've got an agent which is plugged into an environment as shown in the figure right here and for any given
7:06
agent you can simply run it many times and compute its average reward now the
7:13
thing that's interesting about the reinforcement learning framework is that there exist interesting useful
7:20
reinforcement learning algorithms the framework existed for a long time it
7:25
became interesting once we realized that good algorithms exist now these are there are perfect algorithms but they
7:31
are good enough todo interesting things and all you want the mathematical
7:37
problem is one where you need to maximize the expected reward now one
7:44
important way in which the reinforcement learning framework is not quite complete is that it assumes that the reward is
7:50
given by the environment you see this picture the agent sends an action while
7:56
the reward sends it an observation in a both the observation and the reward backwards that's what the environment
8:01
communicates back the way in which this is not the case in the real world is that we figure out
8:11
what the reward is from the observation we reward ourselves we are not told
8:16
environment doesn't say hey here's some negative reward it's our interpretation over census that lets us determine what
8:23
the reward is and there is only one real true reward in life and this is
8:28
existence or nonexistence and everything else is a corollary of that so well what
8:35
should our agent be you already know the answer should be a neural network because whenever you want to do
8:41
something dense it's going to be a neural network and you want the agent to map observations to actions so you let
8:47
it be parametrized with a neural net and you apply learning algorithm so I want to explain to you how reinforcement
8:53
learning works this is model free reinforcement learning the reinforcement learning has actually been used in practice everywhere but it's
'''

unit_test(unit_test_text_2)
|
|
|
unit_test_text_3 = '''
ort try something new add
9:17
randomness directions and compare the result to your expectation if the result
9:25
surprises you if you find that the results exceeded your expectation then
9:31
change your parameters to take those actions in the future that's it this is
9:36
the fool idea of reinforcement learning try it out see if you like it and if you do do more of that in the future and
9:44
that's it that's literally it this is the core idea now it turns out it's not
9:49
difficult to formalize mathematically but this is really what's going on if in a neural network
'''

unit_test(unit_test_text_3)
|
|
# Keyword extraction and relationship graph for the second sample transcript.
text_without_timestamps = remove_timestamps(unit_test_text_2)
top_words = extract_high_information_words(text_without_timestamps, 10)
st.markdown("**Top 10 High Information Words:**")
st.write(top_words)
|
|
st.markdown("**Relationship Graph:**")
display_relationship_graph(top_words)