# Streamlit app: clean timestamped transcript text, build a Markdown outline,
# emit JSONL-style records, and graph the highest-information words.
import streamlit as st
import re
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
from graphviz import Digraph

# Fetch the tokenizer model and stopword list the NLP helpers below rely on.
# quiet=True keeps the download log out of the app output.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
|
def remove_timestamps(text):
    """Strip lines that consist solely of a m:ss / mm:ss timestamp."""
    # Anchoring to line starts avoids deleting times that occur mid-sentence.
    return re.sub(r'^\d{1,2}:\d{2}\s*\n', '', text, flags=re.MULTILINE)
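
# For example, remove_timestamps("1:42\nsome caption text\n") yields
# "some caption text\n", while a time inside prose such as "at 7:25 we saw"
# is left untouched because the pattern only matches at the start of a line.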
|
|
|
def process_text(text):
    """Turn cleaned transcript text into a simple Markdown outline,
    alternating bold header lines and bulleted lines."""
    processed_lines = [line for line in text.split("\n") if line]

    outline = ""
    for i, line in enumerate(processed_lines):
        if i % 2 == 0:
            outline += f"**{line}**\n"
        else:
            outline += f"- {line} π\n"  # bullet line with trailing marker

    return outline
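
# Example: process_text("intro\ndetail one\nnext topic\ndetail two\n") produces
#   **intro**
#   - detail one π
#   **next topic**
#   - detail two π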
|
|
|
def create_jsonl_list(text):
    """Wrap each non-empty line in a {"text": ...} record, one per input line."""
    return [{"text": line} for line in text.split("\n") if line]
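
# A minimal sketch of writing these records out as newline-delimited JSON;
# the "records.jsonl" filename and the some_text variable are only
# illustrative, standing in for any cleaned transcript:
#
#   import json
#   with open("records.jsonl", "w") as f:
#       for record in create_jsonl_list(some_text):
#           f.write(json.dumps(record) + "\n")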
|
|
|
def unit_test(input_text):
    """Exercise the cleaning and JSONL helpers on a sample transcript."""
    st.write("Test Text without Timestamps:")
    test_text_without_timestamps = remove_timestamps(input_text)
    st.write(test_text_without_timestamps)

    st.write("Test JSONL List:")
    test_jsonl_list = create_jsonl_list(test_text_without_timestamps)
    st.write(test_jsonl_list)
|
|
def extract_high_information_words(text, top_n=10):
    """Return the top_n most frequent alphabetic, non-stopword tokens."""
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]

    # Drop common English stopwords, then rank what remains by frequency.
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]

    freq_dist = FreqDist(filtered_words)
    return [word for word, _ in freq_dist.most_common(top_n)]
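
# For instance, extract_high_information_words("the agent maps observations
# to actions and the agent learns", top_n=2) returns something like
# ['agent', 'maps']: 'agent' leads because it appears twice, and ties among
# the remaining words fall back to first-seen order.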
|
|
|
|
|
def create_relationship_graph(words):
    """Chain the words into a linear Graphviz digraph, in rank order."""
    graph = Digraph()

    for index, word in enumerate(words):
        graph.node(str(index), word)
        # Link each word to its predecessor, labelling the edge with its rank.
        if index > 0:
            graph.edge(str(index - 1), str(index), label=str(index))

    return graph
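
# Outside Streamlit the same graph can be inspected or rendered directly;
# the "words" output name below is only an example, and render() requires
# the Graphviz binaries to be installed:
#
#   g = create_relationship_graph(["agent", "reward", "environment"])
#   print(g.source)  # the generated DOT text
#   g.render("words", format="png", cleanup=True)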
|
|
|
|
|
def display_relationship_graph(words):
    """Render the word chain as a Graphviz chart in the Streamlit app."""
    graph = create_relationship_graph(words)
    st.graphviz_chart(graph)
|
|
# --- Streamlit UI ---
text_input = st.text_area("Enter text:", value="", height=300)
text_without_timestamps = remove_timestamps(text_input)

st.markdown("**Text without Timestamps:**")
st.write(text_without_timestamps)

processed_text = process_text(text_without_timestamps)
st.markdown("**Markdown Outline with Emojis:**")
st.markdown(processed_text)
|
|
|
unit_test_text = '''
1:42
program the does very very well on your data then you will achieve the best
1:48
generalization possible with a little bit of modification you can turn it into a precise theorem
1:54
and on a very intuitive level it's easy to see what it should be the case if you
2:01
have some data and you're able to find a shorter program which generates this
2:06
data then you've essentially extracted all the all conceivable regularity from
2:11
this data into your program and then you can use these objects to make the best predictions possible like if if you have
2:19
data which is so complex but there is no way to express it as a shorter program
2:25
then it means that your data is totally random there is no way to extract any regularity from it whatsoever now there
2:32
is little known mathematical theory behind this and the proofs of these statements actually not even that hard
2:38
but the one minor slight disappointment is that it's actually not possible at
2:44
least given today's tools and understanding to find the best short program that explains or generates or
2:52
solves your problem given your data this problem is computationally intractable
'''

unit_test(unit_test_text)
|
|
|
unit_test_text_2 = '''
5
to talk a little bit about reinforcement learning so reinforcement learning is a framework it's a framework of evaluating
6:53
agents in their ability to achieve goals and complicated stochastic environments
6:58
you've got an agent which is plugged into an environment as shown in the figure right here and for any given
7:06
agent you can simply run it many times and compute its average reward now the
7:13
thing that's interesting about the reinforcement learning framework is that there exist interesting useful
7:20
reinforcement learning algorithms the framework existed for a long time it
7:25
became interesting once we realized that good algorithms exist now these are there are perfect algorithms but they
7:31
are good enough todo interesting things and all you want the mathematical
7:37
problem is one where you need to maximize the expected reward now one
7:44
important way in which the reinforcement learning framework is not quite complete is that it assumes that the reward is
7:50
given by the environment you see this picture the agent sends an action while
7:56
the reward sends it an observation in a both the observation and the reward backwards that's what the environment
8:01
communicates back the way in which this is not the case in the real world is that we figure out
8:11
what the reward is from the observation we reward ourselves we are not told
8:16
environment doesn't say hey here's some negative reward it's our interpretation over census that lets us determine what
8:23
the reward is and there is only one real true reward in life and this is
8:28
existence or nonexistence and everything else is a corollary of that so well what
8:35
should our agent be you already know the answer should be a neural network because whenever you want to do
8:41
something dense it's going to be a neural network and you want the agent to map observations to actions so you let
8:47
it be parametrized with a neural net and you apply learning algorithm so I want to explain to you how reinforcement
8:53
learning works this is model free reinforcement learning the reinforcement learning has actually been used in practice everywhere but it's
'''

unit_test(unit_test_text_2)
|
|
|
unit_test_text_3 = '''
ort try something new add
9:17
randomness directions and compare the result to your expectation if the result
9:25
surprises you if you find that the results exceeded your expectation then
9:31
change your parameters to take those actions in the future that's it this is
9:36
the fool idea of reinforcement learning try it out see if you like it and if you do do more of that in the future and
9:44
that's it that's literally it this is the core idea now it turns out it's not
9:49
difficult to formalize mathematically but this is really what's going on if in a neural network
'''

unit_test(unit_test_text_3)
|
|
# Keyword extraction and relationship graph for the second sample transcript.
text_without_timestamps = remove_timestamps(unit_test_text_2)
top_words = extract_high_information_words(text_without_timestamps, 10)
st.markdown("**Top 10 High Information Words:**")
st.write(top_words)
|
|
st.markdown("**Relationship Graph:**")
display_relationship_graph(top_words)