File size: 6,729 Bytes
f39338a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 |
import streamlit as st
import re
import json
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
from graphviz import Digraph
from collections import Counter
# Fetch the NLTK data this script relies on: tokenizer models for
# nltk.word_tokenize and the English stop-word list.
# 'punkt_tab' is requested alongside the legacy 'punkt' package because newer
# NLTK releases load the tokenizer from 'punkt_tab'; without it word_tokenize
# raises LookupError on a fresh environment.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_nltk_resource)
# A line that consists solely of an m:ss / mm:ss timestamp (optionally
# h:mm:ss), with optional surrounding spaces/tabs, plus its line terminator
# when it has one.
_TIMESTAMP_LINE = re.compile(
    r'^[ \t]*\d{1,2}:\d{2}(?::\d{2})?[ \t]*(?:\r?\n|$)',
    re.MULTILINE,
)


def remove_timestamps(text):
    """Return *text* with stand-alone timestamp lines removed.

    A timestamp line is a line whose only content is ``m:ss``, ``mm:ss`` or
    ``h:mm:ss``. The line and its newline are dropped; all other lines are
    returned unchanged.

    The match is anchored to whole lines so that:
    * a time that merely ends a caption line ("meet at 10:30") is kept and the
      line is not merged with the following one;
    * an ``h:mm:ss`` stamp is removed entirely instead of leaving "h:" behind;
    * a timestamp on the last line is removed even without a trailing newline.

    Args:
        text: Transcript text, typically alternating timestamp and caption
            lines.

    Returns:
        The text without its timestamp lines.
    """
    return _TIMESTAMP_LINE.sub('', text)
def process_text(text):
    """Turn the non-empty lines of *text* into an alternating Markdown outline.

    Empty lines are discarded. Counting the remaining lines from zero,
    even-numbered lines become bold entries (``**line**``) and odd-numbered
    lines become bullet entries (``- line π``). Every entry ends with a
    newline.

    Args:
        text: Newline-separated input text.

    Returns:
        The outline as a single string; empty when there are no non-empty
        lines.
    """
    kept_rows = [row for row in text.split("\n") if row]
    entries = [
        f"**{row}**\n" if position % 2 == 0 else f"- {row} π\n"
        for position, row in enumerate(kept_rows)
    ]
    return "".join(entries)
def create_jsonl_list(text):
    """Wrap each non-empty line of *text* in a ``{"text": line}`` record.

    Args:
        text: Newline-separated input text.

    Returns:
        A list of single-key dicts, one per non-empty line, in input order.
    """
    return [{"text": entry} for entry in text.split("\n") if entry]
def unit_test(input_text):
    """Render a visual check of the cleaning pipeline for *input_text*.

    Writes to the Streamlit page, in order: a label, the text after
    ``remove_timestamps``, a second label, and the record list produced by
    ``create_jsonl_list`` from that cleaned text. Returns nothing.
    """
    st.write("Test Text without Timestamps:")
    cleaned = remove_timestamps(input_text)
    st.write(cleaned)

    st.write("Test JSONL List:")
    records = create_jsonl_list(cleaned)
    st.write(records)
def extract_high_information_words(text, top_n=10):
    """Return the most frequent non-stop-words in *text*.

    The text is tokenised with ``nltk.word_tokenize``; purely alphabetic
    tokens are lower-cased, English stop words are dropped, and the remaining
    words are ranked by frequency.

    Args:
        text: Text to analyse.
        top_n: Maximum number of words to return.

    Returns:
        Up to *top_n* lower-cased words, most frequent first.
    """
    tokens = nltk.word_tokenize(text)
    english_stop_words = set(stopwords.words('english'))

    candidates = []
    for token in tokens:
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered not in english_stop_words:
            candidates.append(lowered)

    ranked = FreqDist(candidates).most_common(top_n)
    return [term for term, _count in ranked]
def create_relationship_graph(words):
    """Build a directed chain graph linking *words* in sequence.

    Each word becomes a node whose id is its position (as a string) and whose
    label is the word. Every node after the first receives an edge from its
    predecessor, labelled with the target node's position.

    Args:
        words: Ordered iterable of node labels.

    Returns:
        The populated ``Digraph``.
    """
    chain = Digraph()
    previous_id = None
    for position, label in enumerate(words):
        node_id = str(position)
        chain.node(node_id, label)
        if previous_id is not None:
            chain.edge(previous_id, node_id, label=node_id)
        previous_id = node_id
    return chain
def display_relationship_graph(words):
    """Build the chain graph for *words* and draw it on the Streamlit page."""
    st.graphviz_chart(create_relationship_graph(words))
# Main page flow: read free text from the user, strip timestamp lines, then
# show both the cleaned text and the Markdown outline built from it.
# Statement order is the on-page render order.
text_input = st.text_area("Enter text:", value="", height=300)
text_without_timestamps = remove_timestamps(text_input)
st.markdown("**Text without Timestamps:**")
st.write(text_without_timestamps)
# Outline is built from the cleaned text, not the raw input.
processed_text = process_text(text_without_timestamps)
st.markdown("**Markdown Outline with Emojis:**")
st.markdown(processed_text)
# Sample transcript 1: m:ss timestamp lines alternating with caption lines.
# The literal starts and ends with a newline.
unit_test_text = '''
1:42
program the does very very well on your data then you will achieve the best
1:48
generalization possible with a little bit of modification you can turn it into a precise theorem
1:54
and on a very intuitive level it's easy to see what it should be the case if you
2:01
have some data and you're able to find a shorter program which generates this
2:06
data then you've essentially extracted all the all conceivable regularity from
2:11
this data into your program and then you can use these objects to make the best predictions possible like if if you have
2:19
data which is so complex but there is no way to express it as a shorter program
2:25
then it means that your data is totally random there is no way to extract any regularity from it whatsoever now there
2:32
is little known mathematical theory behind this and the proofs of these statements actually not even that hard
2:38
but the one minor slight disappointment is that it's actually not possible at
2:44
least given today's tools and understanding to find the best short program that explains or generates or
2:52
solves your problem given your data this problem is computationally intractable
'''
# Render the cleaned text and the per-line record list for this sample.
unit_test(unit_test_text)
# Sample transcript 2: timestamp lines alternating with caption lines.
# NOTE(review): the first line is a bare "5" (presumably a truncated
# timestamp); it contains no colon, so it is not in the m:ss form that
# remove_timestamps strips and will survive as a line of its own.
unit_test_text_2 = '''
5
to talk a little bit about reinforcement learning so reinforcement learning is a framework it's a framework of evaluating
6:53
agents in their ability to achieve goals and complicated stochastic environments
6:58
you've got an agent which is plugged into an environment as shown in the figure right here and for any given
7:06
agent you can simply run it many times and compute its average reward now the
7:13
thing that's interesting about the reinforcement learning framework is that there exist interesting useful
7:20
reinforcement learning algorithms the framework existed for a long time it
7:25
became interesting once we realized that good algorithms exist now these are there are perfect algorithms but they
7:31
are good enough todo interesting things and all you want the mathematical
7:37
problem is one where you need to maximize the expected reward now one
7:44
important way in which the reinforcement learning framework is not quite complete is that it assumes that the reward is
7:50
given by the environment you see this picture the agent sends an action while
7:56
the reward sends it an observation in a both the observation and the reward backwards that's what the environment
8:01
communicates back the way in which this is not the case in the real world is that we figure out
8:11
what the reward is from the observation we reward ourselves we are not told
8:16
environment doesn't say hey here's some negative reward it's our interpretation over census that lets us determine what
8:23
the reward is and there is only one real true reward in life and this is
8:28
existence or nonexistence and everything else is a corollary of that so well what
8:35
should our agent be you already know the answer should be a neural network because whenever you want to do
8:41
something dense it's going to be a neural network and you want the agent to map observations to actions so you let
8:47
it be parametrized with a neural net and you apply learning algorithm so I want to explain to you how reinforcement
8:53
learning works this is model free reinforcement learning the reinforcement learning has actually been used in practice everywhere but it's
'''
# Render the cleaned text and the per-line record list for this sample.
unit_test(unit_test_text_2)
# Sample transcript 3: begins with a caption line (cut mid-word) rather than
# a timestamp, then alternates timestamp and caption lines.
unit_test_text_3 = '''
ort try something new add
9:17
randomness directions and compare the result to your expectation if the result
9:25
surprises you if you find that the results exceeded your expectation then
9:31
change your parameters to take those actions in the future that's it this is
9:36
the fool idea of reinforcement learning try it out see if you like it and if you do do more of that in the future and
9:44
that's it that's literally it this is the core idea now it turns out it's not
9:49
difficult to formalize mathematically but this is really what's going on if in a neural network
'''
# Render the cleaned text and the per-line record list for this sample.
unit_test(unit_test_text_3)
# Keyword demo: rank the ten most frequent non-stop-words of a transcript and
# draw them as a chain graph.
# NOTE(review): this analyses the hard-coded `unit_test_text_2` sample, not the
# text entered in the text area, and rebinds the module-level
# `text_without_timestamps` that earlier held the user's cleaned input --
# confirm that is intended.
text_without_timestamps = remove_timestamps(unit_test_text_2)
top_words = extract_high_information_words(text_without_timestamps, 10)
st.markdown("**Top 10 High Information Words:**")
st.write(top_words)
st.markdown("**Relationship Graph:**")
# Nodes appear in frequency-rank order, each linked to the next.
display_relationship_graph(top_words)
|