ierhon committed on
Commit
410766c
·
1 Parent(s): d80c106

Upload 3 files

Browse files
Files changed (3) hide show
  1. dset_kel.txt +109 -0
  2. test_kel.py +34 -0
  3. train_kel.py +55 -0
dset_kel.txt ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Xai.→Xai.
2
+ hai.→Xai.
3
+ ... hai→Xai.
4
+ Ant, hai.→Xai.
5
+ Ant. Xai.→Xai.
6
+ Ant. hai.→Xai.
7
+ hai snepi→Xai.
8
+ snepi, Xai→Xai.
9
+ snepi, hai→Xai.
10
+ Xai snepi→Xai.
11
+ Tei trate?→At!
12
+ Tei noh?→Em trate!
13
+ tei not tonahe teies langa. haho.→👍️
14
+ Ameos noh.→Em trate!
15
+ Teies tai?→Emes tai aek! Teies?
16
+ Em eho. Tos.→Tos.
17
+ Eho tos...→Tos.
18
+ Eho hai...→Xai.
19
+ Eho nat...→Nat.
20
+ Eho at...→At!
21
+ Xai. Teies tai aek?→Emes tai aek! Teies?
22
+ Em nohame krate emes sait. Aek atea?→At!
23
+ Xame→Emta!
24
+ Xame, aek→Emta!
25
+ At→Aek.
26
+ Nat→Atemo
27
+ Tore nat!→Atemo
28
+ Tei tonahe teies tai aek?→At. Em notorama. Emes xuak tai nat.
29
+ Em knato teies tai xais.→Em nat tonahe. Em notorama.
30
+ Emta→Aek.
31
+ At. Em tane tas.→Aek
32
+ Tamos krate tei?→Ierhon krate em.
33
+ Tamos Ierhon?→Ierhon krate em.
34
+ Em tonahe.→Em eho aer toh nat mas.
35
+ Tei nat emta?→Em knato - at.
36
+ Tei knato?→Em tonahe mna toh.
37
+ Tei notorama?→At! Ierhon krate em glo em es haho.
38
+ Tei ChatGPT?→Nat, em mna notorama.
39
+ Tei eho xame toh.→Em nat tonahe mnor eho.
40
+ Tei eho hame ehat.→Em nat tonahe mnor eho.
41
+ Tei tonahe emes mnor eho?→Em nat tonahe mnor eho.
42
+ Em mnor eho?→Em nat tonahe mnor eho.
43
+ Eho emes mnor eho.→Em nat tonahe mnor eho.
44
+ Em tane atea nat.→Elc... Em knato... Leho aer lehyn trone.
45
+ Ec?→Em nat tonahe.
46
+ Tore?→Em nat tonahe.
47
+ Eho tore?→Em nat tonahe.
48
+ Tei aek.→Em mna notorama.
49
+ Tamos nat atemo?→Em.
50
+ Tei aer aek!→Em mna notorama.
51
+ Ant?→Teies eho atea?
52
+ Em tane atea nat→Eho glo em!
53
+ Teies ala aek lehyn trone?→Nymer N!
54
+ Teies ala aek lehaer trone?→Nymer N!
55
+ Ant, teies ala aek lehaer trone?→Nymer N!
56
+ Ant. Teies ala aek lehyn trone?→Nymer N!
57
+ Teies ala aek _ tamos?→Em nat tonahe.
58
+ Ant. Teies ala aek _ tamos?→Em nat tonahe.
59
+ Ant. Teies ala aek Nier automata tamos?→Em nat tonahe.
60
+ Ant. Teies ala aek Breaking bad tamos?→Em nat tonahe.
61
+ Ant. Teies ala aek Mentalist tamos?→Em nat tonahe.
62
+ J ala aek.→Atemo
63
+ I tonahe tas.→Elc eho tas!
64
+ Elc tane tas!→At!
65
+ Tei xais.→Atemo
66
+ 5+5=?→10
67
+ 2+2=?→4
68
+ 5*5=?→25
69
+ Tei nat mas eho!→Em nat mas. Eho .set_chance (1-100)
70
+ 👍️→Aek.
71
+ 🏹→Aek.
72
+ 😢→Xais.
73
+ 😭→Xais.
74
+ 😿→Xais.
75
+ 😁→Aek!
76
+ 😀→Aek!
77
+ 😃→Aek!
78
+ 😄→Aek!
79
+ 🤣→Xaho!
80
+ 😆→Xaho!
81
+ 😂→Xaho!
82
+ Xaho→Xaho!
83
+ Tei?→Em knato at.
84
+ Tei tonahe mna?→Em mna notorama.
85
+ A→C
86
+ C→E
87
+ E→I
88
+ I→K
89
+ 0→1
90
+ 1→2
91
+ 2→3
92
+ 3→4
93
+ 4→5
94
+ 5→6
95
+ 6→7
96
+ 7→8
97
+ 8→9
98
+ Tas es aek tai.→Aek!
99
+ Tos!→Tos.
100
+ Snepi, Tos!→Tos.
101
+ Snepi. Tos!→Tos.
102
+ Tos snepi!→Tos.
103
+ Em gouan.→Tos.
104
+ La tho sa ehk ra es mna...→ALA!!! ALA!!!
105
+ Tei tonahe nat→Xais.
106
+ Tei tonahe nat→Xais.
107
+ Eho aer lehaer trone?→At!
108
+ Snepi. Eho lehyn trone?→At!
109
+ Teies tehst?→Emes tehst es tho.
test_kel.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from keras.saving import load_model
3
+ from keras.preprocessing.text import Tokenizer
4
+ from keras_self_attention import SeqSelfAttention
5
+ from model_settings_kel import *
6
+ import json
7
+ from tokenizer import *
8
+
9
+
10
# Load the JSON dataset; its keys are the input texts used to fit the tokenizer below.
# The encoding is pinned to UTF-8 because the data contains emoji and other non-ASCII
# characters, and the platform default encoding (e.g. cp1252 on Windows) cannot be relied on.
with open(dataset_file, "r", encoding="utf-8") as f:
    dset = json.load(f)

# One candidate response per line; the list index is the line number returned by the model.
with open(responses_file, "r", encoding="utf-8") as f:
    lines = [x.rstrip("\n") for x in f]

# Rebuild the vocabulary from the dataset keys before any text is tokenized.
# NOTE(review): fit_on_texts comes from the star-imported `tokenizer` module — confirm
# it is deterministic so that token ids match the ones used during training.
fit_on_texts(list(dset.keys()))

# SeqSelfAttention is a custom layer, so it has to be registered for deserialization.
model = load_model("chatbot_kel.keras", custom_objects={"SeqSelfAttention": SeqSelfAttention})
19
+
20
def find_line_number(array):
    """Return the index of the largest value in *array*.

    Args:
        array: A non-empty, indexable sequence of comparable scores
            (a list or a 1-D NumPy array).

    Returns:
        The integer position of the maximum value. When several positions
        share the maximum, the lowest index is returned.

    Raises:
        IndexError: If *array* is empty.
    """
    if len(array) == 0:
        raise IndexError("find_line_number() needs at least one score")
    # A single linear pass replaces sorting every (value, index) pair;
    # max() keeps the first maximal element, so ties resolve to the lowest index.
    return max(range(len(array)), key=array.__getitem__)
22
+
23
def generate(text, verbose=1):
    """Return the response line the model selects for *text*.

    Args:
        text: The user's message.
        verbose: Passed through to ``model.predict`` as its verbosity setting.

    Returns:
        The entry of ``lines`` whose index has the highest predicted score.
    """
    token_ids = list(tokenize(text))
    # Right-pad with 0 and truncate so the input is always exactly inp_len ids long.
    fixed_length_ids = (token_ids + [0] * inp_len)[:inp_len]
    batch = np.array([fixed_length_ids])  # a batch containing this one message
    scores = model.predict(batch, verbose=verbose)[0]
    best_line = find_line_number(scores)
    return lines[best_line]
29
+
30
if __name__ == "__main__":
    # Interactive chat: runs only when the file is executed directly, not when imported.
    while True:
        user_message = input("User: ")
        reply = generate(user_message)
        if reply == "<null>":
            # "<null>" marks "no answer", so nothing is printed for it.
            continue
        print(f"Bot: {reply}")
train_kel.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import json
3
+ from keras.optimizers import Adam, SGD
4
+ from keras.models import Sequential
5
+ from keras.layers import Embedding, Dense, Dropout, Flatten, PReLU, GaussianNoise
6
+ from tokenizer import *
7
+ from keras_self_attention import SeqSelfAttention, SeqWeightedAttention
8
+ from model_settings_kel import *
9
+
10
+
11
# Load the JSON dataset that maps each input text to the index of its response line.
# The encoding is pinned to UTF-8 because the data contains emoji and other non-ASCII
# characters, and the platform default encoding (e.g. cp1252 on Windows) cannot be relied on.
with open(dataset_file, "r", encoding="utf-8") as f:
    dset = json.load(f)

# The number of response lines is the number of output classes of the model.
with open(responses_file, "r", encoding="utf-8") as f:  # TODO: add support to a json-only dataset
    dset_size = len(f.readlines())

# Build the vocabulary from the input texts before tokenizing anything.
fit_on_texts(list(dset.keys()))

# NOTE(review): ind2text is populated by the star-imported `tokenizer` module; the +1
# presumably reserves id 0, which is used as the padding value — confirm.
vocab_size = len(ind2text) + 1
20
+
21
# The network is a classifier: token ids in, one score per response line out.
model = Sequential([
    # Turns each of the inp_len token ids into an emb_size-dimensional vector.
    Embedding(input_dim=vocab_size, output_dim=emb_size, input_length=inp_len),
    # Self-attention over the embedded sequence, so each position can weigh the others.
    SeqSelfAttention(),
    # The attention output is still one vector per token; flatten it for the dense layers.
    Flatten(),
    # 512 linear units followed by PReLU, i.e. a ReLU-like activation with a learned negative slope.
    Dense(512, activation="linear"),
    PReLU(),
    # Regularisation: Dropout(0.5) and GaussianNoise(0.1) both act only during training.
    Dropout(0.5),
    GaussianNoise(0.1),
    Dense(256, activation="relu"),
    Dense(128, activation="relu"),
    # Softmax yields a probability distribution over the dset_size response lines.
    Dense(dset_size, activation="softmax"),
])

model.summary()
34
+
35
# Inputs: every dataset key is tokenized, right-padded with 0 and cut to exactly inp_len ids.
X = np.array([
    np.array((list(tokenize(question)) + [0] * inp_len)[:inp_len])
    for question in dset
])

# Targets: a one-hot row per example, with the 1 at the index of the expected response line.
y = []
for response_index in dset.values():
    one_hot = np.zeros(dset_size)
    one_hot[response_index] = 1
    y.append(one_hot)
y = np.array(y)  # stacked into one 2-D array, like X
47
+
48
# Categorical cross-entropy matches the softmax output and the one-hot targets in y.
model.compile(optimizer=Adam(learning_rate=0.001), loss="categorical_crossentropy", metrics=["accuracy"])

# epochs: how many passes are made over the whole dataset.
# batch_size: how many examples are processed per gradient update.
# Loss measures how far the predicted distribution is from the target (lower is better,
# no fixed upper bound); accuracy is the fraction of examples answered correctly (0 to 1).
#
# `workers` and `use_multiprocessing` are intentionally not passed: they only apply to
# generator / keras.utils.Sequence input, so they have no effect on in-memory NumPy
# arrays, and newer Keras releases reject them as unknown arguments.
model.fit(X, y, epochs=128, batch_size=10)

model.summary()  # printed again after training; useful for checking the parameter count

model.save("chatbot_kel.keras")