Spaces:
Running
Running
abdiharyadi
committed on
Commit
•
834f6fb
1
Parent(s):
7894523
feat: add one-by-one generation to make it more alive
Browse files
app.py
CHANGED
@@ -8,104 +8,146 @@ kancilgpt = GPT2LMHeadModel.from_pretrained("abdiharyadi/kancilgpt")
|
|
8 |
|
9 |
def generate_story():
|
10 |
stop = False
|
11 |
-
ellipsis = "..."
|
12 |
-
while not stop:
|
13 |
-
yield f"(memuat cerita {ellipsis})"
|
14 |
-
ellipsis += "."
|
15 |
-
|
16 |
-
gpt_input = gpt_tokenizer('<s> awal cerita | judul:', return_tensors='pt')
|
17 |
-
gpt_out = kancilgpt.generate(
|
18 |
-
**gpt_input,
|
19 |
-
do_sample=True,
|
20 |
-
max_length=512,
|
21 |
-
pad_token_id=gpt_tokenizer.eos_token_id
|
22 |
-
)
|
23 |
-
result = gpt_tokenizer.decode(gpt_out[0])
|
24 |
-
_, judul_prompt, isi, *end_part = result.split(" | ")
|
25 |
-
end_part = "".join(end_part)
|
26 |
-
_, *judul_words = judul_prompt.split()
|
27 |
-
judul = " ".join(judul_words)
|
28 |
-
|
29 |
-
|
30 |
-
if "</s>" in judul or "</s>" in isi or "|" in isi or (not any(end_part.startswith(x) for x in ["bersambung", "tamat"])):
|
31 |
-
print("Invalid output! Regenerating ....")
|
32 |
-
continue
|
33 |
-
|
34 |
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
|
40 |
-
|
41 |
-
|
42 |
-
|
|
|
|
|
|
|
43 |
|
44 |
stop = True
|
45 |
|
46 |
total_isi = isi
|
47 |
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
|
|
65 |
|
66 |
-
|
67 |
-
|
68 |
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
|
73 |
-
|
74 |
-
|
75 |
|
76 |
-
|
77 |
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
|
89 |
-
|
90 |
-
|
91 |
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
|
105 |
-
|
106 |
|
107 |
-
|
108 |
-
|
109 |
|
110 |
yield judul + "\n" + ("-" * len(judul)) + "\n" + total_isi + "\n\ntamat."
|
111 |
|
|
|
8 |
|
9 |
def generate_story():
    """Stream a generated story, sampling one new token per model call.

    This is a generator. It first yields the placeholder ``"..."`` and then,
    after every sampled token, a progress string: ``"<title>..."`` while only
    the title segment exists, or the title, a dashed underline and the body so
    far (followed by ``"..."``) once the body segment has started. The last
    value yielded is the finished story followed by ``"\\n\\ntamat."``.

    The model text is handled as ``" | "``-separated segments in the form
    ``<s> awal cerita | judul: <title> | <body> | <ending marker>``; a
    generation is accepted only when the ending segment starts with
    ``"bersambung"`` or ``"tamat"``.

    Relies on the module-level ``gpt_tokenizer`` and ``kancilgpt`` objects
    (``gpt_tokenizer`` is defined outside this block -- presumably the
    tokenizer matching the model; confirm at its definition).
    """
    prompt = "<s> awal cerita | judul:"
    judul = ""
    isi = ""
    end_part = ""
    quotes_unverified = True

    yield "..."

    while True:
        # Grow the text token by token until the model emits EOS or the
        # sequence is exactly 512 tokens long.
        while True:
            encoded = gpt_tokenizer(prompt, return_tensors='pt')
            token_ids = kancilgpt.generate(
                **encoded,
                do_sample=True,
                max_new_tokens=1,
                pad_token_id=gpt_tokenizer.eos_token_id,
                eos_token_id=gpt_tokenizer.eos_token_id,
            )[0]
            hit_eos = token_ids[-1] == gpt_tokenizer.eos_token_id

            text = gpt_tokenizer.decode(token_ids)
            segments = text.split(" | ")
            title_only = len(segments) <= 2

            if title_only:
                _, judul_prompt = segments
                isi = ""
                end_part = ""
            else:
                _, judul_prompt, isi, *ending_segments = segments
                end_part = "".join(ending_segments)

            # Drop the leading "judul:" word; the remaining words are the title.
            _, *judul_words = judul_prompt.split()
            judul = " ".join(judul_words)

            if title_only:
                yield judul + "..."

                # EOS before any body exists: sample again from the same prompt.
                if hit_eos:
                    continue
            else:
                yield judul + "\n" + ("-" * len(judul)) + "\n" + isi + "..."

                if len(segments) == 3:
                    # EOS while still inside the body (no ending segment yet):
                    # sample again from the same prompt.
                    if hit_eos:
                        continue
                elif quotes_unverified:
                    # An ending segment has appeared, so the body is complete.
                    # If it holds an odd number of double quotes, cut it just
                    # before the last quote and keep generating from there.
                    if isi.count("\"") % 2 != 0:
                        print("Invalid isi!")
                        trimmed_isi = isi[:isi.rfind("\"")].strip()
                        prompt = f"<s> awal cerita | judul: {judul} | {trimmed_isi}"
                        continue

                    quotes_unverified = False

            if hit_eos or len(token_ids) == 512:
                break
            prompt = text

        if end_part.startswith(("bersambung", "tamat")):
            break

        # No valid ending marker: restart from the finished body with an open
        # ending segment so that only the ending is sampled again.
        print("Invalid ending! Regenerating ....")
        prompt = f"<s> awal cerita | judul: {judul} | {isi} |"

    total_isi = isi

    print("We skip the rest of the part for debug.")
    # TODO: multi-part continuation is disabled for debugging. The removed
    # draft looped while end_part did not start with "tamat": it picked a tail
    # of the current body that begins at a sentence boundary outside quotes
    # (tail shorter than 1750 characters), prompted the model with
    # '<s> pertengahan cerita | judul: {judul} | {tail}' (max_length=512),
    # rejected outputs containing "</s>" or "|" in the body, lacking a
    # "bersambung"/"tamat" ending, or with an odd number of double quotes,
    # and appended the newly generated text to total_isi.
    # NOTE: until that is restored, "tamat." is appended below even when the
    # accepted ending segment was "bersambung".

    yield judul + "\n" + ("-" * len(judul)) + "\n" + total_isi + "\n\ntamat."
|
153 |
|