abdiharyadi committed on
Commit
834f6fb
1 Parent(s): 7894523

feat: add one-by-one generation to make it more alive

Browse files
Files changed (1) hide show
  1. app.py +121 -79
app.py CHANGED
@@ -8,104 +8,146 @@ kancilgpt = GPT2LMHeadModel.from_pretrained("abdiharyadi/kancilgpt")
8
 
9
  def generate_story():
10
  stop = False
11
- ellipsis = "..."
12
- while not stop:
13
- yield f"(memuat cerita {ellipsis})"
14
- ellipsis += "."
15
-
16
- gpt_input = gpt_tokenizer('<s> awal cerita | judul:', return_tensors='pt')
17
- gpt_out = kancilgpt.generate(
18
- **gpt_input,
19
- do_sample=True,
20
- max_length=512,
21
- pad_token_id=gpt_tokenizer.eos_token_id
22
- )
23
- result = gpt_tokenizer.decode(gpt_out[0])
24
- _, judul_prompt, isi, *end_part = result.split(" | ")
25
- end_part = "".join(end_part)
26
- _, *judul_words = judul_prompt.split()
27
- judul = " ".join(judul_words)
28
-
29
-
30
- if "</s>" in judul or "</s>" in isi or "|" in isi or (not any(end_part.startswith(x) for x in ["bersambung", "tamat"])):
31
- print("Invalid output! Regenerating ....")
32
- continue
33
-
34
 
35
- quote_count = 0
36
- for c in isi:
37
- if c == "\"":
38
- quote_count += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
- if quote_count % 2 != 0:
41
- print("Invalid output! Regenerating ....")
42
- continue
 
 
 
43
 
44
  stop = True
45
 
46
  total_isi = isi
47
 
48
- ellipsis = "..."
49
- while not end_part.startswith("tamat"):
50
- yield judul + "\n" + ("-" * len(judul)) + "\n" + total_isi + f" {ellipsis}"
51
- ellipsis += "."
52
-
53
- i = 0
54
- in_quote = False
55
- end_sentence = False
56
- limit = 1750
57
- while i < len(isi) and not (end_sentence and (not in_quote) and isi[i] == " " and (len(isi) - i) < limit):
58
- if isi[i] == "\"":
59
- in_quote = not in_quote
60
-
61
- if end_sentence:
62
- end_sentence = isi[i] not in "abcdefghijklmnopqrstuvwxyz"
63
- else:
64
- end_sentence = isi[i] in ".?!"
 
65
 
66
- i += 1
67
- # i == len(isi) or end_sentence or (not in_quote) or isi[i] == " "
68
 
69
- while i < len(isi) and not (isi[i] in "abcdefghijklmnopqrstuvwxyz\""):
70
- i += 1
71
- # i == len(isi) or isi[i] in "abcdefghijklmnopqrstuvwxyz\""
72
 
73
- if i == len(isi):
74
- raise ValueError("What???")
75
 
76
- next_isi = isi[i:]
77
 
78
- stop = False
79
- while not stop:
80
- gpt_input = gpt_tokenizer(f'<s> pertengahan cerita | judul: {judul} | {next_isi}', return_tensors='pt')
81
- gpt_out = kancilgpt.generate(**gpt_input, do_sample=True, max_length=512, pad_token_id=gpt_tokenizer.eos_token_id)
82
- result = gpt_tokenizer.decode(gpt_out[0])
83
 
84
- _, judul_prompt, isi, *end_part = result.split(" | ")
85
- end_part = "".join(end_part)
86
- _, *judul_words = judul_prompt.split()
87
- judul = " ".join(judul_words)
88
 
89
- if isi[len(next_isi) + 1:].strip() != "":
90
- print(isi[len(next_isi) + 1:])
91
 
92
- if "</s>" in isi or "|" in isi or (not any(end_part.startswith(x) for x in ["bersambung", "tamat"])):
93
- print("Invalid output! Regenerating ....")
94
- continue
95
 
96
- quote_count = 0
97
- for c in isi:
98
- if c == "\"":
99
- quote_count += 1
100
 
101
- if quote_count % 2 != 0:
102
- print("Invalid output! Regenerating ....")
103
- continue
104
 
105
- stop = True
106
 
107
- total_isi += " " + isi[len(next_isi) + 1:]
108
- ellipsis = "..."
109
 
110
  yield judul + "\n" + ("-" * len(judul)) + "\n" + total_isi + "\n\ntamat."
111
 
 
8
 
9
  def generate_story():
10
  stop = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
+ prompt = "<s> awal cerita | judul:"
13
+ judul = ""
14
+ isi = ""
15
+ end_part = ""
16
+ isi_not_checked = True
17
+
18
+ yield "..."
19
+ while not stop:
20
+ prompt_stop = False
21
+ while not prompt_stop:
22
+ gpt_input = gpt_tokenizer(prompt, return_tensors='pt')
23
+ gpt_out = kancilgpt.generate(
24
+ **gpt_input,
25
+ do_sample=True,
26
+ max_new_tokens=1,
27
+ pad_token_id=gpt_tokenizer.eos_token_id,
28
+ eos_token_id=gpt_tokenizer.eos_token_id
29
+ )
30
+ gpt_out = gpt_out[0]
31
+
32
+ result = gpt_tokenizer.decode(gpt_out)
33
+ splitted_result = result.split(" | ")
34
+ if len(splitted_result) <= 2:
35
+ _, judul_prompt = splitted_result
36
+ _, *judul_words = judul_prompt.split()
37
+ judul = " ".join(judul_words)
38
+ isi = ""
39
+ end_part = ""
40
+
41
+ yield judul + "..."
42
+
43
+ if gpt_out[-1] == gpt_tokenizer.eos_token_id:
44
+ continue
45
+ else:
46
+ _, judul_prompt, isi, *end_part = splitted_result
47
+ end_part = "".join(end_part)
48
+ _, *judul_words = judul_prompt.split()
49
+ judul = " ".join(judul_words)
50
+
51
+ yield judul + "\n" + ("-" * len(judul)) + "\n" + isi + f"..."
52
+
53
+ if len(splitted_result) == 3:
54
+ if gpt_out[-1] == gpt_tokenizer.eos_token_id:
55
+ continue
56
+
57
+ elif isi_not_checked:
58
+ quote_count = 0
59
+ prev_i = 0
60
+ for i, c in enumerate(isi):
61
+ if c == "\"":
62
+ quote_count += 1
63
+ prev_i = i
64
+
65
+ if quote_count % 2 != 0:
66
+ print("Invalid isi!")
67
+ trimmed_isi = isi[:prev_i].strip()
68
+ prompt = f"<s> awal cerita | judul: {judul} | {trimmed_isi}"
69
+ continue
70
+
71
+ isi_not_checked = False
72
+
73
+ if gpt_out[-1] == gpt_tokenizer.eos_token_id or len(gpt_out) == 512:
74
+ prompt_stop = True
75
+ else:
76
+ prompt = result
77
 
78
+ # prompt_stop
79
+
80
+ if (not any(end_part.startswith(x) for x in ["bersambung", "tamat"])):
81
+ print("Invalid ending! Regenerating ....")
82
+ prompt = f"<s> awal cerita | judul: {judul} | {isi} |"
83
+ continue
84
 
85
  stop = True
86
 
87
  total_isi = isi
88
 
89
+ print("We skip the rest of the part for debug.")
90
+ # ellipsis = "..."
91
+ # while not end_part.startswith("tamat"):
92
+ # yield judul + "\n" + ("-" * len(judul)) + "\n" + total_isi + f" {ellipsis}"
93
+ # ellipsis += "."
94
+
95
+ # i = 0
96
+ # in_quote = False
97
+ # end_sentence = False
98
+ # limit = 1750
99
+ # while i < len(isi) and not (end_sentence and (not in_quote) and isi[i] == " " and (len(isi) - i) < limit):
100
+ # if isi[i] == "\"":
101
+ # in_quote = not in_quote
102
+
103
+ # if end_sentence:
104
+ # end_sentence = isi[i] not in "abcdefghijklmnopqrstuvwxyz"
105
+ # else:
106
+ # end_sentence = isi[i] in ".?!"
107
 
108
+ # i += 1
109
+ # # i == len(isi) or end_sentence or (not in_quote) or isi[i] == " "
110
 
111
+ # while i < len(isi) and not (isi[i] in "abcdefghijklmnopqrstuvwxyz\""):
112
+ # i += 1
113
+ # # i == len(isi) or isi[i] in "abcdefghijklmnopqrstuvwxyz\""
114
 
115
+ # if i == len(isi):
116
+ # raise ValueError("What???")
117
 
118
+ # next_isi = isi[i:]
119
 
120
+ # stop = False
121
+ # while not stop:
122
+ # gpt_input = gpt_tokenizer(f'<s> pertengahan cerita | judul: {judul} | {next_isi}', return_tensors='pt')
123
+ # gpt_out = kancilgpt.generate(**gpt_input, do_sample=True, max_length=512, pad_token_id=gpt_tokenizer.eos_token_id)
124
+ # result = gpt_tokenizer.decode(gpt_out[0])
125
 
126
+ # _, judul_prompt, isi, *end_part = result.split(" | ")
127
+ # end_part = "".join(end_part)
128
+ # _, *judul_words = judul_prompt.split()
129
+ # judul = " ".join(judul_words)
130
 
131
+ # if isi[len(next_isi) + 1:].strip() != "":
132
+ # print(isi[len(next_isi) + 1:])
133
 
134
+ # if "</s>" in isi or "|" in isi or (not any(end_part.startswith(x) for x in ["bersambung", "tamat"])):
135
+ # print("Invalid output! Regenerating ....")
136
+ # continue
137
 
138
+ # quote_count = 0
139
+ # for c in isi:
140
+ # if c == "\"":
141
+ # quote_count += 1
142
 
143
+ # if quote_count % 2 != 0:
144
+ # print("Invalid output! Regenerating ....")
145
+ # continue
146
 
147
+ # stop = True
148
 
149
+ # total_isi += " " + isi[len(next_isi) + 1:]
150
+ # ellipsis = "..."
151
 
152
  yield judul + "\n" + ("-" * len(judul)) + "\n" + total_isi + "\n\ntamat."
153