import gradio as gr
from transformers import GPT2LMHeadModel
from indobenchmark import IndoNLGTokenizer

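# Load the IndoGPT tokenizer and the KancilGPT story generation model.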
gpt_tokenizer = IndoNLGTokenizer.from_pretrained("indobenchmark/indogpt")
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token
kancilgpt = GPT2LMHeadModel.from_pretrained("abdiharyadi/kancilgpt")

def generate_story():
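    """Generate a story with KancilGPT and yield partial text so the Gradio
    UI can stream it while generation is still running."""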
    stop = False

    prompt = "<s> awal cerita | judul:"
    judul = ""
    isi = ""
    end_part = ""
    isi_not_checked = True

    yield "..."
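    # Outer loop: retry until the story ends with a valid marker ("bersambung"
    # or "tamat"). Inner loop: extend the prompt a couple of tokens at a time
    # so the partial text can be streamed to the UI.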
    while not stop:
        prompt_stop = False
        while not prompt_stop:
            gpt_input = gpt_tokenizer(prompt, return_tensors='pt')
            gpt_out = kancilgpt.generate(
                **gpt_input,
                do_sample=True,
                max_new_tokens=2,
                pad_token_id=gpt_tokenizer.eos_token_id,
                eos_token_id=gpt_tokenizer.eos_token_id
            )
            gpt_out = gpt_out[0]

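            # The decoded output follows the training format:
            # "<s> awal cerita | judul: <title> | <story body> | <ending marker>"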
            result = gpt_tokenizer.decode(gpt_out)
            splitted_result = result.split(" | ")
            if len(splitted_result) <= 2:
                _, judul_prompt = splitted_result
                _, *judul_words = judul_prompt.split()

                judul = " ".join(judul_words)
                yield judul + "..."
                if "." in judul:
                    print("Invalid judul!")
                    prompt = "<s> awal cerita | judul:"
                    continue

                isi = ""
                end_part = ""

                if gpt_out[-1] == gpt_tokenizer.eos_token_id:
                    continue
            else:
                _, judul_prompt, isi, *end_part = splitted_result
                end_part = "".join(end_part)
                _, *judul_words = judul_prompt.split()
                judul = " ".join(judul_words)

                yield judul + "\n" + ("-" * len(judul)) + "\n" + isi + "..."

                if len(splitted_result) == 3:
                    if gpt_out[-1] == gpt_tokenizer.eos_token_id:
                        continue

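                # Validate the story body once: an odd number of double quotes
                # means an unterminated quotation, so trim back to the last
                # quote and regenerate from there.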
                elif isi_not_checked:
                    quote_count = 0
                    prev_i = 0
                    for i, c in enumerate(isi):
                        if c == "\"":
                            quote_count += 1
                            prev_i = i

                    if quote_count % 2 != 0:
                        print("Invalid isi!")
                        trimmed_isi = isi[:prev_i].rstrip()
                        prompt = f"<s> awal cerita | judul: {judul} | {trimmed_isi}"
                        continue

                    isi_not_checked = False

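            # EOS means the model finished the story; otherwise feed the
            # decoded text back in as the next prompt and keep extending it.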
            if gpt_out[-1] == gpt_tokenizer.eos_token_id:
                prompt_stop = True
            else:
                prompt = result

        # prompt_stop: the model emitted EOS, so judul, isi, and end_part now
        # hold a complete candidate story.
        
        # A complete story must end with "bersambung" (to be continued) or
        # "tamat" (the end); anything else means the draft is invalid.
        if not any(end_part.startswith(x) for x in ["bersambung", "tamat"]):
            print("Invalid ending! Regenerating ....")
            prompt = f"<s> awal cerita | judul: {judul} | {isi} |"
            continue

        stop = True

    total_isi = isi

    print("We skip the rest of the part for debug.")

    # TODO: Solve this.
    # ellipsis = "..."
    # while not end_part.startswith("tamat"):
    #     yield judul + "\n" + ("-" * len(judul)) + "\n" + total_isi + f" {ellipsis}"
    #     ellipsis += "."

    #     i = 0
    #     in_quote = False
    #     end_sentence = False
    #     limit = 1750
    #     while i < len(isi) and not (end_sentence and (not in_quote) and isi[i] == " " and (len(isi) - i) < limit):
    #         if isi[i] == "\"":
    #             in_quote = not in_quote

    #         if end_sentence:
    #             end_sentence = isi[i] not in "abcdefghijklmnopqrstuvwxyz"
    #         else:
    #             end_sentence = isi[i] in ".?!"

    #         i += 1
    #     # i == len(isi) or end_sentence or (not in_quote) or isi[i] == " "

    #     while i < len(isi) and not (isi[i] in "abcdefghijklmnopqrstuvwxyz\""):
    #         i += 1
    #     # i == len(isi) or isi[i] in "abcdefghijklmnopqrstuvwxyz\""

    #     if i == len(isi):
    #         raise ValueError("What???")

    #     next_isi = isi[i:]

    #     stop = False
    #     while not stop:
    #         gpt_input = gpt_tokenizer(f'<s> pertengahan cerita | judul: {judul} | {next_isi}', return_tensors='pt')
    #         gpt_out = kancilgpt.generate(**gpt_input, do_sample=True, max_length=512, pad_token_id=gpt_tokenizer.eos_token_id)
    #         result = gpt_tokenizer.decode(gpt_out[0])

    #         _, judul_prompt, isi, *end_part = result.split(" | ")
    #         end_part = "".join(end_part)
    #         _, *judul_words = judul_prompt.split()
    #         judul = " ".join(judul_words)

    #         if isi[len(next_isi) + 1:].strip() != "":
    #             print(isi[len(next_isi) + 1:])

    #         if "</s>" in isi or "|" in isi or (not any(end_part.startswith(x) for x in ["bersambung", "tamat"])):
    #             print("Invalid output! Regenerating ....")
    #             continue

    #         quote_count = 0
    #         for c in isi:
    #             if c == "\"":
    #                 quote_count += 1

    #         if quote_count % 2 != 0:
    #             print("Invalid output! Regenerating ....")
    #             continue

    #         stop = True

    #     total_isi += " " + isi[len(next_isi) + 1:]
    #     ellipsis = "..."

    yield judul + "\n" + ("-" * len(judul)) + "\n" + total_isi + "\n\ntamat."

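# Gradio treats generate_story as a generator, streaming each yielded string
# into the Textbox so the story builds up incrementally.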
demo = gr.Interface(
    fn=generate_story,
    inputs=None,
    outputs=[
        gr.Textbox(label="cerita", lines=7)
    ]
)

demo.launch()