Spaces:

sakaltcommunity
/

Grape

Runtime error

App Files Files Community

Sakalti committed on Oct 12, 2024

Commit

d48bb37

verified ·

1 Parent(s): 9287bf6

Create app.py

Browse files

Files changed (1) hide show

app.py +86 -0

app.py ADDED Viewed

	@@ -0,0 +1,86 @@

+import gradio as gr
+from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
+from datasets import load_dataset, Dataset, DatasetDict
+import os
+def train_and_deploy(write_token, repo_name, license_text):
+    # トークンを環境変数に設定
+    os.environ['HF_WRITE_TOKEN'] = write_token
+    # ライセンスファイルを作成
+    with open("LICENSE", "w") as f:
+        f.write(license_text)
+    # モデルとトークナイザーの読み込み
+    model_name = "EleutherAI/pythia-14m"  # トレーニング対象のモデル
+    model = AutoModelForCausalLM.from_pretrained(model_name)
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    # FBK-MT/mosel データセットの読み込み
+    dataset = load_dataset("FBK-MT/mosel")
+    # データセットのキーを確認
+    print(f"Dataset keys: {dataset.keys()}")
+    if "train" not in dataset:
+        raise KeyError("The dataset does not contain a 'train' split.")
+    if "test" not in dataset:
+        raise KeyError("The dataset does not contain a 'test' split.")
+    # データセットの最初のエントリのキーを確認
+    print(f"Sample keys in 'train' split: {dataset['train'][0].keys()}")
+    # データセットのトークン化
+    def tokenize_function(examples):
+        try:
+            texts = examples['text']
+            return tokenizer(texts, padding="max_length", truncation=True, max_length=128)
+        except KeyError as e:
+            print(f"KeyError: {e}")
+            print(f"Available keys: {examples.keys()}")
+            raise
+    tokenized_datasets = dataset.map(tokenize_function, batched=True)
+    # トレーニング設定
+    training_args = TrainingArguments(
+        output_dir="./results",
+        per_device_train_batch_size=8,
+        per_device_eval_batch_size=8,
+        evaluation_strategy="epoch",
+        save_strategy="epoch",
+        logging_dir="./logs",
+        logging_steps=10,
+        num_train_epochs=3,  # トレーニングエポック数
+        push_to_hub=True,  # Hugging Face Hubにプッシュ
+        hub_token=write_token,
+        hub_model_id=repo_name  # ユーザーが入力したリポジトリ名
+    )
+    # Trainerの設定
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=tokenized_datasets["train"],
+        eval_dataset=tokenized_datasets["test"],
+    )
+    # トレーニング実行
+    trainer.train()
+    # モデルをHugging Face Hubにプッシュ
+    trainer.push_to_hub()
+    return f"モデルが'{repo_name}'リポジトリにデプロイされました！"
+# Gradio UI
+with gr.Blocks() as demo:
+    gr.Markdown("### pythia トレーニングとデプロイ")
+    token_input = gr.Textbox(label="Hugging Face Write Token", placeholder="トークンを入力してください...")
+    repo_input = gr.Textbox(label="リポジトリ名", placeholder="デプロイするリポジトリ名を入力してください...")
+    license_input = gr.Textbox(label="ライセンス", placeholder="ライセンス情報を入力してください...")
+    output = gr.Textbox(label="出力")
+    train_button = gr.Button("デプロイ")
+    train_button.click(fn=train_and_deploy, inputs=[token_input, repo_input, license_input], outputs=output)
+demo.launch()