|
--- |
|
license: apache-2.0 |
|
datasets: |
|
- HuggingFaceH4/ultrafeedback_binarized |
|
language: |
|
- en |
|
library_name: transformers |
|
pipeline_tag: question-answering |
|
tags: |
|
- human feedback |
|
- HH-RLHF |
|
- PPO |
|
- llama-1.3B |
|
--- |
|
|
|
# RLHF with PPOTrainer and LoRA |
|
|
|
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64c0be34e175dd56a57151ca/piXU-OqDgrBKs7qR7fICw.png) |
|
|
|
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64c0be34e175dd56a57151ca/OXD2TqlQQY9NuC7JTiv_H.png) |
|
|
|
# Hyperparameters |
|
|
|
## PPO |
|
learning_rate=5e-6, |
|
batch_size=32, |
|
mini_batch_size=1, |
|
horizon=10000, |
|
cliprange=0.2, |
|
cliprange_value=0.2, |
|
lam=0.95, |
|
target_kl=2, |
|
use_score_scaling=True, |
|
log_with='wandb' |
|
|
|
## LoRA |
|
r=16, |
|
lora_alpha=32, |
|
lora_dropout=0.05, |
|
bias="none", |
|
task_type="CAUSAL_LM", |
|
|