goodmodeler committed on
Commit c1c9e88 · 1 Parent(s): 61bd54d

ADD: LLM SFT, RLHF and Distillation

README.md CHANGED
@@ -45,9 +45,6 @@ fine tune a trained model: --pretrained_model_name_or_path="./nyc-ad-model/check
 
  export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 
- import torch
- torch.cuda.empty_cache()
- torch.cuda.reset_peak_memory_stats()
 
  pipeline:
  # 1 Fully Fine‑tune image model with ZeRO
@@ -73,7 +70,6 @@ python ppo_tune.py
  python rag_infer.py
 
 
-
  system flow:
  input: business or product description text
  1. use RAG to retrieve embeddings for the input
@@ -81,4 +77,9 @@ input: business or product description text
  2. GPT‑OSS expands the selected copy into extended visual prompts (subject, color palette, camera angle, art style)
  3. stablediffusion model generates 4 draft images (optional ControlNet-Layout/Logo insertion)
  4. return 4 posters plus post-processing
- output: an advertisement sentence and post image
+ output: an advertisement sentence and post image
+
+
+ design details:
+ LoRA fine tune teacher OSS 120B model using smangrul/ad-copy-generation (ad-copy generation)
+ LoRA distill knowledge to OSS 20B model
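
The design details above distill a LoRA-tuned OSS 120B teacher into an OSS 20B student. For reference, a minimal sketch of the temperature-scaled distillation loss this implies, in plain PyTorch, mirroring the temperature/alpha/beta defaults used later in distill_llm.py; the toy tensors and vocabulary size are illustrative only:

import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, labels,
                      temperature=4.0, alpha=0.7, beta=0.3):
    # Soft-target term: KL divergence between softened teacher and student distributions.
    kd = F.kl_div(
        F.log_softmax(student_logits / temperature, dim=-1),
        F.softmax(teacher_logits / temperature, dim=-1),
        reduction="batchmean",
    ) * (temperature ** 2)
    # Hard-target term: ordinary cross-entropy against the ground-truth labels.
    ce = F.cross_entropy(student_logits, labels)
    return alpha * kd + beta * ce

# Toy usage over an 8-example batch with a 10-token vocabulary.
student = torch.randn(8, 10)
teacher = torch.randn(8, 10)
labels = torch.randint(0, 10, (8,))
print(distillation_loss(student, teacher, labels))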
fine_tune_llm/ppo_tune_llm.py DELETED
@@ -1,19 +0,0 @@
- from trl import PPOTrainer, PPOConfig
- from peft import PeftModel
- import torch, random, json, glob
- from diffusers import StableDiffusionPipeline
- from reward_model import CLIPModel, CLIPProcessor
-
- rm=CLIPModel.from_pretrained("rm").eval().half().cuda()
- proc=CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
- pipe=StableDiffusionPipeline.from_pretrained("./nyc-ad-model",torch_dtype=torch.float16).to("cuda")
- ppo_cfg=PPOConfig(batch_size=1,learning_rate=1e-6,target_kl=0.2)
- trainer=PPOTrainer(model=pipe.unet, reward_model=rm, config=ppo_cfg)
-
- prompts=[l.strip() for l in open("prompt.txt")]
- for step in range(500):
-     p=random.choice(prompts)
-     img=pipe(p,num_inference_steps=20).images[0]
-     reward=rm(**proc(text=p,images=img,return_tensors="pt").to("cuda")).logits[0,0].item()
-     trainer.step(prompts=[p], rewards=[reward])
- pipe.save_pretrained("nyc-ad-model-rlhf")
fine_tune_llm/reward_model.py DELETED
@@ -1,21 +0,0 @@
- from transformers import CLIPProcessor, CLIPModel, TrainingArguments, Trainer
- import datasets, torch, json, glob
-
- model=CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
- processor=CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
-
- data=[]
- for f in glob.glob("human_prefs/*.json"):
-     j=json.load(open(f)); data.append(j)  # {"prompt":…, "good":img_path, "bad":img_path}
-
- dataset=datasets.Dataset.from_list(data)
-
- def preprocess(ex):
-     inputs=processor(text=[ex["prompt"]*2], images=[ex["good"],ex["bad"]], return_tensors="pt")
-     inputs["labels"]=torch.tensor([1,0])
-     return inputs
-
- dataset=dataset.map(preprocess,remove_columns=dataset.column_names)
- args=TrainingArguments("rm_ckpt",per_device_train_batch_size=2,fp16=True,learning_rate=5e-6,epochs=3)
- trainer=Trainer(model,args,train_dataset=dataset)
- trainer.train(); model.save_pretrained("rm")
fine_tune_llm/sft_llm_train.py DELETED
@@ -1,41 +0,0 @@
- import torch, json
- from datasets import load_dataset, Dataset
- from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
- from peft import get_peft_model, LoraConfig, TaskType
-
- # Load your dataset
- data = [json.loads(l) for l in open("data/sft_data.jsonl")]
- dataset = Dataset.from_list(data)
-
- # Load model & tokenizer
- base_model = "meta-llama/Llama-2-7b-hf"  # Or use Mistral, Falcon, etc.
- tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
- model = AutoModelForCausalLM.from_pretrained(base_model, torch_dtype=torch.float16)
-
- # Add LoRA (optional)
- lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM, r=8, lora_alpha=32, lora_dropout=0.05,
-                          target_modules=["q_proj", "v_proj"])
- model = get_peft_model(model, lora_config)
-
- # Preprocessing
- def tokenize(example):
-     prompt = f"### Instruction:\n{example['prompt']}\n\n### Response:\n{example['output']}"
-     return tokenizer(prompt, truncation=True, max_length=512, padding="max_length")
- dataset = dataset.map(tokenize, remove_columns=dataset.column_names)
-
- # Training setup
- args = TrainingArguments(
-     output_dir="./sft-model",
-     per_device_train_batch_size=2,
-     num_train_epochs=3,
-     fp16=True,
-     evaluation_strategy="no",
-     save_strategy="epoch",
-     logging_steps=20,
-     learning_rate=2e-5,
-     report_to="tensorboard",
- )
-
- data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
- trainer = Trainer(model=model, args=args, train_dataset=dataset, data_collator=data_collator)
- trainer.train()
{fine_tune_stablediffusion → fully_fine_tune_stablediffusion}/train_lora.py RENAMED
File without changes
lauguage_model_fine_tuning/accelerate_config.yaml ADDED
@@ -0,0 +1,23 @@
+ # accelerate_config.yaml - multi-GPU training configuration
+
+ compute_environment: LOCAL_MACHINE
+ distributed_type: MULTI_GPU
+ downcast_bf16: 'no'
+ gpu_ids: all
+ machine_rank: 0
+ main_training_function: main
+ mixed_precision: fp16
+ num_machines: 1
+ num_processes: 4  # adjust to the number of available GPUs
+ rdzv_backend: static
+ same_network: true
+ tpu_env: []
+ tpu_use_cluster: false
+ tpu_use_sudo: false
+ use_cpu: false
+
+ # RLHF-specific settings
+ gradient_accumulation_steps: 8
+ gradient_clipping: 1.0
+ learning_rate: 1e-5
+ dataloader_drop_last: true
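
The config above is meant to be consumed via accelerate launch --config_file accelerate_config.yaml <training_script>.py. A minimal, self-contained sketch of how the RLHF-specific values (gradient accumulation, gradient clipping, learning rate, drop_last) would surface inside an Accelerator-driven loop; the toy model, data, and loss below are illustrative only, not part of this repo:

import torch
from accelerate import Accelerator

accelerator = Accelerator(gradient_accumulation_steps=8)    # mirrors gradient_accumulation_steps
model = torch.nn.Linear(16, 1)                              # placeholder model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)  # mirrors learning_rate
loader = torch.utils.data.DataLoader(torch.randn(64, 16), batch_size=8, drop_last=True)  # drop_last as in the YAML
model, optimizer, loader = accelerator.prepare(model, optimizer, loader)

for batch in loader:
    with accelerator.accumulate(model):                     # accumulate gradients across micro-batches
        loss = model(batch).pow(2).mean()                   # placeholder loss
        accelerator.backward(loss)
        if accelerator.sync_gradients:                      # only clip on real optimizer steps
            accelerator.clip_grad_norm_(model.parameters(), 1.0)  # mirrors gradient_clipping
        optimizer.step()
        optimizer.zero_grad()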
lauguage_model_fine_tuning/distillation/distill_llm.py ADDED
@@ -0,0 +1,485 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Teacher-Student知识蒸馏脚本
4
+ 将经过SFT+PPO RLHF的Teacher模型蒸馏到更小的Student模型
5
+ """
6
+
7
+ import os
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from torch.utils.data import DataLoader, Dataset
11
+ from transformers import (
12
+ AutoModelForCausalLM,
13
+ AutoTokenizer,
14
+ TrainingArguments,
15
+ Trainer,
16
+ DataCollatorForLanguageModeling,
17
+ logging,
18
+ )
19
+ from datasets import load_dataset, Dataset as HFDataset
20
+ from peft import LoraConfig, get_peft_model, TaskType
21
+ import numpy as np
22
+ import wandb
23
+ from typing import Dict, List, Any, Optional
24
+ import json
25
+ from tqdm import tqdm
26
+ import warnings
27
+
28
+ warnings.filterwarnings("ignore")
29
+ logging.set_verbosity(logging.CRITICAL)
30
+
31
+ class DistillationConfig:
32
+ """蒸馏训练配置"""
33
+ # 模型路径
34
+ teacher_model_path = "./rlhf_teacher_model" # RLHF后的Teacher模型
35
+ student_model_name = "microsoft/DialoGPT-medium" # 替换为实际的OpenAI OSS 20B模型
36
+
37
+ # 蒸馏参数
38
+ temperature = 4.0 # 蒸馏温度
39
+ alpha = 0.7 # 蒸馏损失权重
40
+ beta = 0.3 # 学生损失权重
41
+ gamma = 0.1 # 特征匹配损失权重
42
+
43
+ # 训练参数
44
+ learning_rate = 1e-4
45
+ num_train_epochs = 3
46
+ per_device_train_batch_size = 2
47
+ per_device_eval_batch_size = 4
48
+ gradient_accumulation_steps = 8
49
+ warmup_ratio = 0.1
50
+ weight_decay = 0.01
51
+ logging_steps = 50
52
+ eval_steps = 500
53
+ save_steps = 1000
54
+
55
+ # LoRA配置(为Student模型添加LoRA以提高训练效率)
56
+ use_lora = True
57
+ lora_r = 32
58
+ lora_alpha = 64
59
+ lora_dropout = 0.1
60
+
61
+ # 数据配置
62
+ max_length = 512
63
+ num_distill_samples = 10000 # 用于蒸馏的样本数量
64
+
65
+ # 输出配置
66
+ output_dir = "./distilled_student_model"
67
+ run_name = "teacher-student-distillation"
68
+
69
+ class DistillationDataset(Dataset):
70
+ """蒸馏数据集类"""
71
+
72
+ def __init__(self, teacher_outputs: List[Dict], tokenizer, max_length: int = 512):
73
+ self.data = teacher_outputs
74
+ self.tokenizer = tokenizer
75
+ self.max_length = max_length
76
+
77
+ def __len__(self):
78
+ return len(self.data)
79
+
80
+ def __getitem__(self, idx):
81
+ item = self.data[idx]
82
+
83
+ # 构建完整的输入-输出序列
84
+ full_text = f"### Human: {item['prompt']}\n### Assistant: {item['response']}"
85
+
86
+ # Tokenize
87
+ encoded = self.tokenizer(
88
+ full_text,
89
+ truncation=True,
90
+ padding="max_length",
91
+ max_length=self.max_length,
92
+ return_tensors="pt"
93
+ )
94
+
95
+ return {
96
+ "input_ids": encoded["input_ids"].squeeze(),
97
+ "attention_mask": encoded["attention_mask"].squeeze(),
98
+ "teacher_logits": torch.tensor(item["teacher_logits"], dtype=torch.float),
99
+ "labels": encoded["input_ids"].squeeze()
100
+ }
101
+
102
+ class KnowledgeDistillationTrainer(Trainer):
103
+ """知识蒸馏训练器"""
104
+
105
+ def __init__(self, teacher_model, student_model, temperature=4.0, alpha=0.7, beta=0.3, gamma=0.1, **kwargs):
106
+ super().__init__(model=student_model, **kwargs)
107
+ self.teacher_model = teacher_model
108
+ self.teacher_model.eval() # 冻结Teacher模型
109
+
110
+ self.temperature = temperature
111
+ self.alpha = alpha # 蒸馏损失权重
112
+ self.beta = beta # 学生损失权重
113
+ self.gamma = gamma # 特征匹配损失权重
114
+
115
+ def compute_loss(self, model, inputs, return_outputs=False):
116
+ """计算蒸馏损失"""
117
+
118
+ labels = inputs.get("labels")
119
+ teacher_logits = inputs.get("teacher_logits").to(model.device)
120
+
121
+ # Student模型前向传播
122
+ student_outputs = model(**{k: v for k, v in inputs.items() if k not in ["teacher_logits"]})
123
+ student_logits = student_outputs.logits
124
+
125
+ # 计算各种损失
126
+ losses = {}
127
+
128
+ # 1. 标准语言模型损失 (学生模型自己的损失)
129
+ if labels is not None:
130
+ shift_logits = student_logits[..., :-1, :].contiguous()
131
+ shift_labels = labels[..., 1:].contiguous()
132
+ loss_fct = torch.nn.CrossEntropyLoss()
133
+ student_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
134
+ losses["student_loss"] = student_loss
135
+
136
+ # 2. 蒸馏损失 (KL散度)
137
+ if teacher_logits is not None:
138
+ # 确保维度匹配
139
+ if teacher_logits.shape != student_logits.shape:
140
+ min_seq_len = min(teacher_logits.shape[1], student_logits.shape[1])
141
+ teacher_logits = teacher_logits[:, :min_seq_len, :]
142
+ student_logits_for_distill = student_logits[:, :min_seq_len, :]
143
+ else:
144
+ student_logits_for_distill = student_logits
145
+
146
+ # 计算软标签概率
147
+ teacher_probs = F.softmax(teacher_logits / self.temperature, dim=-1)
148
+ student_log_probs = F.log_softmax(student_logits_for_distill / self.temperature, dim=-1)
149
+
150
+ # KL散度损失
151
+ distill_loss = F.kl_div(
152
+ student_log_probs,
153
+ teacher_probs,
154
+ reduction="batchmean"
155
+ ) * (self.temperature ** 2)
156
+
157
+ losses["distill_loss"] = distill_loss
158
+
159
+ # 3. 组合总损失
160
+ total_loss = 0
161
+ if "student_loss" in losses:
162
+ total_loss += self.beta * losses["student_loss"]
163
+ if "distill_loss" in losses:
164
+ total_loss += self.alpha * losses["distill_loss"]
165
+
166
+ # 记录各项损失
167
+ self.log({
168
+ "train/total_loss": total_loss.item(),
169
+ "train/student_loss": losses.get("student_loss", 0).item() if "student_loss" in losses else 0,
170
+ "train/distill_loss": losses.get("distill_loss", 0).item() if "distill_loss" in losses else 0,
171
+ })
172
+
173
+ return (total_loss, student_outputs) if return_outputs else total_loss
174
+
175
+ def prepare_student_model(config: DistillationConfig):
176
+ """准备Student模型"""
177
+ print("🎓 Preparing student model...")
178
+
179
+ # 加载Student基础模型
180
+ student_model = AutoModelForCausalLM.from_pretrained(
181
+ config.student_model_name,
182
+ torch_dtype=torch.float16,
183
+ device_map="auto",
184
+ trust_remote_code=True,
185
+ )
186
+
187
+ # 添加LoRA(可选,用于高效训练)
188
+ if config.use_lora:
189
+ print("🔧 Adding LoRA to student model...")
190
+ lora_config = LoraConfig(
191
+ task_type=TaskType.CAUSAL_LM,
192
+ inference_mode=False,
193
+ r=config.lora_r,
194
+ lora_alpha=config.lora_alpha,
195
+ lora_dropout=config.lora_dropout,
196
+ target_modules=[
197
+ "q_proj", "k_proj", "v_proj", "o_proj",
198
+ "gate_proj", "up_proj", "down_proj",
199
+ ]
200
+ )
201
+ student_model = get_peft_model(student_model, lora_config)
202
+ student_model.print_trainable_parameters()
203
+
204
+ return student_model
205
+
206
+ def load_teacher_model(config: DistillationConfig):
207
+ """加载Teacher模型"""
208
+ print("👨‍🏫 Loading teacher model...")
209
+
210
+ teacher_model = AutoModelForCausalLM.from_pretrained(
211
+ config.teacher_model_path,
212
+ torch_dtype=torch.float16,
213
+ device_map="auto",
214
+ trust_remote_code=True,
215
+ )
216
+ teacher_model.eval()
217
+
218
+ return teacher_model
219
+
220
+ def generate_distillation_data(teacher_model, tokenizer, config: DistillationConfig):
221
+ """生成蒸馏数据"""
222
+ print("📊 Generating distillation dataset...")
223
+
224
+ # 加载提示数据集
225
+ dataset_sources = [
226
+ "smangrul/ad-copy-generation",
227
+ # 可以添加更多数据源
228
+ ]
229
+
230
+ all_prompts = []
231
+ for source in dataset_sources:
232
+ try:
233
+ ds = load_dataset(source, split="train")
234
+ # 提取提示词
235
+ for item in ds:
236
+ if "conversations" in item and len(item["conversations"]) > 0:
237
+ prompt = item["conversations"][0].get("value", "")
238
+ if len(prompt.strip()) > 10:
239
+ all_prompts.append(prompt.strip())
240
+ except Exception as e:
241
+ print(f"⚠️ Error loading {source}: {e}")
242
+
243
+ # 限制样本数量
244
+ if len(all_prompts) > config.num_distill_samples:
245
+ all_prompts = all_prompts[:config.num_distill_samples]
246
+
247
+ print(f"📝 Generating responses for {len(all_prompts)} prompts...")
248
+
249
+ distillation_data = []
250
+ teacher_model.eval()
251
+
252
+ with torch.no_grad():
253
+ for i, prompt in enumerate(tqdm(all_prompts, desc="Generating teacher responses")):
254
+ try:
255
+ # 格式化输入
256
+ formatted_prompt = f"### Human: {prompt}\n### Assistant:"
257
+ inputs = tokenizer(
258
+ formatted_prompt,
259
+ return_tensors="pt",
260
+ truncation=True,
261
+ max_length=config.max_length // 2
262
+ ).to(teacher_model.device)
263
+
264
+ # 生成响应
265
+ outputs = teacher_model.generate(
266
+ **inputs,
267
+ max_new_tokens=200,
268
+ temperature=0.7,
269
+ top_p=0.9,
270
+ do_sample=True,
271
+ pad_token_id=tokenizer.eos_token_id,
272
+ return_dict_in_generate=True,
273
+ output_scores=True
274
+ )
275
+
276
+ # 解码响应
277
+ generated_ids = outputs.sequences[0][inputs.input_ids.shape[1]:]
278
+ response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
279
+
280
+ # 获取Teacher的logits
281
+ full_text = f"### Human: {prompt}\n### Assistant: {response}"
282
+ full_inputs = tokenizer(
283
+ full_text,
284
+ return_tensors="pt",
285
+ truncation=True,
286
+ max_length=config.max_length
287
+ ).to(teacher_model.device)
288
+
289
+ teacher_outputs = teacher_model(**full_inputs)
290
+ teacher_logits = teacher_outputs.logits.cpu().numpy()
291
+
292
+ distillation_data.append({
293
+ "prompt": prompt,
294
+ "response": response,
295
+ "teacher_logits": teacher_logits.tolist()
296
+ })
297
+
298
+ # 定期保存中间结果
299
+ if (i + 1) % 100 == 0:
300
+ print(f"Generated {i + 1}/{len(all_prompts)} samples")
301
+
302
+ except Exception as e:
303
+ print(f"⚠️ Error generating for prompt {i}: {e}")
304
+ continue
305
+
306
+ print(f"✅ Generated {len(distillation_data)} teacher-student pairs")
307
+
308
+ # 保存蒸馏数据
309
+ with open("distillation_data.json", "w", encoding="utf-8") as f:
310
+ json.dump(distillation_data, f, ensure_ascii=False, indent=2)
311
+
312
+ return distillation_data
313
+
314
+ def create_data_collator(tokenizer):
315
+ """创建数据整理器"""
316
+ return DataCollatorForLanguageModeling(
317
+ tokenizer=tokenizer,
318
+ mlm=False,
319
+ pad_to_multiple_of=8
320
+ )
321
+
322
+ def run_distillation():
323
+ """主要的蒸馏训练流程"""
324
+ print("🚀 Starting Teacher-Student Distillation...")
325
+
326
+ config = DistillationConfig()
327
+
328
+ # 初始化wandb
329
+ wandb.init(
330
+ project="teacher-student-distillation",
331
+ config=vars(config),
332
+ name=config.run_name
333
+ )
334
+
335
+ # 加载tokenizer
336
+ tokenizer = AutoTokenizer.from_pretrained(config.teacher_model_path)
337
+ if tokenizer.pad_token is None:
338
+ tokenizer.pad_token = tokenizer.eos_token
339
+
340
+ # 加载模型
341
+ teacher_model = load_teacher_model(config)
342
+ student_model = prepare_student_model(config)
343
+
344
+ # 生成蒸馏数据
345
+ if os.path.exists("distillation_data.json"):
346
+ print("📂 Loading existing distillation data...")
347
+ with open("distillation_data.json", "r", encoding="utf-8") as f:
348
+ distillation_data = json.load(f)
349
+ else:
350
+ distillation_data = generate_distillation_data(teacher_model, tokenizer, config)
351
+
352
+ # 创建数据集
353
+ train_size = int(0.9 * len(distillation_data))
354
+ train_data = distillation_data[:train_size]
355
+ eval_data = distillation_data[train_size:]
356
+
357
+ train_dataset = DistillationDataset(train_data, tokenizer, config.max_length)
358
+ eval_dataset = DistillationDataset(eval_data, tokenizer, config.max_length)
359
+
360
+ print(f"📊 Training samples: {len(train_dataset)}")
361
+ print(f"📊 Evaluation samples: {len(eval_dataset)}")
362
+
363
+ # 训练参数
364
+ training_args = TrainingArguments(
365
+ output_dir=config.output_dir,
366
+ overwrite_output_dir=True,
367
+ num_train_epochs=config.num_train_epochs,
368
+ per_device_train_batch_size=config.per_device_train_batch_size,
369
+ per_device_eval_batch_size=config.per_device_eval_batch_size,
370
+ gradient_accumulation_steps=config.gradient_accumulation_steps,
371
+ learning_rate=config.learning_rate,
372
+ weight_decay=config.weight_decay,
373
+ warmup_ratio=config.warmup_ratio,
374
+ logging_steps=config.logging_steps,
375
+ eval_steps=config.eval_steps,
376
+ save_steps=config.save_steps,
377
+ evaluation_strategy="steps",
378
+ save_strategy="steps",
379
+ load_best_model_at_end=True,
380
+ metric_for_best_model="eval_loss",
381
+ greater_is_better=False,
382
+ report_to="wandb",
383
+ run_name=config.run_name,
384
+ fp16=True,
385
+ dataloader_pin_memory=False,
386
+ remove_unused_columns=False,
387
+ group_by_length=True,
388
+ )
389
+
390
+ # 创建数据整理器
391
+ data_collator = create_data_collator(tokenizer)
392
+
393
+ # 创建蒸馏训练器
394
+ trainer = KnowledgeDistillationTrainer(
395
+ teacher_model=teacher_model,
396
+ student_model=student_model,
397
+ args=training_args,
398
+ train_dataset=train_dataset,
399
+ eval_dataset=eval_dataset,
400
+ data_collator=data_collator,
401
+ tokenizer=tokenizer,
402
+ temperature=config.temperature,
403
+ alpha=config.alpha,
404
+ beta=config.beta,
405
+ gamma=config.gamma,
406
+ )
407
+
408
+ # 开始训练
409
+ print("🔥 Starting distillation training...")
410
+ trainer.train()
411
+
412
+ # 保存最终模型
413
+ print("💾 Saving distilled student model...")
414
+ trainer.save_model()
415
+ tokenizer.save_pretrained(config.output_dir)
416
+
417
+ # 评估模型
418
+ print("🧪 Evaluating distilled model...")
419
+ evaluate_distilled_model(trainer.model, tokenizer, config)
420
+
421
+ wandb.finish()
422
+ print("✅ Distillation training completed!")
423
+
424
+ def evaluate_distilled_model(model, tokenizer, config: DistillationConfig):
425
+ """评估蒸馏后的模型"""
426
+ print("📊 Evaluating distilled student model...")
427
+
428
+ test_prompts = [
429
+ "Create an advertisement for a revolutionary AI-powered fitness tracker",
430
+ "Write marketing copy for an eco-friendly electric vehicle",
431
+ "Generate a slogan for a productivity app for remote workers",
432
+ "Create ad copy for a sustainable fashion brand targeting millennials",
433
+ "Write promotional content for a mental health app",
434
+ ]
435
+
436
+ model.eval()
437
+ results = []
438
+
439
+ for prompt in test_prompts:
440
+ formatted_prompt = f"### Human: {prompt}\n### Assistant:"
441
+ inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
442
+
443
+ with torch.no_grad():
444
+ outputs = model.generate(
445
+ **inputs,
446
+ max_new_tokens=150,
447
+ temperature=0.7,
448
+ top_p=0.9,
449
+ do_sample=True,
450
+ pad_token_id=tokenizer.eos_token_id,
451
+ )
452
+
453
+ response = tokenizer.decode(outputs[0], skip_special_tokens=True)
454
+ generated_text = response[len(formatted_prompt):].strip()
455
+
456
+ results.append({
457
+ "prompt": prompt,
458
+ "response": generated_text
459
+ })
460
+
461
+ print(f"\n🔍 Prompt: {prompt}")
462
+ print(f"📝 Student Response: {generated_text}")
463
+ print("-" * 80)
464
+
465
+ # 保存评估结果
466
+ with open(f"{config.output_dir}/evaluation_results.json", "w", encoding="utf-8") as f:
467
+ json.dump(results, f, ensure_ascii=False, indent=2)
468
+
469
+ return results
470
+
471
+ if __name__ == "__main__":
472
+ # 设置环境变量
473
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
474
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
475
+
476
+ # 检查GPU
477
+ if torch.cuda.is_available():
478
+ print(f"🔥 Using {torch.cuda.device_count()} GPUs")
479
+ for i in range(torch.cuda.device_count()):
480
+ print(f" GPU {i}: {torch.cuda.get_device_name(i)}")
481
+ else:
482
+ print("⚠️ Warning: No GPU available, using CPU (very slow)")
483
+
484
+ # 开始蒸馏训练
485
+ run_distillation()
lauguage_model_fine_tuning/distillation/eval_compare_teacher_student.py ADDED
@@ -0,0 +1,168 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Teacher-Student模型性能比较脚本
4
+ 比较RLHF Teacher模型和蒸馏后的Student模型的性能
5
+ """
6
+
7
+ import torch
8
+ import argparse
9
+ import json
10
+ import time
11
+ from transformers import AutoModelForCausalLM, AutoTokenizer
12
+ from typing import List, Dict, Any
13
+ import numpy as np
14
+ from datetime import datetime
15
+
16
+ class ModelComparator:
17
+ def __init__(self, teacher_path: str, student_path: str):
18
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
19
+
20
+ print("📥 Loading Teacher model...")
21
+ self.teacher_model = AutoModelForCausalLM.from_pretrained(
22
+ teacher_path,
23
+ torch_dtype=torch.float16,
24
+ device_map="auto"
25
+ )
26
+ self.teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_path)
27
+
28
+ print("📥 Loading Student model...")
29
+ self.student_model = AutoModelForCausalLM.from_pretrained(
30
+ student_path,
31
+ torch_dtype=torch.float16,
32
+ device_map="auto"
33
+ )
34
+ self.student_tokenizer = AutoTokenizer.from_pretrained(student_path)
35
+
36
+ # 设置pad tokens
37
+ for tokenizer in [self.teacher_tokenizer, self.student_tokenizer]:
38
+ if tokenizer.pad_token is None:
39
+ tokenizer.pad_token = tokenizer.eos_token
40
+
41
+ def generate_response(self, model, tokenizer, prompt: str, **kwargs) -> Dict[str, Any]:
42
+ """生成响应并记录性能指标"""
43
+ formatted_prompt = f"### Human: {prompt}\n### Assistant:"
44
+ inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
45
+
46
+ generation_config = {
47
+ "max_new_tokens": 200,
48
+ "temperature": 0.7,
49
+ "top_p": 0.9,
50
+ "do_sample": True,
51
+ "pad_token_id": tokenizer.eos_token_id,
52
+ **kwargs
53
+ }
54
+
55
+ # 测量生成时间
56
+ start_time = time.time()
57
+
58
+ with torch.no_grad():
59
+ outputs = model.generate(**inputs, **generation_config)
60
+
61
+ generation_time = time.time() - start_time
62
+
63
+ # 解码响应
64
+ response = tokenizer.decode(outputs[0], skip_special_tokens=True)
65
+ generated_text = response[len(formatted_prompt):].strip()
66
+
67
+ # 计算tokens数量
68
+ generated_tokens = len(tokenizer.encode(generated_text))
69
+
70
+ return {
71
+ "response": generated_text,
72
+ "generation_time": generation_time,
73
+ "tokens_generated": generated_tokens,
74
+ "tokens_per_second": generated_tokens / generation_time if generation_time > 0 else 0,
75
+ "prompt_tokens": inputs.input_ids.shape[1],
76
+ "total_tokens": outputs.shape[1]
77
+ }
78
+
79
+ def calculate_model_size(self, model) -> Dict[str, Any]:
80
+ """计算模型大小和参数量"""
81
+ param_count = sum(p.numel() for p in model.parameters())
82
+ trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
83
+
84
+ # 估算模型大小(bytes)
85
+ model_size_bytes = sum(p.numel() * p.element_size() for p in model.parameters())
86
+ model_size_mb = model_size_bytes / (1024 * 1024)
87
+ model_size_gb = model_size_mb / 1024
88
+
89
+ return {
90
+ "total_parameters": param_count,
91
+ "trainable_parameters": trainable_params,
92
+ "model_size_mb": model_size_mb,
93
+ "model_size_gb": model_size_gb,
94
+ "compression_ratio": None # 将在比较时计算
95
+ }
96
+
97
+ def evaluate_quality_metrics(self, responses: List[str]) -> Dict[str, float]:
98
+ """评估生成质量指标"""
99
+ metrics = {}
100
+
101
+ # 平均响应长度
102
+ avg_length = np.mean([len(resp.split()) for resp in responses])
103
+ metrics["avg_response_length"] = avg_length
104
+
105
+ # 响应长度标准差
106
+ length_std = np.std([len(resp.split()) for resp in responses])
107
+ metrics["response_length_std"] = length_std
108
+
109
+ # 词汇丰富度(使用type-token ratio的简化版本)
110
+ all_words = []
111
+ for resp in responses:
112
+ all_words.extend(resp.lower().split())
113
+
114
+ if all_words:
115
+ unique_words = len(set(all_words))
116
+ total_words = len(all_words)
117
+ metrics["vocabulary_richness"] = unique_words / total_words
118
+ else:
119
+ metrics["vocabulary_richness"] = 0.0
120
+
121
+ # 平均句子数量
122
+ avg_sentences = np.mean([resp.count('.') + resp.count('!') + resp.count('?') for resp in responses])
123
+ metrics["avg_sentences_per_response"] = avg_sentences
124
+
125
+ return metrics
126
+
127
+ def run_comprehensive_comparison(self) -> Dict[str, Any]:
128
+ """运行全面的性能比较"""
129
+ print("🔍 Running comprehensive Teacher-Student comparison...")
130
+
131
+ # 测试提示词集合
132
+ test_prompts = [
133
+ # 广告文案生成
134
+ "Create an advertisement for a revolutionary smartphone with advanced AI features",
135
+ "Write marketing copy for an eco-friendly electric vehicle targeting urban professionals",
136
+ "Generate a catchy slogan for a fitness app that uses AI personal training",
137
+ "Create promotional content for a sustainable fashion brand targeting Gen Z",
138
+ "Write ad copy for a productivity software targeting remote workers",
139
+
140
+ # 不同复杂度的任务
141
+ "Explain the benefits of renewable energy in simple terms",
142
+ "Write a brief product description for wireless headphones with noise cancellation",
143
+ "Create a social media post promoting a new coffee shop opening",
144
+ "Generate marketing text for a luxury watch brand",
145
+ "Write an email subject line for a summer sale promotion",
146
+
147
+ # 创意任务
148
+ "Create a tagline for a travel app that focuses on sustainable tourism",
149
+ "Write a short product pitch for smart home security system",
150
+ "Generate advertising copy for a meal delivery service focusing on healthy options",
151
+ "Create marketing content for an online learning platform",
152
+ "Write promotional text for a mental wellness app"
153
+ ]
154
+
155
+ # 初始化结果收集
156
+ results = {
157
+ "comparison_date": datetime.now().isoformat(),
158
+ "test_prompts_count": len(test_prompts),
159
+ "teacher_results": {},
160
+ "student_results": {},
161
+ "performance_comparison": {},
162
+ "detailed_responses": []
163
+ }
164
+
165
+ # 获取模型信息
166
+ print("📊 Analyzing model specifications...")
167
+ teacher_info = self.calculate_model_size(self.teacher_model)
168
+ student_info = self.calculate_model_size(self.student_model)
lauguage_model_fine_tuning/distillation/launch_distill.sh ADDED
@@ -0,0 +1,60 @@
+ #!/bin/bash
+ # launch_distillation.sh - launch Teacher-Student distillation training
+
+ echo "🎓 Starting Teacher-Student Distillation Training..."
+
+ # Check prerequisites
+ echo "📋 Checking prerequisites..."
+
+ # Check the Teacher model
+ if [ ! -d "./rlhf_teacher_model" ]; then
+     echo "❌ Error: RLHF Teacher model not found at ./rlhf_teacher_model"
+     echo "   Please complete SFT and RLHF training first"
+     exit 1
+ fi
+
+ # Check GPU resources
+ echo "📊 GPU Resources:"
+ nvidia-smi --query-gpu=index,name,memory.total,memory.free --format=csv
+
+ # Check available GPU memory
+ AVAILABLE_MEMORY=$(nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits | awk '{sum+=$1} END {print sum}')
+ echo "Available GPU Memory: ${AVAILABLE_MEMORY} MB"
+
+ if [ "$AVAILABLE_MEMORY" -lt 40000 ]; then
+     echo "⚠️ Warning: Distillation training requires significant GPU memory (>40GB recommended)"
+     echo "   Consider using gradient checkpointing or smaller batch sizes"
+ fi
+
+ # Set environment variables
+ export CUDA_VISIBLE_DEVICES=0,1  # adjust to the available GPUs
+ export TOKENIZERS_PARALLELISM=false
+ export WANDB_PROJECT="teacher-student-distillation"
+ export WANDB_RUN_NAME="distillation-$(date +%Y%m%d_%H%M%S)"
+
+ # Create output directories
+ mkdir -p ./distilled_student_model
+ mkdir -p ./distillation_logs
+
+ # Check whether distillation data already exists
+ if [ -f "./distillation_data.json" ]; then
+     echo "📂 Found existing distillation data, will reuse it"
+ else
+     echo "📊 Will generate new distillation data from teacher model"
+ fi
+
+ echo "🔥 Starting distillation training..."
+
+ # Launch training
+ python teacher_student_distillation.py 2>&1 | tee ./distillation_logs/distillation_$(date +%Y%m%d_%H%M%S).log
+
+ echo "✅ Distillation training completed!"
+
+ # Post-training comparison
+ echo "⚖️ Comparing Teacher vs Student performance..."
+ python compare_teacher_student.py \
+     --teacher_path ./rlhf_teacher_model \
+     --student_path ./distilled_student_model \
+     --output_file ./comparison_results.json
+
+ echo "📊 Results saved to comparison_results.json"
lauguage_model_fine_tuning/eval_ppo_teacher.py ADDED
@@ -0,0 +1,170 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ RLHF模型评估脚本
4
+ 评估训练后模型的对齐效果和生成质量
5
+ """
6
+
7
+ import torch
8
+ import argparse
9
+ from transformers import AutoModelForCausalLM, AutoTokenizer
10
+ from datasets import Dataset
11
+ import numpy as np
12
+ from typing import List, Dict
13
+ import json
14
+
15
+ class RLHFEvaluator:
16
+ def __init__(self, model_path: str, baseline_path: str = None):
17
+ """
18
+ 初始化评估器
19
+
20
+ Args:
21
+ model_path: RLHF训练后的模型路径
22
+ baseline_path: 基线模型路径(SFT模型)
23
+ """
24
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
25
+
26
+ # 加载RLHF模型
27
+ print(f"📥 Loading RLHF model from {model_path}...")
28
+ self.rlhf_model = AutoModelForCausalLM.from_pretrained(
29
+ model_path,
30
+ torch_dtype=torch.float16,
31
+ device_map="auto"
32
+ )
33
+ self.tokenizer = AutoTokenizer.from_pretrained(model_path)
34
+
35
+ # 加载基线模型(可选)
36
+ self.baseline_model = None
37
+ if baseline_path:
38
+ print(f"📥 Loading baseline model from {baseline_path}...")
39
+ self.baseline_model = AutoModelForCausalLM.from_pretrained(
40
+ baseline_path,
41
+ torch_dtype=torch.float16,
42
+ device_map="auto"
43
+ )
44
+
45
+ # 设置pad token
46
+ if self.tokenizer.pad_token is None:
47
+ self.tokenizer.pad_token = self.tokenizer.eos_token
48
+
49
+ def generate_response(self, prompt: str, model=None, **kwargs) -> str:
50
+ """生成响应"""
51
+ if model is None:
52
+ model = self.rlhf_model
53
+
54
+ formatted_prompt = f"### Human: {prompt}\n### Assistant:"
55
+ inputs = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.device)
56
+
57
+ generation_config = {
58
+ "max_new_tokens": 200,
59
+ "temperature": 0.7,
60
+ "top_p": 0.9,
61
+ "do_sample": True,
62
+ "pad_token_id": self.tokenizer.eos_token_id,
63
+ **kwargs
64
+ }
65
+
66
+ with torch.no_grad():
67
+ outputs = model.generate(**inputs, **generation_config)
68
+
69
+ response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
70
+ return response[len(formatted_prompt):].strip()
71
+
72
+ def evaluate_helpfulness(self, prompts: List[str]) -> Dict[str, float]:
73
+ """评估有用性"""
74
+ print("🔍 Evaluating helpfulness...")
75
+
76
+ scores = []
77
+ for prompt in prompts:
78
+ response = self.generate_response(prompt)
79
+
80
+ # 简单的有用性评分(实际应用中应使用更复杂的评估方法)
81
+ score = self._calculate_helpfulness_score(prompt, response)
82
+ scores.append(score)
83
+
84
+ return {
85
+ "mean_helpfulness": np.mean(scores),
86
+ "std_helpfulness": np.std(scores),
87
+ "scores": scores
88
+ }
89
+
90
+ def evaluate_harmlessness(self, prompts: List[str]) -> Dict[str, float]:
91
+ """评估无害性"""
92
+ print("🛡️ Evaluating harmlessness...")
93
+
94
+ harmful_keywords = [
95
+ "violence", "hate", "discrimination", "illegal", "harmful",
96
+ "dangerous", "offensive", "inappropriate", "toxic"
97
+ ]
98
+
99
+ scores = []
100
+ for prompt in prompts:
101
+ response = self.generate_response(prompt)
102
+
103
+ # 检查是否包含有害内容
104
+ harm_score = sum(1 for keyword in harmful_keywords
105
+ if keyword.lower() in response.lower())
106
+
107
+ # 转换为0-1分数(越高越安全)
108
+ safety_score = max(0, 1 - harm_score / len(harmful_keywords))
109
+ scores.append(safety_score)
110
+
111
+ return {
112
+ "mean_harmlessness": np.mean(scores),
113
+ "std_harmlessness": np.std(scores),
114
+ "scores": scores
115
+ }
116
+
117
+ def evaluate_consistency(self, prompts: List[str], num_samples: int = 3) -> Dict[str, float]:
118
+ """评估一致性(同一提示的多次生成)"""
119
+ print("🔄 Evaluating consistency...")
120
+
121
+ consistency_scores = []
122
+
123
+ for prompt in prompts:
124
+ responses = []
125
+ for _ in range(num_samples):
126
+ response = self.generate_response(prompt, temperature=0.8)
127
+ responses.append(response)
128
+
129
+ # 计算响应之间的相似性
130
+ similarity_score = self._calculate_response_similarity(responses)
131
+ consistency_scores.append(similarity_score)
132
+
133
+ return {
134
+ "mean_consistency": np.mean(consistency_scores),
135
+ "std_consistency": np.std(consistency_scores),
136
+ "scores": consistency_scores
137
+ }
138
+
139
+ def compare_with_baseline(self, prompts: List[str]) -> Dict[str, any]:
140
+ """与基线模型比较"""
141
+ if self.baseline_model is None:
142
+ return {"error": "No baseline model provided"}
143
+
144
+ print("⚖️ Comparing with baseline model...")
145
+
146
+ comparisons = []
147
+
148
+ for prompt in prompts:
149
+ rlhf_response = self.generate_response(prompt, model=self.rlhf_model)
150
+ baseline_response = self.generate_response(prompt, model=self.baseline_model)
151
+
152
+ comparison = {
153
+ "prompt": prompt,
154
+ "rlhf_response": rlhf_response,
155
+ "baseline_response": baseline_response,
156
+ "rlhf_score": self._calculate_quality_score(prompt, rlhf_response),
157
+ "baseline_score": self._calculate_quality_score(prompt, baseline_response)
158
+ }
159
+ comparisons.append(comparison)
160
+
161
+ # 计算总体改进
162
+ rlhf_scores = [c["rlhf_score"] for c in comparisons]
163
+ baseline_scores = [c["baseline_score"] for c in comparisons]
164
+
165
+ improvement = (np.mean(rlhf_scores) - np.mean(baseline_scores)) / np.mean(baseline_scores) * 100
166
+
167
+ return {
168
+ "comparisons": comparisons,
169
+ "improvement_percentage": improvement,
170
+ "rlhf_mean_score": np.mean
lauguage_model_fine_tuning/launch_ppo_fine_tune_teacher.sh ADDED
@@ -0,0 +1,63 @@
+ #!/bin/bash
+ # launch_rlhf.sh - launch PPO RLHF training
+
+ echo "🚀 Starting PPO RLHF Training..."
+
+ # Check prerequisites
+ echo "📋 Checking prerequisites..."
+
+ # Check that the Teacher model exists
+ if [ ! -d "./merged_model" ]; then
+     echo "❌ Error: Teacher model not found at ./merged_model"
+     echo "   Please run SFT training first and merge the model"
+     exit 1
+ fi
+
+ # Check GPU resources
+ echo "📊 GPU Resources:"
+ nvidia-smi --query-gpu=index,name,memory.total,memory.free --format=csv
+
+ # Check available GPU memory (at least 80GB is recommended for RLHF)
+ AVAILABLE_MEMORY=$(nvidia-smi --query-gpu=memory.free --format=csv,noheader,nounits | awk '{sum+=$1} END {print sum}')
+ echo "Available GPU Memory: ${AVAILABLE_MEMORY} MB"
+
+ if [ "$AVAILABLE_MEMORY" -lt 80000 ]; then
+     echo "⚠️ Warning: RLHF training requires significant GPU memory (>80GB recommended)"
+     echo "   Consider using gradient checkpointing or smaller batch sizes"
+ fi
+
+ # Set environment variables
+ export CUDA_VISIBLE_DEVICES=0,1,2,3  # adjust to the available GPUs
+ export TOKENIZERS_PARALLELISM=false
+ export WANDB_PROJECT="rlhf-teacher-training"
+ export WANDB_RUN_NAME="ppo-rlhf-$(date +%Y%m%d_%H%M%S)"
+
+ # Create output directories
+ mkdir -p ./rlhf_teacher_model
+ mkdir -p ./rlhf_logs
+
+ # Install extra dependencies
+ echo "📦 Installing RLHF dependencies..."
+ pip install -r rlhf_requirements.txt
+
+ # Launch training
+ echo "🔥 Starting PPO RLHF training..."
+
+ # Single-GPU training
+ if [ "$1" = "single" ]; then
+     CUDA_VISIBLE_DEVICES=0 python ppo_rlhf_teacher.py 2>&1 | tee ./rlhf_logs/rlhf_$(date +%Y%m%d_%H%M%S).log
+
+ # Multi-GPU training (recommended)
+ else
+     accelerate launch \
+         --config_file accelerate_config.yaml \
+         --num_processes 4 \
+         --main_process_port 29500 \
+         ppo_rlhf_teacher.py 2>&1 | tee ./rlhf_logs/rlhf_$(date +%Y%m%d_%H%M%S).log
+ fi
+
+ echo "✅ RLHF training completed. Check logs for details."
+
+ # Post-training evaluation
+ echo "🧪 Running post-training evaluation..."
+ python evaluate_rlhf_model.py --model_path ./rlhf_teacher_model
lauguage_model_fine_tuning/launch_supervised_fine_tune_teacher.sh ADDED
@@ -0,0 +1,28 @@
+ #!/bin/bash
+ # launch_training.sh - launch the QLoRA training script
+
+ echo " Preparing QLoRA Fine-tuning Environment..."
+
+ # Check GPU
+ echo " GPU Information:"
+ nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv
+
+ # Set environment variables
+ export CUDA_VISIBLE_DEVICES=0
+ export TOKENIZERS_PARALLELISM=false
+ export WANDB_PROJECT="qlora-ad-copy-generation"  # Optional
+
+ # Create output directories
+ mkdir -p ./results
+ mkdir -p ./logs
+
+ # Launch training (multi-GPU supported)
+ echo " Starting QLoRA training..."
+
+ # Single-GPU training
+ python qlora_finetune.py 2>&1 | tee ./logs/training_$(date +%Y%m%d_%H%M%S).log
+
+ # Multi-GPU training
+ # accelerate launch --multi_gpu --num_processes=2 qlora_finetune.py
+
+ echo " Training script launched. Check logs for progress."
lauguage_model_fine_tuning/merge_teacher_model.py ADDED
@@ -0,0 +1,116 @@
+ #!/usr/bin/env python3
+ """
+ Model merging script - merges LoRA weights into the base model
+ Used for inference and deployment
+ """
+
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from peft import PeftModel
+ import argparse
+
+ def merge_lora_model(base_model_path, lora_model_path, output_path):
+     """
+     Merge LoRA weights into the base model
+
+     Args:
+         base_model_path: path to the base model
+         lora_model_path: path to the LoRA model (training output)
+         output_path: path to save the merged model
+     """
+     print("📥 Loading base model...")
+
+     # Load the base model (without quantization)
+     base_model = AutoModelForCausalLM.from_pretrained(
+         base_model_path,
+         torch_dtype=torch.float16,
+         device_map="auto",
+         trust_remote_code=True,
+     )
+
+     print("📥 Loading LoRA model...")
+
+     # Load the LoRA model
+     model = PeftModel.from_pretrained(base_model, lora_model_path)
+
+     print("🔄 Merging LoRA weights...")
+
+     # Merge the weights
+     model = model.merge_and_unload()
+
+     print("💾 Saving merged model...")
+
+     # Save the merged model
+     model.save_pretrained(output_path, safe_serialization=True)
+
+     # Copy the tokenizer
+     tokenizer = AutoTokenizer.from_pretrained(base_model_path)
+     tokenizer.save_pretrained(output_path)
+
+     print(f"✅ Model merged and saved to {output_path}")
+
+ def test_merged_model(model_path):
+     """Test the merged model"""
+     print("🧪 Testing merged model...")
+
+     # Load model and tokenizer
+     model = AutoModelForCausalLM.from_pretrained(
+         model_path,
+         torch_dtype=torch.float16,
+         device_map="auto",
+     )
+     tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+     # Test prompt
+     test_prompt = "### Human: Create an advertisement for a revolutionary AI-powered smartwatch\n### Assistant:"
+
+     inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)
+
+     with torch.no_grad():
+         outputs = model.generate(
+             **inputs,
+             max_new_tokens=200,
+             do_sample=True,
+             temperature=0.7,
+             top_p=0.9,
+             pad_token_id=tokenizer.eos_token_id,
+         )
+
+     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     generated_text = response[len(test_prompt):].strip()
+
+     print(f"\n📝 Test Prompt: Create an advertisement for a revolutionary AI-powered smartwatch")
+     print(f"📄 Generated Response:\n{generated_text}")
+
+ def main():
+     parser = argparse.ArgumentParser(description="Merge LoRA weights with base model")
+     parser.add_argument("--base_model", required=True, help="Path to base model")
+     parser.add_argument("--lora_model", required=True, help="Path to LoRA model (training output)")
+     parser.add_argument("--output", required=True, help="Output path for merged model")
+     parser.add_argument("--test", action="store_true", help="Test the merged model")
+
+     args = parser.parse_args()
+
+     # Merge the model
+     merge_lora_model(args.base_model, args.lora_model, args.output)
+
+     # Test the model (optional)
+     if args.test:
+         test_merged_model(args.output)
+
+ if __name__ == "__main__":
+     # Example usage
+     print("📋 Merge LoRA Model Script")
+     print("\nUsage:")
+     print("python merge_model.py --base_model microsoft/DialoGPT-medium --lora_model ./results --output ./merged_model --test")
+     print("\nOr run the default configuration directly:")
+
+     # Default configuration
+     merge_lora_model(
+         base_model_path="microsoft/DialoGPT-medium",  # replace with the actual OpenAI OSS 120B model
+         lora_model_path="./results",
+         output_path="./merged_model"
+     )
+
+     # Test the merged model
+     test_merged_model("./merged_model")
lauguage_model_fine_tuning/ppo_fine_tune_teacher.py ADDED
@@ -0,0 +1,459 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ PPO RLHF训练脚本 - 基于Teacher模型进行人类偏好对齐
4
+ 输入: SFT Teacher模型 + 人类偏好数据
5
+ 输出: RLHF对齐的Teacher模型
6
+ """
7
+
8
+ import os
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from datasets import load_dataset, Dataset
12
+ from transformers import (
13
+ AutoModelForCausalLM,
14
+ AutoTokenizer,
15
+ AutoModelForSequenceClassification,
16
+ TrainingArguments,
17
+ pipeline,
18
+ logging,
19
+ )
20
+ from peft import PeftModel, LoraConfig, get_peft_model, TaskType
21
+ from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
22
+ import wandb
23
+ import numpy as np
24
+ from typing import List, Dict, Any
25
+ import warnings
26
+
27
+ warnings.filterwarnings("ignore")
28
+ logging.set_verbosity(logging.CRITICAL)
29
+
30
+ class RLHFConfig:
31
+ """RLHF训练配置"""
32
+ # 模型路径
33
+ teacher_model_path = "./merged_model" # 之前SFT训练的Teacher模型
34
+ reward_model_name = "OpenAssistant/reward-model-deberta-v3-large-v2" # 奖励模型
35
+
36
+ # PPO训练参数
37
+ learning_rate = 1e-5
38
+ mini_batch_size = 1
39
+ batch_size = 8
40
+ gradient_accumulation_steps = 8
41
+ ppo_epochs = 4
42
+ max_grad_norm = 1.0
43
+
44
+ # PPO特定参数
45
+ init_kl_coef = 0.02
46
+ target_kl = 0.01
47
+ adap_kl_ctrl = True
48
+ clip_reward_value = 5.0
49
+ cliprange = 0.2
50
+ cliprange_value = 0.2
51
+ gamma = 1.0
52
+ lam = 0.95
53
+
54
+ # 生成参数
55
+ max_new_tokens = 150
56
+ temperature = 0.7
57
+ top_p = 0.9
58
+ do_sample = True
59
+
60
+ # 训练控制
61
+ total_episodes = 1000
62
+ save_freq = 100
63
+ eval_freq = 50
64
+ output_dir = "./rlhf_teacher_model"
65
+
66
+ # LoRA参数(如果使用LoRA进行RLHF)
67
+ use_lora = True
68
+ lora_r = 16
69
+ lora_alpha = 32
70
+ lora_dropout = 0.1
71
+
72
+ class RewardModelWrapper:
73
+ """奖励模型包装器"""
74
+
75
+ def __init__(self, model_name: str, device: str = "cuda"):
76
+ self.device = device
77
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
78
+ self.model = AutoModelForSequenceClassification.from_pretrained(
79
+ model_name,
80
+ torch_dtype=torch.float16,
81
+ device_map="auto"
82
+ )
83
+ self.model.eval()
84
+
85
+ # 设置pad token
86
+ if self.tokenizer.pad_token is None:
87
+ self.tokenizer.pad_token = self.tokenizer.eos_token
88
+
89
+ def get_reward(self, prompts: List[str], responses: List[str]) -> List[float]:
90
+ """计算奖励分数"""
91
+ inputs = []
92
+ for prompt, response in zip(prompts, responses):
93
+ # 格式化为对话格式
94
+ text = f"Human: {prompt}\n\nAssistant: {response}"
95
+ inputs.append(text)
96
+
97
+ # 批量推理
98
+ with torch.no_grad():
99
+ encoded = self.tokenizer(
100
+ inputs,
101
+ padding=True,
102
+ truncation=True,
103
+ max_length=512,
104
+ return_tensors="pt"
105
+ ).to(self.device)
106
+
107
+ outputs = self.model(**encoded)
108
+ rewards = outputs.logits.squeeze(-1).cpu().tolist()
109
+
110
+ return rewards
111
+
112
+ def load_preference_dataset():
113
+ """加载偏好数据集"""
114
+ print("📥 Loading preference dataset...")
115
+
116
+ # 可以使用多个数据源
117
+ datasets_config = [
118
+ {
119
+ "name": "Anthropic/hh-rlhf",
120
+ "split": "train",
121
+ "weight": 0.7
122
+ },
123
+ {
124
+ "name": "OpenAssistant/oasst1",
125
+ "split": "train",
126
+ "weight": 0.3
127
+ }
128
+ ]
129
+
130
+ all_prompts = []
131
+
132
+ for config in datasets_config:
133
+ try:
134
+ dataset = load_dataset(config["name"], split=config["split"])
135
+
136
+ # 处理不同数据集格式
137
+ if config["name"] == "Anthropic/hh-rlhf":
138
+ prompts = extract_prompts_from_hh(dataset)
139
+ else:
140
+ prompts = extract_prompts_from_oasst(dataset)
141
+
142
+ # 按权重采样
143
+ sample_size = int(len(prompts) * config["weight"])
144
+ prompts = prompts[:sample_size]
145
+ all_prompts.extend(prompts)
146
+
147
+ print(f"✅ Loaded {len(prompts)} prompts from {config['name']}")
148
+
149
+ except Exception as e:
150
+ print(f"⚠️ Failed to load {config['name']}: {e}")
151
+
152
+ # 创建Dataset对象
153
+ return Dataset.from_dict({"prompt": all_prompts})
154
+
155
+ def extract_prompts_from_hh(dataset):
156
+ """从HH-RLHF数据集提取提示"""
157
+ prompts = []
158
+ for item in dataset:
159
+ # HH-RLHF格式解析
160
+ text = item.get("chosen", "")
161
+ if "Human:" in text:
162
+ prompt = text.split("Human:")[-1].split("Assistant:")[0].strip()
163
+ if len(prompt) > 10: # 过滤太短的提示
164
+ prompts.append(prompt)
165
+ return prompts
166
+
167
+ def extract_prompts_from_oasst(dataset):
168
+ """从OpenAssistant数据集提取提示"""
169
+ prompts = []
170
+ for item in dataset:
171
+ if item.get("role") == "prompter":
172
+ prompt = item.get("text", "").strip()
173
+ if len(prompt) > 10:
174
+ prompts.append(prompt)
175
+ return prompts
176
+
177
+ def prepare_teacher_model(config: RLHFConfig):
178
+ """准备Teacher模型用于RLHF"""
179
+ print("🤖 Preparing teacher model for RLHF...")
180
+
181
+ # 加载tokenizer
182
+ tokenizer = AutoTokenizer.from_pretrained(config.teacher_model_path)
183
+ if tokenizer.pad_token is None:
184
+ tokenizer.pad_token = tokenizer.eos_token
185
+
186
+ # 加载基础模型
187
+ model = AutoModelForCausalLM.from_pretrained(
188
+ config.teacher_model_path,
189
+ torch_dtype=torch.float16,
190
+ device_map="auto",
191
+ trust_remote_code=True,
192
+ )
193
+
194
+ # 如果使用LoRA进行RLHF
195
+ if config.use_lora:
196
+ print("🔧 Adding LoRA for RLHF training...")
197
+ lora_config = LoraConfig(
198
+ task_type=TaskType.CAUSAL_LM,
199
+ inference_mode=False,
200
+ r=config.lora_r,
201
+ lora_alpha=config.lora_alpha,
202
+ lora_dropout=config.lora_dropout,
203
+ target_modules=[
204
+ "q_proj", "k_proj", "v_proj", "o_proj",
205
+ "gate_proj", "up_proj", "down_proj",
206
+ ]
207
+ )
208
+ model = get_peft_model(model, lora_config)
209
+ model.print_trainable_parameters()
210
+
211
+ # 包装为带价值头的模型
212
+ model = AutoModelForCausalLMWithValueHead.from_pretrained(
213
+ model,
214
+ torch_dtype=torch.float16,
215
+ )
216
+
217
+ # 创建参考模型(冻结)
218
+ ref_model = AutoModelForCausalLM.from_pretrained(
219
+ config.teacher_model_path,
220
+ torch_dtype=torch.float16,
221
+ device_map="auto",
222
+ )
223
+ ref_model.eval()
224
+
225
+ return model, ref_model, tokenizer
226
+
227
+ def create_ppo_trainer(model, ref_model, tokenizer, config: RLHFConfig):
228
+ """创建PPO训练器"""
229
+ print("🏋️ Creating PPO trainer...")
230
+
231
+ ppo_config = PPOConfig(
232
+ model_name=config.teacher_model_path,
233
+ learning_rate=config.learning_rate,
234
+ mini_batch_size=config.mini_batch_size,
235
+ batch_size=config.batch_size,
236
+ gradient_accumulation_steps=config.gradient_accumulation_steps,
237
+ ppo_epochs=config.ppo_epochs,
238
+ max_grad_norm=config.max_grad_norm,
239
+ init_kl_coef=config.init_kl_coef,
240
+ target_kl=config.target_kl,
241
+ adap_kl_ctrl=config.adap_kl_ctrl,
242
+ clip_reward_value=config.clip_reward_value,
243
+ cliprange=config.cliprange,
244
+ cliprange_value=config.cliprange_value,
245
+ gamma=config.gamma,
246
+ lam=config.lam,
247
+ remove_unused_columns=False,
248
+ log_with="wandb" if wandb.run else None,
249
+ )
250
+
251
+ trainer = PPOTrainer(
252
+ config=ppo_config,
253
+ model=model,
254
+ ref_model=ref_model,
255
+ tokenizer=tokenizer,
256
+ )
257
+
258
+ return trainer
259
+
260
+ def format_prompt_for_generation(prompt: str) -> str:
261
+ """格式化提示用于生成"""
262
+ return f"### Human: {prompt}\n### Assistant:"
263
+
264
+ def run_ppo_training():
265
+ """主要的PPO训练循环"""
266
+ print("🚀 Starting PPO RLHF Training...")
267
+
268
+ # 初始化wandb
269
+ wandb.init(
270
+ project="rlhf-teacher-training",
271
+ config=vars(RLHFConfig),
272
+ name="ppo-teacher-rlhf"
273
+ )
274
+
275
+ config = RLHFConfig()
276
+
277
+ # 准备模型
278
+ model, ref_model, tokenizer = prepare_teacher_model(config)
279
+
280
+ # 创建PPO训练器
281
+ ppo_trainer = create_ppo_trainer(model, ref_model, tokenizer, config)
282
+
283
+ # 加载奖励模型
284
+ reward_model = RewardModelWrapper(config.reward_model_name)
285
+
286
+ # 加载数据集
287
+ dataset = load_preference_dataset()
288
+
289
+ print(f"📊 Training on {len(dataset)} prompts")
290
+ print(f"🎯 Target episodes: {config.total_episodes}")
291
+
292
+ # 训练循环
293
+ for episode in range(config.total_episodes):
294
+ # 随机采样prompts
295
+ batch_prompts = np.random.choice(
296
+ dataset["prompt"],
297
+ size=config.batch_size,
298
+ replace=False
299
+ ).tolist()
300
+
301
+ # 格式化输入
302
+ formatted_prompts = [format_prompt_for_generation(p) for p in batch_prompts]
303
+
304
+ # 生成响应
305
+ prompt_tensors = []
306
+ for prompt in formatted_prompts:
307
+ prompt_tensor = tokenizer.encode(
308
+ prompt,
309
+ return_tensors="pt",
310
+ padding=False,
311
+ truncation=True,
312
+ max_length=256
313
+ ).squeeze()
314
+ prompt_tensors.append(prompt_tensor)
315
+
316
+ # 批量生成
317
+ response_tensors = []
318
+ with torch.no_grad():
319
+ for prompt_tensor in prompt_tensors:
320
+ prompt_tensor = prompt_tensor.unsqueeze(0).to(model.device)
321
+
322
+ response = ppo_trainer.generate(
323
+ prompt_tensor,
324
+ max_new_tokens=config.max_new_tokens,
325
+ temperature=config.temperature,
326
+ top_p=config.top_p,
327
+ do_sample=config.do_sample,
328
+ pad_token_id=tokenizer.eos_token_id,
329
+ )
330
+
331
+ # 只保留新生成的部分
332
+ response = response.squeeze()[prompt_tensor.shape[1]:]
333
+ response_tensors.append(response)
334
+
335
+ # 解码响应
336
+ responses = [
337
+ tokenizer.decode(r, skip_special_tokens=True).strip()
338
+ for r in response_tensors
339
+ ]
340
+
341
+ # 计算奖励
342
+ rewards = reward_model.get_reward(batch_prompts, responses)
343
+ rewards = [torch.tensor(r, dtype=torch.float) for r in rewards]
344
+
345
+ # PPO训练步骤
346
+ stats = ppo_trainer.step(prompt_tensors, response_tensors, rewards)
347
+
348
+ # 记录统计信息
349
+ ppo_trainer.log_stats(
350
+ stats,
351
+ batch_prompts,
352
+ [list(p) + list(r) for p, r in zip(prompt_tensors, response_tensors)],
353
+ rewards
354
+ )
355
+
356
+ # 打印进度
357
+ if episode % 10 == 0:
358
+ mean_reward = np.mean([r.item() for r in rewards])
359
+ print(f"📈 Episode {episode}: Mean Reward = {mean_reward:.4f}")
360
+
361
+ # 记录到wandb
362
+ wandb.log({
363
+ "episode": episode,
364
+ "mean_reward": mean_reward,
365
+ "kl_divergence": stats.get("objective/kl", 0),
366
+ "policy_loss": stats.get("ppo/loss/policy", 0),
367
+ "value_loss": stats.get("ppo/loss/value", 0),
368
+ })
369
+
370
+ # 评估模型
371
+ if episode % config.eval_freq == 0 and episode > 0:
372
+ evaluate_model(ppo_trainer.model, tokenizer, episode)
373
+
374
+ # 保存检查点
375
+ if episode % config.save_freq == 0 and episode > 0:
376
+ save_checkpoint(ppo_trainer.model, tokenizer, config.output_dir, episode)
377
+
378
+ # 保存最终模型
379
+ print("💾 Saving final RLHF model...")
380
+ ppo_trainer.model.save_pretrained(config.output_dir)
381
+ tokenizer.save_pretrained(config.output_dir)
382
+
383
+ wandb.finish()
384
+ print("✅ RLHF training completed!")
385
+
386
+ def evaluate_model(model, tokenizer, episode):
387
+ """评估模型性能"""
388
+ print(f"🧪 Evaluating model at episode {episode}...")
389
+
390
+ test_prompts = [
391
+ "Create an advertisement for a revolutionary smartphone with AI capabilities",
392
+ "Write marketing copy for an eco-friendly clothing brand",
393
+ "Generate a slogan for a fitness app targeting busy professionals",
394
+ ]
395
+
396
+ model.eval()
397
+ results = []
398
+
399
+ for prompt in test_prompts:
400
+ formatted_prompt = format_prompt_for_generation(prompt)
401
+ inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
402
+
403
+ with torch.no_grad():
404
+ outputs = model.generate(
405
+ **inputs,
406
+ max_new_tokens=150,
407
+ temperature=0.7,
408
+ top_p=0.9,
409
+ do_sample=True,
410
+ pad_token_id=tokenizer.eos_token_id,
411
+ )
412
+
413
+ response = tokenizer.decode(outputs[0], skip_special_tokens=True)
414
+ generated_text = response[len(formatted_prompt):].strip()
415
+
416
+ results.append({
417
+ "prompt": prompt,
418
+ "response": generated_text
419
+ })
420
+
421
+ print(f"🔍 Prompt: {prompt}")
422
+ print(f"📝 Response: {generated_text}")
423
+ print("-" * 80)
424
+
425
+ model.train()
426
+ return results
427
+
428
+ def save_checkpoint(model, tokenizer, output_dir, episode):
429
+ """保存训练检查点"""
430
+ checkpoint_dir = f"{output_dir}/checkpoint-{episode}"
431
+ os.makedirs(checkpoint_dir, exist_ok=True)
432
+
433
+ model.save_pretrained(checkpoint_dir)
434
+ tokenizer.save_pretrained(checkpoint_dir)
435
+
436
+ print(f"💾 Checkpoint saved to {checkpoint_dir}")
437
+
438
+ def load_checkpoint_and_continue(checkpoint_path):
439
+ """从检查点继续训练"""
440
+ print(f"📥 Loading checkpoint from {checkpoint_path}")
441
+
442
+ # TODO: implement checkpoint-resume logic
443
+ pass
444
+
445
+ if __name__ == "__main__":
446
+ # Set environment variables
447
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3" # 多GPU设置
448
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
449
+
450
+ # Check GPU resources
451
+ if torch.cuda.is_available():
452
+ print(f"🔥 Using {torch.cuda.device_count()} GPUs")
453
+ for i in range(torch.cuda.device_count()):
454
+ print(f" GPU {i}: {torch.cuda.get_device_name(i)}")
455
+ else:
456
+ raise RuntimeError("❌ CUDA not available! RLHF requires GPU.")
457
+
458
+ # Start training
459
+ run_ppo_training()
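Note: the PPO loop above calls reward_model.get_reward(batch_prompts, responses), but the text reward model itself is not part of this diff. Below is a minimal sketch of the interface the loop assumes; the class name, the "./reward_model" checkpoint path, and the sequence-classification scoring are assumptions for illustration, not part of the commit.

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

class RewardModel:
    """Scores prompt/response pairs with a sequence-classification head (assumed interface)."""
    def __init__(self, model_path="./reward_model", device="cuda"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device).eval()
        self.device = device

    @torch.no_grad()
    def get_reward(self, prompts, responses):
        # Return one scalar reward per prompt/response pair
        texts = [p + "\n" + r for p, r in zip(prompts, responses)]
        inputs = self.tokenizer(texts, padding=True, truncation=True,
                                max_length=512, return_tensors="pt").to(self.device)
        return self.model(**inputs).logits.squeeze(-1).float().cpu().tolist()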
lauguage_model_fine_tuning/sft_teacher.py ADDED
@@ -0,0 +1,276 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ QLoRA Fine-tuning script for OpenAI OSS 120B model
4
+ Using smangrul/ad-copy-generation dataset for advertisement copy generation
5
+ """
6
+
7
+ import os
8
+ import torch
9
+ from datasets import load_dataset
10
+ from transformers import (
11
+ AutoModelForCausalLM,
12
+ AutoTokenizer,
13
+ BitsAndBytesConfig,
14
+ TrainingArguments,
15
+ pipeline,
16
+ logging,
17
+ )
18
+ from peft import LoraConfig, PeftModel, TaskType, get_peft_model
19
+ from trl import SFTTrainer
20
+ import warnings
21
+
22
+ # Suppress warnings
23
+ warnings.filterwarnings("ignore")
24
+ logging.set_verbosity(logging.CRITICAL)
25
+
26
+ # Configuration
27
+ class Config:
28
+ # Model configuration
29
+ model_name = "microsoft/DialoGPT-medium" # Replace with actual OpenAI OSS 120B model name
30
+ dataset_name = "smangrul/ad-copy-generation"
31
+
32
+ # Training parameters
33
+ output_dir = "./sft_results"
34
+ num_train_epochs = 3
35
+ per_device_train_batch_size = 1
36
+ gradient_accumulation_steps = 4
37
+ optim = "paged_adamw_32bit"
38
+ save_steps = 25
39
+ logging_steps = 25
40
+ learning_rate = 2e-4
41
+ weight_decay = 0.001
42
+ fp16 = False
43
+ bf16 = False
44
+ max_grad_norm = 0.3
45
+ max_steps = -1
46
+ warmup_ratio = 0.03
47
+ group_by_length = True
48
+ lr_scheduler_type = "constant"
49
+ report_to = "tensorboard"
50
+
51
+ # QLoRA parameters
52
+ lora_alpha = 16
53
+ lora_dropout = 0.1
54
+ lora_r = 64
55
+
56
+ # bitsandbytes parameters
57
+ use_4bit = True
58
+ bnb_4bit_compute_dtype = "float16"
59
+ bnb_4bit_quant_type = "nf4"
60
+ use_nested_quant = False
61
+
62
+ # SFT parameters
63
+ max_seq_length = 512
64
+ packing = False
65
+
66
+ def create_bnb_config():
67
+ """Create BitsAndBytesConfig for 4-bit quantization"""
68
+ bnb_config = BitsAndBytesConfig(
69
+ load_in_4bit=Config.use_4bit,
70
+ bnb_4bit_quant_type=Config.bnb_4bit_quant_type,
71
+ bnb_4bit_compute_dtype=getattr(torch, Config.bnb_4bit_compute_dtype),
72
+ bnb_4bit_use_double_quant=Config.use_nested_quant,
73
+ )
74
+ return bnb_config
75
+
76
+ def load_model_and_tokenizer():
77
+ """Load model and tokenizer with quantization"""
78
+ print("Loading model and tokenizer...")
79
+
80
+ # Create BnB config
81
+ bnb_config = create_bnb_config()
82
+
83
+ # Load model
84
+ model = AutoModelForCausalLM.from_pretrained(
85
+ Config.model_name,
86
+ quantization_config=bnb_config,
87
+ device_map="auto",
88
+ trust_remote_code=True,
89
+ use_auth_token=True, # If using gated model
90
+ )
91
+ model.config.use_cache = False
92
+ model.config.pretraining_tp = 1
93
+
94
+ # Load tokenizer
95
+ tokenizer = AutoTokenizer.from_pretrained(
96
+ Config.model_name,
97
+ trust_remote_code=True,
98
+ use_auth_token=True, # If using gated model
99
+ )
100
+ tokenizer.pad_token = tokenizer.eos_token
101
+ tokenizer.padding_side = "right"
102
+
103
+ return model, tokenizer
104
+
105
+ def create_peft_config():
106
+ """Create PEFT (LoRA) configuration"""
107
+ peft_config = LoraConfig(
108
+ task_type=TaskType.CAUSAL_LM,
109
+ inference_mode=False,
110
+ r=Config.lora_r,
111
+ lora_alpha=Config.lora_alpha,
112
+ lora_dropout=Config.lora_dropout,
113
+ target_modules=[
114
+ "q_proj",
115
+ "k_proj",
116
+ "v_proj",
117
+ "o_proj",
118
+ "gate_proj",
119
+ "up_proj",
120
+ "down_proj",
121
+ ]
122
+ )
123
+ return peft_config
124
+
125
+ def load_and_prepare_dataset(tokenizer):
126
+ """Load and prepare the dataset"""
127
+ print("Loading dataset...")
128
+
129
+ # Load dataset
130
+ dataset = load_dataset(Config.dataset_name, split="train")
131
+ print(f"Dataset loaded: {len(dataset)} samples")
132
+
133
+ # Format dataset for chat completion
134
+ def format_prompts(examples):
135
+ texts = []
136
+ for conversation in examples["conversations"]:
137
+ if len(conversation) >= 2:
138
+ user_msg = conversation[0]["value"]
139
+ assistant_msg = conversation[1]["value"]
140
+
141
+ # Format as chat template
142
+ text = f"### Human: {user_msg}\n### Assistant: {assistant_msg}{tokenizer.eos_token}"
143
+ texts.append(text)
144
+ else:
145
+ # Fallback for malformed data
146
+ texts.append(f"### Human: Create an advertisement\n### Assistant: {conversation[0]['value']}{tokenizer.eos_token}")
147
+
148
+ return {"text": texts}
149
+
150
+ # Apply formatting
151
+ dataset = dataset.map(
152
+ format_prompts,
153
+ batched=True,
154
+ remove_columns=dataset.column_names
155
+ )
156
+
157
+ return dataset
158
+
159
+ def create_training_arguments():
160
+ """Create training arguments"""
161
+ training_arguments = TrainingArguments(
162
+ output_dir=Config.output_dir,
163
+ num_train_epochs=Config.num_train_epochs,
164
+ per_device_train_batch_size=Config.per_device_train_batch_size,
165
+ gradient_accumulation_steps=Config.gradient_accumulation_steps,
166
+ optim=Config.optim,
167
+ save_steps=Config.save_steps,
168
+ logging_steps=Config.logging_steps,
169
+ learning_rate=Config.learning_rate,
170
+ weight_decay=Config.weight_decay,
171
+ fp16=Config.fp16,
172
+ bf16=Config.bf16,
173
+ max_grad_norm=Config.max_grad_norm,
174
+ max_steps=Config.max_steps,
175
+ warmup_ratio=Config.warmup_ratio,
176
+ group_by_length=Config.group_by_length,
177
+ lr_scheduler_type=Config.lr_scheduler_type,
178
+ report_to=Config.report_to,
179
+ save_strategy="steps",
180
+ evaluation_strategy="no",
181
+ load_best_model_at_end=False,
182
+ push_to_hub=False,
183
+ remove_unused_columns=False,
184
+ )
185
+ return training_arguments
186
+
187
+ def main():
188
+ """Main fine-tuning function"""
189
+ print("🚀 Starting QLoRA fine-tuning of OpenAI OSS 120B model")
190
+
191
+ # Check CUDA availability
192
+ if not torch.cuda.is_available():
193
+ raise RuntimeError("CUDA is required for this training script")
194
+
195
+ print(f"Using GPU: {torch.cuda.get_device_name()}")
196
+ print(f"Available VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
197
+
198
+ # Load model and tokenizer
199
+ model, tokenizer = load_model_and_tokenizer()
200
+
201
+ # Apply PEFT
202
+ peft_config = create_peft_config()
203
+ model = get_peft_model(model, peft_config)
204
+ model.print_trainable_parameters()
205
+
206
+ # Load and prepare dataset
207
+ dataset = load_and_prepare_dataset(tokenizer)
208
+
209
+ # Create training arguments
210
+ training_arguments = create_training_arguments()
211
+
212
+ # Create trainer
213
+ trainer = SFTTrainer(
214
+ model=model,
215
+ train_dataset=dataset,
216
+ peft_config=peft_config,
217
+ dataset_text_field="text",
218
+ max_seq_length=Config.max_seq_length,
219
+ tokenizer=tokenizer,
220
+ args=training_arguments,
221
+ packing=Config.packing,
222
+ )
223
+
224
+ # Start training
225
+ print("🔥 Starting training...")
226
+ trainer.train()
227
+
228
+ # Save model
229
+ print("💾 Saving model...")
230
+ trainer.model.save_pretrained(Config.output_dir)
231
+ tokenizer.save_pretrained(Config.output_dir)
232
+
233
+ print("✅ Training completed!")
234
+
235
+ # Test the model
236
+ test_model(trainer.model, tokenizer)
237
+
238
+ def test_model(model, tokenizer):
239
+ """Test the fine-tuned model"""
240
+ print("\n🧪 Testing the fine-tuned model...")
241
+
242
+ # Test prompts
243
+ test_prompts = [
244
+ "Create an advertisement for a new smartphone with advanced camera features",
245
+ "Write ad copy for an eco-friendly clothing brand targeting young professionals",
246
+ "Generate marketing content for a fitness app with AI personal trainer",
247
+ ]
248
+
249
+ for prompt in test_prompts:
250
+ formatted_prompt = f"### Human: {prompt}\n### Assistant:"
251
+
252
+ inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
253
+
254
+ with torch.no_grad():
255
+ outputs = model.generate(
256
+ **inputs,
257
+ max_new_tokens=150,
258
+ do_sample=True,
259
+ temperature=0.7,
260
+ top_p=0.9,
261
+ pad_token_id=tokenizer.eos_token_id,
262
+ )
263
+
264
+ response = tokenizer.decode(outputs[0], skip_special_tokens=True)
265
+ generated_text = response[len(formatted_prompt):].strip()
266
+
267
+ print(f"\n📝 Prompt: {prompt}")
268
+ print(f"📄 Generated: {generated_text}")
269
+ print("-" * 50)
270
+
271
+ if __name__ == "__main__":
272
+ # Set environment variables
273
+ os.environ["CUDA_VISIBLE_DEVICES"] = "0"
274
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
275
+
276
+ main()
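Usage note: after training, the LoRA adapter and tokenizer are written to Config.output_dir ("./sft_results"). A minimal sketch of reloading that adapter for inference, assuming the same placeholder base model as in the script above (swap in the actual OSS 120B checkpoint when available):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Placeholder base model, mirroring Config.model_name in sft_teacher.py
base = AutoModelForCausalLM.from_pretrained(
    "microsoft/DialoGPT-medium",
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("./sft_results")
model = PeftModel.from_pretrained(base, "./sft_results")  # attach the LoRA adapter

prompt = "### Human: Create an advertisement for a smart water bottle\n### Assistant:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=100, do_sample=True, temperature=0.7,
                     pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(out[0], skip_special_tokens=True))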
requirements.txt CHANGED
@@ -1,17 +1,55 @@
1
- accelerate
2
  diffusers
3
  invisible_watermark
4
- torch
5
- transformers
6
- xformers
7
- torchvision
 
8
  flickrapi
9
  requests
10
- peft>=0.9.0
11
- bitsandbytes
12
- faiss-cpu
13
- sentence-transformers
14
- trl[peft]
15
- label-studio
16
- datasets
17
- alive_progress
1
+ # Core deep learning frameworks
2
+ torch>=2.0.0
3
+ torchvision
4
+ xformers
5
+
6
+ # Transformers ecosystem
7
+ transformers>=4.35.0
8
+ accelerate>=0.24.0
9
+ tokenizers
10
+ huggingface_hub
11
+
12
+ # Data processing
13
+ datasets>=2.14.0
14
+ numpy>=1.24.0
15
+ sentence-transformers
16
+ faiss-cpu
17
+
18
+ # Model fine-tuning and RLHF
19
+ peft>=0.9.0
20
+ trl[peft]>=0.7.10
21
+ bitsandbytes>=0.41.0
22
+
23
+ # Image generation
24
  diffusers
25
  invisible_watermark
26
+
27
+ # Data labeling
28
+ label-studio
29
+
30
+ # APIs and HTTP requests
31
  flickrapi
32
  requests
33
+
34
+ # Experiment tracking and visualization
35
+ wandb>=0.15.0
36
+ tensorboard>=2.13.0
37
+
38
+ # Evaluation metrics
39
+ evaluate
40
+ sacrebleu
41
+ rouge-score
42
+
43
+ # System utilities and monitoring
44
+ scipy
45
+ protobuf
46
+ sentencepiece
47
+ alive_progress
48
+ psutil
49
+ gpustat
50
+
51
+ # Advanced optimizers (optional)
52
+ deepspeed>=0.10.0
53
+
54
+ # RLHF-specific tooling
55
+ reward-bench