---
base_model: llm-jp/llm-jp-3-13b
tags:
  - text-generation-inference
  - transformers
  - unsloth
  - llama
  - trl
license: apache-2.0
language:
  - en
  - ja
pipeline_tag: question-answering
---

# Uploaded model

- **Developed by:** kmd2525
- **License:** apache-2.0
- **Finetuned from model:** llm-jp/llm-jp-3-13b

This Llama-architecture model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Hugging Face's TRL library.
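
The full training configuration is not included in this card. As a rough illustration only, a QLoRA-style fine-tune of llm-jp/llm-jp-3-13b with Unsloth and TRL's `SFTTrainer` could look like the sketch below. The LoRA rank `r=64` is merely inferred from the `-r64-` in the model name; every other hyperparameter (alpha, batch size, learning rate, epochs) is an assumption, not the value actually used.

```python
# Minimal sketch, NOT the exact recipe used for this checkpoint.
# r=64 is inferred from the model name; all other hyperparameters are assumptions.
from datasets import Dataset
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import FastLanguageModel

base, tokenizer = FastLanguageModel.from_pretrained(
    model_name="llm-jp/llm-jp-3-13b",
    max_seq_length=1024,
    dtype=None,          # auto-select (bf16 on recent GPUs)
    load_in_4bit=True,   # 4-bit base weights (QLoRA-style)
)

# Attach LoRA adapters to the attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    base,
    r=64,
    lora_alpha=32,                  # assumption
    lora_dropout=0.0,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

# Placeholder dataset: one already-formatted prompt per row in a "text" column
# (see the formatting sketch in the Datasets section below).
train_dataset = Dataset.from_list(
    [{"text": "### 指示\n...\n\n### 回答\n..."}]
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=1024,
    args=TrainingArguments(
        per_device_train_batch_size=2,   # assumption
        gradient_accumulation_steps=4,   # assumption
        num_train_epochs=1,              # assumption
        learning_rate=2e-4,              # assumption
        optim="adamw_8bit",
        logging_steps=10,
        output_dir="outputs",
        report_to="none",
    ),
)
trainer.train()
```

After training, the LoRA adapters can be pushed to the Hub with `model.push_to_hub` and `tokenizer.push_to_hub`, which is how a repository like this one would typically be produced.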

# Usage

Run the following code in Google Colab on an L4 GPU.

- System configuration:
  - System RAM: 53.0 GB
  - GPU RAM: 22.5 GB
  - Disk capacity: 235.7 GB
- The full run takes about 35 minutes.

```python
# Install libraries
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Import libraries
import os
import json
import re
from datetime import datetime, timedelta, timezone
import torch
from tqdm.notebook import tqdm
from datasets import load_dataset
from unsloth import FastLanguageModel
from google.colab import userdata

# Hugging Face authentication
# Note: create an access token on Hugging Face and register it as a Colab secret named "HF_TOKEN".
# On the first run, a permission dialog appears; grant the notebook access via the GUI.
HF_TOKEN = userdata.get("HF_TOKEN")


# Load the model
MODEL_NAME = "kmd2525/llm-jp-3-13b-it-r64-ichikara-fix"  # trained LoRA model (Hugging Face repo ID)
MAX_SEQ_LENGTH = 1024
DTYPE = None  # None lets Unsloth pick the dtype automatically
LOAD_IN_4BIT = True  # True because we are loading a 13B model

os.environ["llm_int8_enable_fp32_cpu_offload"] = "True"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=DTYPE,
    load_in_4bit=LOAD_IN_4BIT,
    token=HF_TOKEN,
    device_map="auto",
)
FastLanguageModel.for_inference(model)


# Load the evaluation data.
# Note: upload the dataset file to /content in Colab beforehand.
DATASET_PATH = "./elyza-tasks-100-TV_0.jsonl"

datasets = []
with open(DATASET_PATH, "r") as f:
    item = ""
    for line in f:
      line = line.strip()
      item += line
      if item.endswith("}"):
        datasets.append(json.loads(item))
        item = ""

# Inference
MAX_NEW_TOKENS = 1024

category_prompts = {
    "creative": "創作的な回答になるよう表現を工夫してください。",
    "summarize": "与えられた情報を正確かつ簡潔にまとめることが求められます。必要な情報を的確に抽出してください。",
    "knowledge": "事実ベースの情報や知識提供が求められます。一般的に認められた情報に基づき、回答してください。",
    "advice": "実用的なアドバイスや問題解決の提案が求められます。実用性のある回答になるよう工夫してください。",
    "analysis": "論理的思考や推論・分析力が問われます。指示の情報を注意深く読んで、結論を導いてください。",
    "format": "指定形式や特定のスタイルに従う回答が求められます。指示にある回答形式を守って回答してください。",
    "evaluation": "評価・採点・判定が求められます。与えられた基準に沿って、公平かつ一貫した評価を行ってください。"
}

def get_category_prompt(task_id: int) -> str:
    # Map the task ID to a category
    creative_ids = [1, 32, 68, 84, 91, 92, 93, 98]
    summarize_ids = [24, 73]
    knowledge_ids = [2, 7, 9, 14, 19, 20, 21, 22, 27, 28, 30, 39, 50, 52, 53, 69]
    advice_ids = [0, 4, 13, 15, 31, 35, 36, 85, 86, 87, 96]
    analysis_ids = [3, 6, 8, 17, 25, 26, 33, 34, 37, 38, 39, 40, 41, 42, 43, 45, 46, 47,
                    48, 49, 51, 54, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 70, 71,
                    74, 76, 77, 79, 80, 81, 82, 89, 94, 95, 97, 99]
    format_ids = [10, 11, 12, 88, 90]
    evaluation_ids = [5]

    if task_id in creative_ids:
        return category_prompts["creative"]
    elif task_id in summarize_ids:
        return category_prompts["summarize"]
    elif task_id in knowledge_ids:
        return category_prompts["knowledge"]
    elif task_id in advice_ids:
        return category_prompts["advice"]
    elif task_id in analysis_ids:
        return category_prompts["analysis"]
    elif task_id in format_ids:
        return category_prompts["format"]
    elif task_id in evaluation_ids:
        return category_prompts["evaluation"]
    else:
        return "なし"

def generate_response(task_id: int, input_text: str) -> str:
    category_prompt = get_category_prompt(task_id)
    prompt = f"""
    以下の指示の内容を読み、回答せよ。方針も適宜、参考にせよ。
    ### 指示
    {input_text}

    ### 方針
    {category_prompt}

    ### 回答
    """
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
    outputs = model.generate(
      **inputs,
      max_new_tokens=MAX_NEW_TOKENS,
      use_cache=True,
      do_sample=False,
      repetition_penalty=1.2,
    )
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True).split("### 回答")[-1]
    return prediction

results = []
for i in tqdm(range(len(datasets))):
    task_id = datasets[i]["task_id"]
    input_text = datasets[i]["input"]
    category_prompt = get_category_prompt(task_id)
    prediction = generate_response(task_id, input_text)
    results.append({"task_id": task_id, "input": input_text, "attention": category_prompt, "output": prediction})

# Post-process the model outputs
def remove_invalid_chars(text: str) -> str:
  # Remove characters like \u3000 and other control characters
  text = re.sub(r'[\u3000\u0000-\u001F\u007F-\u009F]', '', text)
  text = re.sub(r"[*#]", "", text)  # remove * and # characters
  text = text.strip()  # strip leading/trailing whitespace
  return text

cleaned_results = []
for result in results:
  cleaned_result = {
      "task_id": result["task_id"],
      "input": remove_invalid_chars(result["input"]),
      "attention": result["attention"],
      "output": remove_invalid_chars(result["output"])
  }
  cleaned_results.append(cleaned_result)

# Save the results as JSONL directly under /content.
model_name = re.sub(".*/", "", MODEL_NAME)
current_datetime = datetime.now(timezone(timedelta(hours=+9))).strftime("%Y%m%d_%H%M")
SUBMIT_PATH = f"/content/{model_name}_{current_datetime}_outputs.jsonl"

with open(SUBMIT_PATH, "w", encoding="utf-8") as f:
    for cleaned_result in cleaned_results:
        json.dump(cleaned_result, f, ensure_ascii=False)  # ensure_ascii=False for handling non-ASCII characters
        f.write("\n")
```

# Datasets

## Instruction tuning

The model was fine-tuned on the following datasets.

| Language | Dataset | Description |
|---|---|---|
| Japanese | Ichikara Instruction | A manually constructed instruction dataset based on Ichikara Instruction |
| Japanese | Elyza-tasks-100 | |
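
The exact prompt template used during instruction tuning is not published in this card. As a hypothetical sketch, instruction/response pairs (the field names `text` and `output` below are assumptions about the Ichikara-style JSON, not confirmed) could be rendered into the same `### 指示` / `### 回答` format that the inference code above uses:

```python
# Hypothetical formatting helper; the field names "text"/"output" and the EOS handling are assumptions.
def format_example(example: dict, eos_token: str) -> dict:
    prompt = (
        "以下の指示の内容を読み、回答せよ。\n"
        "### 指示\n"
        f"{example['text']}\n\n"
        "### 回答\n"
        f"{example['output']}{eos_token}"
    )
    return {"text": prompt}

# Usage: map over a datasets.Dataset loaded from the instruction JSON to produce
# the single "text" column consumed by SFTTrainer in the training sketch above.
# train_dataset = raw_dataset.map(lambda ex: format_example(ex, tokenizer.eos_token))
```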