---
base_model: google/gemma-2-9b
tags:
- text-generation-inference
- transformers
- torch
- mlx_lm
license: apache-2.0
language:
- en
---

# Inference Code

This is the code for obtaining ELYZA-tasks-100-TV outputs using the model uploaded to Hugging Face.

The jsonl file generated by this code is in a format that can be submitted as the assignment deliverable.
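
Each line of that file is one JSON object with the keys `task_id`, `input`, and `output`, matching the `result` dictionary built in `run_inference` below. A minimal illustration of one record (values are placeholders):

```
# Shape of one output record (illustration only; values are placeholders)
example_record = {
    "task_id": 0,      # task id copied from the input file
    "input": "...",    # the original task text
    "output": "..."    # the model's generated answer
}
```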

```
!pip install -U bitsandbytes
!pip install -U transformers
!pip install -U accelerate
!pip install -U datasets
!pip install -U peft
```

```
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import json
from pathlib import Path
from typing import Dict, Any, Optional
from tqdm import tqdm
import time
from datetime import datetime


class GPUPredictions:
    def __init__(self,
                 model_id="testmoto/gemma-2-lora-dpo-moe-1",
                 adapter_path=None,
                 max_tokens=1024,
                 temp=0.0,
                 top_p=0.9,
                 seed=3407):
        self.model_id = model_id
        self.adapter_path = adapter_path
        self.max_tokens = max_tokens
        self.temp = temp
        self.top_p = top_p
        self.seed = seed

        print(f"Loading model: {model_id}")
        torch.cuda.empty_cache()

        # GPU configuration
        n_gpus = torch.cuda.device_count()
        max_memory = {i: "20GiB" for i in range(n_gpus)}
        max_memory["cpu"] = "100GiB"

        # Initialize the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            trust_remote_code=True
        )
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        try:
            self.model = AutoModelForCausalLM.from_pretrained(
                model_id,
                torch_dtype=torch.float16,
                device_map="auto",
                max_memory=max_memory,
                low_cpu_mem_usage=True,
                trust_remote_code=True
            )
        except Exception as e:
            print(f"First loading attempt failed: {str(e)}")
            print("Trying alternative loading method...")
            self.model = AutoModelForCausalLM.from_pretrained(
                model_id,
                torch_dtype=torch.float16,
                device_map="balanced",
                low_cpu_mem_usage=True,
                trust_remote_code=True
            )

        if adapter_path:
            print(f"Loading adapter from {adapter_path}")
            self.model.load_adapter(adapter_path)

        # Generation settings
        self.gen_config = {
            "max_new_tokens": max_tokens,
            "temperature": temp,
            "top_p": top_p,
            "do_sample": temp > 0,
            "pad_token_id": self.tokenizer.pad_token_id,
            "eos_token_id": self.tokenizer.eos_token_id
        }

        print("Model loaded successfully")
        self.device = next(self.model.parameters()).device
        print(f"Model is on device: {self.device}")

    @torch.inference_mode()
    def generate_response(self, prompt: str) -> str:
        """Generate a response for a single prompt."""
        try:
            inputs = self.tokenizer(prompt, return_tensors="pt", padding=True)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.cuda.amp.autocast():
                outputs = self.model.generate(
                    **inputs,
                    **self.gen_config
                )

            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Strip the prompt from the decoded text so only the answer remains
            if prompt in response:
                response = response[len(prompt):].strip()

            return response

        except Exception as e:
            print(f"Error during generation: {str(e)}")
            raise

    def load_tasks(self, file_path: str) -> list:
        """Load the ELYZA tasks from a jsonl file."""
        datasets = []
        with open(file_path, "r") as f:
            for line in f:
                if line.strip():
                    datasets.append(json.loads(line.strip()))
        return datasets

    def run_inference(self, input_file="elyza-tasks-100-TV_0.jsonl", output_file="gpu_results.jsonl"):
        """Run inference over all tasks sequentially and write each result to a jsonl file."""
        tasks = self.load_tasks(input_file)
        results = []

        start_time = time.time()
        execution_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(f"Execution started at: {execution_date}")
        print(f"Total tasks: {len(tasks)}")
        print("-" * 50)

        for task in tqdm(tasks, desc="Processing tasks"):
            task_start_time = time.time()

            prompt = f"""### Instruction:
{task['input']}
<eos>
### Response: """

            try:
                response = self.generate_response(prompt)
                try:
                    answer = response.split('### Response: ')[-1]
                except Exception:
                    answer = response

                task_end_time = time.time()
                task_duration = task_end_time - task_start_time

                result = {
                    "task_id": task["task_id"],
                    "input": task["input"],
                    "output": answer
                }
                results.append(result)

                print(f"\nTask {task['task_id']} completed in {task_duration:.2f} seconds")
                print(f"Input: {task['input'][:100]}...")
                print(f"Output: {answer[:100]}...")
                print("-" * 50)

                # Append the result immediately so progress is not lost on interruption
                with open(output_file, 'a', encoding='utf-8') as f:
                    json.dump(result, f, ensure_ascii=False)
                    f.write('\n')

                # Periodically release cached GPU memory
                if task["task_id"] % 5 == 0:
                    torch.cuda.empty_cache()

            except Exception as e:
                print(f"Error processing task {task['task_id']}: {str(e)}")
                continue

        total_time = time.time() - start_time
        avg_time = total_time / len(tasks)

        summary = {
            "execution_date": execution_date,
            "total_tasks": len(tasks),
            "total_time": round(total_time, 2),
            "average_time_per_task": round(avg_time, 2),
            "model_id": self.model_id,
            "adapter_used": self.adapter_path is not None
        }

        print("\nExecution Summary:")
        print(f"Total execution time: {total_time:.2f} seconds")
        print(f"Average time per task: {avg_time:.2f} seconds")
        print(f"Results saved to: {output_file}")

        summary_file = output_file.replace('.jsonl', '_summary.json')
        with open(summary_file, 'w', encoding='utf-8') as f:
            json.dump(summary, f, ensure_ascii=False, indent=2)

        return results
```
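
The class can then be used as follows to run the full task set and write the submission file: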

```
# Assumes the GPUPredictions class above has been saved as GPUPredictions.py;
# if it was defined in the same notebook, this import can be skipped.
from GPUPredictions import GPUPredictions

predictor = GPUPredictions(
    model_id="testmoto/gemma-2-lora-dpo-moe-1"
)
results = predictor.run_inference(
    input_file="elyza-tasks-100-TV_0.jsonl",
    output_file="llm_2024_elyza_tv_0.jsonl"
)
```
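
Since the constructor also accepts `adapter_path` (loaded via `load_adapter`, which is why `peft` is installed above), a LoRA adapter can optionally be applied on top of the base weights. A sketch, where the adapter location is a placeholder:

```
# Sketch only: "output/lora-adapter" is a placeholder for a local directory or
# Hugging Face repo containing a PEFT (LoRA) adapter; requires `peft`.
predictor_with_adapter = GPUPredictions(
    model_id="testmoto/gemma-2-lora-dpo-moe-1",
    adapter_path="output/lora-adapter",
)
```

Before submitting, the generated file can also be re-read to confirm that every line parses and carries the expected fields:

```
import json

required_keys = {"task_id", "input", "output"}

with open("llm_2024_elyza_tv_0.jsonl", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]

# Every record must be valid JSON and contain the three expected fields.
assert all(required_keys <= record.keys() for record in records)
print(f"{len(records)} records look valid")
```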