---
base_model: google/gemma-2-9b
tags:
- text-generation-inference
- transformers
- torch
- mlx_lm
license: apache-2.0
language:
- en
---

# Inference Code

This is the code for obtaining ELYZA-tasks-100-TV outputs using the model uploaded to Hugging Face.

The jsonl file generated by this code is in a format that can be submitted as the assignment deliverable.
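
Each line of that file is one JSON object with the keys `task_id`, `input`, and `output`, matching the `result` dictionary built in `run_inference` below. A minimal illustration of one record (values are placeholders):

```
# Shape of one output record (illustration only; values are placeholders)
example_record = {
    "task_id": 0,      # task id copied from the input file
    "input": "...",    # the original task text
    "output": "..."    # the model's generated answer
}
```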

```
!pip install -U bitsandbytes
!pip install -U transformers
!pip install -U accelerate
!pip install -U datasets
!pip install -U peft
```

```
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import json
from pathlib import Path
from typing import Dict, Any, Optional
from tqdm import tqdm
import time
from datetime import datetime


class GPUPredictions:
    def __init__(self,
                 model_id="testmoto/gemma-2-lora-dpo-moe-1",
                 adapter_path=None,
                 max_tokens=1024,
                 temp=0.0,
                 top_p=0.9,
                 seed=3407):
        self.model_id = model_id
        self.adapter_path = adapter_path
        self.max_tokens = max_tokens
        self.temp = temp
        self.top_p = top_p
        self.seed = seed

        print(f"Loading model: {model_id}")
        torch.cuda.empty_cache()

        # GPU configuration
        n_gpus = torch.cuda.device_count()
        max_memory = {i: "20GiB" for i in range(n_gpus)}
        max_memory["cpu"] = "100GiB"

        # Initialize the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            trust_remote_code=True
        )
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        try:
            self.model = AutoModelForCausalLM.from_pretrained(
                model_id,
                torch_dtype=torch.float16,
                device_map="auto",
                max_memory=max_memory,
                low_cpu_mem_usage=True,
                trust_remote_code=True
            )
        except Exception as e:
            print(f"First loading attempt failed: {str(e)}")
            print("Trying alternative loading method...")
            self.model = AutoModelForCausalLM.from_pretrained(
                model_id,
                torch_dtype=torch.float16,
                device_map="balanced",
                low_cpu_mem_usage=True,
                trust_remote_code=True
            )

        if adapter_path:
            print(f"Loading adapter from {adapter_path}")
            self.model.load_adapter(adapter_path)

        # Generation settings
        self.gen_config = {
            "max_new_tokens": max_tokens,
            "temperature": temp,
            "top_p": top_p,
            "do_sample": temp > 0,
            "pad_token_id": self.tokenizer.pad_token_id,
            "eos_token_id": self.tokenizer.eos_token_id
        }

        print("Model loaded successfully")
        self.device = next(self.model.parameters()).device
        print(f"Model is on device: {self.device}")

    @torch.inference_mode()
    def generate_response(self, prompt: str) -> str:
        """Generate a response for a single prompt."""
        try:
            inputs = self.tokenizer(prompt, return_tensors="pt", padding=True)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            with torch.cuda.amp.autocast():
                outputs = self.model.generate(
                    **inputs,
                    **self.gen_config
                )

            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Strip the prompt from the decoded text so only the answer remains
            if prompt in response:
                response = response[len(prompt):].strip()

            return response

        except Exception as e:
            print(f"Error during generation: {str(e)}")
            raise

    def load_tasks(self, file_path: str) -> list:
        """Load the ELYZA tasks from a jsonl file."""
        datasets = []
        with open(file_path, "r") as f:
            for line in f:
                if line.strip():
                    datasets.append(json.loads(line.strip()))
        return datasets

    def run_inference(self, input_file="elyza-tasks-100-TV_0.jsonl", output_file="gpu_results.jsonl"):
        """Run inference over all tasks sequentially and write each result to a jsonl file."""
        tasks = self.load_tasks(input_file)
        results = []

        start_time = time.time()
        execution_date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(f"Execution started at: {execution_date}")
        print(f"Total tasks: {len(tasks)}")
        print("-" * 50)

        for task in tqdm(tasks, desc="Processing tasks"):
            task_start_time = time.time()

            prompt = f"""### Instruction:
{task['input']}
<eos>
### Response: """

            try:
                response = self.generate_response(prompt)
                try:
                    answer = response.split('### Response: ')[-1]
                except Exception:
                    answer = response

                task_end_time = time.time()
                task_duration = task_end_time - task_start_time

                result = {
                    "task_id": task["task_id"],
                    "input": task["input"],
                    "output": answer
                }
                results.append(result)

                print(f"\nTask {task['task_id']} completed in {task_duration:.2f} seconds")
                print(f"Input: {task['input'][:100]}...")
                print(f"Output: {answer[:100]}...")
                print("-" * 50)

                # Append the result immediately so progress is not lost on interruption
                with open(output_file, 'a', encoding='utf-8') as f:
                    json.dump(result, f, ensure_ascii=False)
                    f.write('\n')

                # Periodically release cached GPU memory
                if task["task_id"] % 5 == 0:
                    torch.cuda.empty_cache()

            except Exception as e:
                print(f"Error processing task {task['task_id']}: {str(e)}")
                continue

        total_time = time.time() - start_time
        avg_time = total_time / len(tasks)

        summary = {
            "execution_date": execution_date,
            "total_tasks": len(tasks),
            "total_time": round(total_time, 2),
            "average_time_per_task": round(avg_time, 2),
            "model_id": self.model_id,
            "adapter_used": self.adapter_path is not None
        }

        print("\nExecution Summary:")
        print(f"Total execution time: {total_time:.2f} seconds")
        print(f"Average time per task: {avg_time:.2f} seconds")
        print(f"Results saved to: {output_file}")

        summary_file = output_file.replace('.jsonl', '_summary.json')
        with open(summary_file, 'w', encoding='utf-8') as f:
            json.dump(summary, f, ensure_ascii=False, indent=2)

        return results
```
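
The class can then be used as follows to run the full task set and write the submission file: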

```
# Assumes the GPUPredictions class above has been saved as GPUPredictions.py;
# if it was defined in the same notebook, this import can be skipped.
from GPUPredictions import GPUPredictions

predictor = GPUPredictions(
    model_id="testmoto/gemma-2-lora-dpo-moe-1"
)
results = predictor.run_inference(
    input_file="elyza-tasks-100-TV_0.jsonl",
    output_file="llm_2024_elyza_tv_0.jsonl"
)
```
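
Since the constructor also accepts `adapter_path` (loaded via `load_adapter`, which is why `peft` is installed above), a LoRA adapter can optionally be applied on top of the base weights. A sketch, where the adapter location is a placeholder:

```
# Sketch only: "output/lora-adapter" is a placeholder for a local directory or
# Hugging Face repo containing a PEFT (LoRA) adapter; requires `peft`.
predictor_with_adapter = GPUPredictions(
    model_id="testmoto/gemma-2-lora-dpo-moe-1",
    adapter_path="output/lora-adapter",
)
```

Before submitting, the generated file can also be re-read to confirm that every line parses and carries the expected fields:

```
import json

required_keys = {"task_id", "input", "output"}

with open("llm_2024_elyza_tv_0.jsonl", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]

# Every record must be valid JSON and contain the three expected fields.
assert all(required_keys <= record.keys() for record in records)
print(f"{len(records)} records look valid")
```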