alphrc
/

Meta-Llama-3.1-405B-4bits

Text Generation

text-generation-inference

Inference Endpoints

Model card Files Files and versions Community

Meta-Llama-3.1-405B-4bits / src /run-exo.py

alphrc's picture

Upload folder using huggingface_hub

2058c5f verified 3 months ago

3.41 kB

	# In this example, a user is running a home cluster with 3 shards.
	# They are prompting the cluster to generate a response to a question.
	# The cluster is given the question, and the user is given the response.

	from exo.inference.mlx.sharded_utils import get_model_path, load_tokenizer
	from exo.inference.shard import Shard
	from exo.networking.peer_handle import PeerHandle
	from exo.networking.grpc.grpc_peer_handle import GRPCPeerHandle
	from exo.topology.device_capabilities import DeviceCapabilities, DeviceFlops
	from typing import List
	import asyncio
	import argparse
	import uuid

	models = {
	"mlx-community/Meta-Llama-3-8B-Instruct-4bit": Shard(model_id="mlx-community/Meta-Llama-3-8B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=32),
	"mlx-community/Meta-Llama-3-70B-Instruct-4bit": Shard(model_id="mlx-community/Meta-Llama-3-70B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=80),
	""
	}

	path_or_hf_repo = "mlx-community/Meta-Llama-3-8B-Instruct-4bit"
	model_path = get_model_path(path_or_hf_repo)
	tokenizer_config = {}
	tokenizer = load_tokenizer(model_path, tokenizer_config)

	# we intentionally leave out peer1 to demonstrate equality of nodes in exo.
	# there is no "master" node in exo, all nodes are equal and can take on any role.
	# peer1 = GRPCPeerHandle(
	# "node1",
	# "localhost:8080",
	# DeviceCapabilities(model="placeholder", chip="placeholder", memory=0)
	# )
	peer2 = GRPCPeerHandle(
	"node2",
	"localhost:8081",
	DeviceCapabilities(model="placeholder", chip="placeholder", memory=0, flops=DeviceFlops(fp32=0, fp16=0, int8=0))
	)
	shard = models[path_or_hf_repo]
	request_id = str(uuid.uuid4())

	async def run_prompt(prompt: str):
	if tokenizer.chat_template is None:
	tokenizer.chat_template = tokenizer.default_chat_template
	if (
	hasattr(tokenizer, "apply_chat_template")
	and tokenizer.chat_template is not None
	):
	messages = [{"role": "user", "content": prompt}]
	prompt = tokenizer.apply_chat_template(
	messages, tokenize=False, add_generation_prompt=True
	)

	await peer2.connect()
	await peer2.global_reset(shard, set(), 2)

	try:
	await peer2.send_prompt(shard, prompt, request_id)
	except Exception as e:
	print(e)

	import time
	# poll 10 times per second for result (even though generation is faster, any more than this it's not nice for the user)
	previous_length = 0
	n_tokens = 0
	start_time = time.perf_counter()
	while True:
	try:
	result, is_finished = await peer2.get_inference_result(request_id)
	except Exception as e:
	continue
	await asyncio.sleep(0.1)

	# Print the updated string in place
	updated_string = tokenizer.decode(result)
	n_tokens = len(result)
	print(updated_string[previous_length:], end='', flush=True)
	previous_length = len(updated_string)

	if is_finished:
	print("\nDone")
	break
	end_time = time.perf_counter()
	print(f"\nDone. Processed {n_tokens} tokens in {end_time - start_time:.2f} seconds ({n_tokens / (end_time - start_time):.2f} tokens/second)")

	if __name__ == "__main__":
	parser = argparse.ArgumentParser(description="Run prompt")
	parser.add_argument("--prompt", type=str, help="The prompt to run")
	args = parser.parse_args()

	asyncio.run(run_prompt(args.prompt))