# In this example, a user is running a home cluster with 3 shards.
# They prompt the cluster with a question, the cluster generates a response,
# and the response is streamed back to the user.
import asyncio
import argparse
import time
import uuid

from exo.inference.mlx.sharded_utils import get_model_path, load_tokenizer
from exo.inference.shard import Shard
from exo.networking.grpc.grpc_peer_handle import GRPCPeerHandle
from exo.topology.device_capabilities import DeviceCapabilities, DeviceFlops

models = {
    "mlx-community/Meta-Llama-3-8B-Instruct-4bit": Shard(model_id="mlx-community/Meta-Llama-3-8B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=32),
    "mlx-community/Meta-Llama-3-70B-Instruct-4bit": Shard(model_id="mlx-community/Meta-Llama-3-70B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=80),
}

path_or_hf_repo = "mlx-community/Meta-Llama-3-8B-Instruct-4bit"
model_path = get_model_path(path_or_hf_repo)
tokenizer_config = {}
tokenizer = load_tokenizer(model_path, tokenizer_config)

# We intentionally leave out peer1 to demonstrate the equality of nodes in exo.
# There is no "master" node in exo; all nodes are equal and can take on any role.
# peer1 = GRPCPeerHandle(
#     "node1",
#     "localhost:8080",
#     DeviceCapabilities(model="placeholder", chip="placeholder", memory=0)
# )
peer2 = GRPCPeerHandle(
    "node2",
    "localhost:8081",
    DeviceCapabilities(model="placeholder", chip="placeholder", memory=0, flops=DeviceFlops(fp32=0, fp16=0, int8=0))
)
shard = models[path_or_hf_repo]
request_id = str(uuid.uuid4())


async def run_prompt(prompt: str):
    # Wrap the raw prompt in the model's chat template, if the tokenizer has one.
    if tokenizer.chat_template is None:
        tokenizer.chat_template = tokenizer.default_chat_template
    if (
        hasattr(tokenizer, "apply_chat_template")
        and tokenizer.chat_template is not None
    ):
        messages = [{"role": "user", "content": prompt}]
        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

    await peer2.connect()
    await peer2.global_reset(shard, set(), 2)

    try:
        await peer2.send_prompt(shard, prompt, request_id)
    except Exception as e:
        print(e)

    # Poll for the result at 10Hz. Generation is usually faster than this,
    # but polling more often gains the user nothing perceptible.
    previous_length = 0
    n_tokens = 0
    start_time = time.perf_counter()
    while True:
        # Sleep first so both the success and the error path wait between polls,
        # rather than busy-looping when the result isn't available yet.
        await asyncio.sleep(0.1)
        try:
            result, is_finished = await peer2.get_inference_result(request_id)
        except Exception:
            continue

        # Decode all tokens so far and print only the newly generated suffix in place.
        updated_string = tokenizer.decode(result)
        n_tokens = len(result)
        print(updated_string[previous_length:], end='', flush=True)
        previous_length = len(updated_string)

        if is_finished:
            break

    end_time = time.perf_counter()
    print(f"\nDone. Processed {n_tokens} tokens in {end_time - start_time:.2f} seconds ({n_tokens / (end_time - start_time):.2f} tokens/second)")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run prompt")
    parser.add_argument("--prompt", type=str, help="The prompt to run")
    args = parser.parse_args()
    asyncio.run(run_prompt(args.prompt))
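
# Usage sketch (the filename below is hypothetical; the script assumes another
# exo node is already running and listening on localhost:8081):
#
#   python llama3_distributed.py --prompt "What is the capital of France?"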