Delete src

- src/run-exo.py: +0 -90
- src/run-hf.py: +0 -4

src/run-exo.py (DELETED)
@@ -1,90 +0,0 @@
# In this example, a user is running a home cluster with 3 shards.
# They are prompting the cluster to generate a response to a question.
# The cluster is given the question, and the user is given the response.

from exo.inference.mlx.sharded_utils import get_model_path, load_tokenizer
from exo.inference.shard import Shard
from exo.networking.peer_handle import PeerHandle
from exo.networking.grpc.grpc_peer_handle import GRPCPeerHandle
from exo.topology.device_capabilities import DeviceCapabilities, DeviceFlops
from typing import List
import asyncio
import argparse
import uuid

models = {
    "mlx-community/Meta-Llama-3-8B-Instruct-4bit": Shard(model_id="mlx-community/Meta-Llama-3-8B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=32),
    "mlx-community/Meta-Llama-3-70B-Instruct-4bit": Shard(model_id="mlx-community/Meta-Llama-3-70B-Instruct-4bit", start_layer=0, end_layer=0, n_layers=80),
    ""
}

path_or_hf_repo = "mlx-community/Meta-Llama-3-8B-Instruct-4bit"
model_path = get_model_path(path_or_hf_repo)
tokenizer_config = {}
tokenizer = load_tokenizer(model_path, tokenizer_config)

# we intentionally leave out peer1 to demonstrate equality of nodes in exo.
# there is no "master" node in exo, all nodes are equal and can take on any role.
# peer1 = GRPCPeerHandle(
#     "node1",
#     "localhost:8080",
#     DeviceCapabilities(model="placeholder", chip="placeholder", memory=0)
# )
peer2 = GRPCPeerHandle(
    "node2",
    "localhost:8081",
    DeviceCapabilities(model="placeholder", chip="placeholder", memory=0, flops=DeviceFlops(fp32=0, fp16=0, int8=0))
)
shard = models[path_or_hf_repo]
request_id = str(uuid.uuid4())

async def run_prompt(prompt: str):
    if tokenizer.chat_template is None:
        tokenizer.chat_template = tokenizer.default_chat_template
    if (
        hasattr(tokenizer, "apply_chat_template")
        and tokenizer.chat_template is not None
    ):
        messages = [{"role": "user", "content": prompt}]
        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

    await peer2.connect()
    await peer2.global_reset(shard, set(), 2)

    try:
        await peer2.send_prompt(shard, prompt, request_id)
    except Exception as e:
        print(e)

    import time
    # poll 10 times per second for result (even though generation is faster, any more than this it's not nice for the user)
    previous_length = 0
    n_tokens = 0
    start_time = time.perf_counter()
    while True:
        try:
            result, is_finished = await peer2.get_inference_result(request_id)
        except Exception as e:
            continue
        await asyncio.sleep(0.1)

        # Print the updated string in place
        updated_string = tokenizer.decode(result)
        n_tokens = len(result)
        print(updated_string[previous_length:], end='', flush=True)
        previous_length = len(updated_string)

        if is_finished:
            print("\nDone")
            break
    end_time = time.perf_counter()
    print(f"\nDone. Processed {n_tokens} tokens in {end_time - start_time:.2f} seconds ({n_tokens / (end_time - start_time):.2f} tokens/second)")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Run prompt")
    parser.add_argument("--prompt", type=str, help="The prompt to run")
    args = parser.parse_args()

    asyncio.run(run_prompt(args.prompt))
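For context, the deleted script was driven from the command line through its single --prompt argument and assumed a second exo node was already listening on localhost:8081. Its core pattern is the result loop: after send_prompt, it polls get_inference_result(request_id) roughly ten times per second, prints only the newly decoded suffix, and stops once is_finished comes back true. Below is a minimal sketch of that loop factored into a reusable coroutine; it reuses peer2 and tokenizer from the file above, and the timeout guard is an illustrative addition that was not in the original.

import time

async def stream_result(request_id: str, timeout: float = 120.0) -> str:
    # Poll the peer roughly every 100 ms until it reports the request as finished.
    previous = ""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            tokens, is_finished = await peer2.get_inference_result(request_id)
        except Exception:
            await asyncio.sleep(0.1)  # peer may not have a result yet; back off and retry
            continue
        text = tokenizer.decode(tokens)
        print(text[len(previous):], end="", flush=True)  # emit only the newly decoded suffix
        previous = text
        if is_finished:
            return text
        await asyncio.sleep(0.1)
    raise TimeoutError(f"no finished result for request {request_id} within {timeout}s")

The 0.1 s interval follows the original comment's reasoning: generation outpaces the poll rate, and more frequent terminal updates would not help a human reader.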
src/run-hf.py (DELETED)
@@ -1,4 +0,0 @@
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-405B")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-405B")
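The deleted run-hf.py stopped after loading the tokenizer and model weights. For reference, a typical next step with the transformers API is to tokenize a prompt and call generate. The sketch below is an assumption about intended usage rather than code that was ever in this file, and actually loading Meta-Llama-3.1-405B this way requires far more memory than a single machine normally has (device_map="auto" also needs the accelerate package).

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "meta-llama/Meta-Llama-3.1-405B"  # same checkpoint the deleted script referenced
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # half-precision weights to reduce memory
    device_map="auto",           # shard weights across whatever devices are available
)

prompt = "Distributed inference lets a home cluster"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The impracticality of this single-process path at 405B scale is presumably why it sits alongside the sharded exo example above.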